# Data cleaning and profiling for U.S. equities news data
The dataset profiled here contains financial news related to U.S. equities. The aim of this notebook is to profile, clean, and ingest that data. The news dataset was obtained from Kaggle: [Link to dataset on Kaggle](https://www.kaggle.com/datasets/gennadiyr/us-equities-news-data)

## Acknowledgements
The original data source is https://www.investing.com/ . Investing.com is an online data and news website that provides financial information. Every row of this dataset includes attribution to the data provider and a link to the source.

<img src="https://i-invdn-com.investing.com/logos/knowledge_graph_151x151.png"/>

## Data loading

The first step in data cleansing and profiling is to load the data from the source in an appropriate format. Here the data is loaded from a CSV file stored in HDFS.

In [2]:
// Path of the news CSV export to load.
val newsFilePath = "us_equities_news_dataset.csv"

// Reader configuration:
//  - header:      take column names from the first record
//  - multiLine:   allow a quoted field (article text) to span several lines
//  - inferSchema: let Spark derive column types from the data
//  - escape:      use '"' as the escape character inside quoted fields
val csvReader = spark.read
  .option("escape", "\"")
  .option("inferSchema", "true")
  .option("multiLine", "true")
  .option("header", "true")

val rawDF = csvReader.csv(newsFilePath)

// The raw frame is reused by several later cells, so keep it cached.
rawDF.cache()
z.show(rawDF, 5)

In [3]:
// Report the shape of the raw dataset, then the inferred column types.
Seq(
  s"Total columns : ${rawDF.columns.length}",
  s"Total rows : ${rawDF.count()}"
).foreach(println)

rawDF.printSchema()

In [4]:
import org.apache.spark.sql.functions.{col,when, count}
import org.apache.spark.sql.{Column, SparkSession}

/**
 * Builds one aggregate expression per column that counts its null entries.
 *
 * `when` produces a non-null marker only for rows where the column is null,
 * and `count` ignores nulls, so each aggregate equals the number of null rows.
 *
 * @param columns names of the columns to inspect
 * @return one aliased count expression per input column, in the same order
 */
def countCols(columns: Array[String]): Array[Column] = {
  for (name <- columns) yield {
    val nullMarker = when(col(name).isNull, name)
    count(nullMarker).alias(name)
  }
}


In [5]:
// Show, for every column of the raw frame, how many rows are null.
println("Count of null entries in the columns : ")
z.show(
  rawDF.select(countCols(rawDF.columns): _*)
)

In [6]:
// Keep only rows that have article text, and rename the snake_case
// columns to camelCase for the rest of the notebook.
val nullRemovedDF = rawDF
  .where(col("content").isNotNull)
  .withColumnRenamed("article_id", "articleId")
  .withColumnRenamed("release_date", "releaseDate")

// Re-profile the frame after the null filter.
Seq(
  s"Total columns : ${nullRemovedDF.columns.length}",
  s"Total rows : ${nullRemovedDF.count()}",
  "Count of null entries in the columns : "
).foreach(println)

z.show(
  nullRemovedDF.select(countCols(nullRemovedDF.columns): _*)
)

In [7]:
// Clean the article text:
//  1. strip surrounding whitespace from the content,
//  2. keep rows whose content is at least as long as the title and longer
//     than 10 characters,
//  3. replace tab characters inside the content with a single space.
val cleanDF = nullRemovedDF
  .withColumn("content", trim(col("content")))
  .filter(
    length(col("content")) >= length(col("title")) &&
      length(col("content")) > 10
  )
  .withColumn("content", regexp_replace(col("content"), "\t", " "))

z.show(cleanDF)

## Handling duplicate values

The next task is to remove duplicate news articles from the dataset: rows are de-duplicated on their content, and the remaining distinct articles are counted.

In [9]:
// Two rows are duplicates when their article text is identical.
val dropDisDF = cleanDF.dropDuplicates(Seq("content"))

// Shape of the dataset after de-duplication.
Seq(
  s"Total columns : ${dropDisDF.columns.length}",
  s"Total rows : ${dropDisDF.count()}"
).foreach(println)

In [10]:
// Number of articles per provider, most prolific provider first.
val countByPublishers = dropDisDF
  .groupBy(col("provider"))
  .count()
  .orderBy(col("count").desc)

println(s"The total number of distinct news providers covered by this dataset : ${countByPublishers.count()}")
println("Top 10 news providers in this dataset are : ")

// Plain-text table (untruncated) followed by the notebook rendering.
countByPublishers.show(10, false)
z.show(countByPublishers.limit(10), 10)

In [11]:
// Replace releaseDate with its timestamp-typed equivalent so that the
// date functions used in later cells can operate on it.
val correctDateDF = dropDisDF.withColumn(
  "releaseDate",
  to_timestamp(dropDisDF("releaseDate"))
)
z.show(correctDateDF)

In [12]:
// Print the schema after the releaseDate conversion.
correctDateDF.printSchema()

In [13]:
// Article volume per financial quarter. Each release date is labelled
// "<year>Q<quarter>" (year and quarter of releaseDate joined by "Q"), and
// the articles are counted per label, sorted by label.
val financialQuarterDF = correctDateDF
  .select(
    concat(
      year(col("releaseDate")),
      lit("Q"),
      quarter(col("releaseDate"))
    ).as("financialQuarter")
  )
  .groupBy("financialQuarter")
  .count()
  .sort("financialQuarter")
z.show(financialQuarterDF)


In [14]:
// Largest number of rows sharing the same (title, provider, articleId)
// combination.
val maximumCountForTitleProviderArticle = correctDateDF
  .groupBy("title", "provider", "articleId")
  .count()
  .agg(max("count"))
maximumCountForTitleProviderArticle.show()


In [15]:
// Render the date-corrected frame with the notebook's table display.
z.show(correctDateDF)

In [16]:
import java.util.Properties
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import scala.collection.JavaConverters._
import org.apache.spark.SparkContext
import edu.stanford.nlp.util.CoreMap

In [17]:

 
 /**
  * Shared CoreNLP pipeline used by [[sentiment]].
  *
  * Constructing a `StanfordCoreNLP` instance loads the annotator models, so it
  * is created lazily once and reused instead of being rebuilt on every call.
  * It is marked `@transient` so the pipeline itself is never serialized along
  * with a closure; each JVM is expected to rebuild it on first use
  * (NOTE(review): confirm this holds for the interpreter/cluster setup in use).
  */
 @transient lazy val sentimentPipeline: StanfordCoreNLP = {
   val props = new Properties()
   props.setProperty("annotators", "tokenize, ssplit, parse, sentiment")
   new StanfordCoreNLP(props)
 }

 /**
  * Classifies a piece of text into one of five sentiment labels.
  *
  * The text is split into sentences and the predicted class of the longest
  * sentence (the first one on ties) is taken as the sentiment of the text.
  *
  * @param tweets the text to classify; may be null or blank
  * @return one of "Very Negative", "Negative", "Neutral", "Positive",
  *         "Very Positive". Null, blank, or sentence-less input yields
  *         "Neutral"; previously a null input crashed and an empty input was
  *         reported as "Very Negative".
  */
 def sentiment(tweets: String): String = {
   val sentimentText = Array("Very Negative", "Negative", "Neutral", "Positive", "Very Positive")
   // Index of "Neutral" in sentimentText; used when there is nothing to score.
   val neutralClass = 2

   val mainSentiment = Option(tweets).filter(_.trim.nonEmpty).fold(neutralClass) { text =>
     val sentences = sentimentPipeline
       .process(text)
       .get(classOf[CoreAnnotations.SentencesAnnotation])
       .asScala

     // Track (length of longest sentence so far, its predicted class).
     // The strict '>' keeps the first sentence when lengths tie.
     val (_, longestSentenceClass) =
       sentences.foldLeft((0, neutralClass)) { case ((longest, best), sentence) =>
         val sentenceLength = sentence.toString().length()
         if (sentenceLength > longest) {
           val tree = sentence.get(classOf[SentimentCoreAnnotations.SentimentAnnotatedTree])
           (sentenceLength, RNNCoreAnnotations.getPredictedClass(tree))
         } else {
           (longest, best)
         }
       }
     longestSentenceClass
   }

   sentimentText(mainSentiment)
 }
  
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

/** Spark UDF wrapper around `sentiment`, for use in DataFrame column expressions. */
def mysentiment = udf { (text: String) => sentiment(text) }

In [18]:
// Manual check: classify one sample news snippet with the sentiment function.
sentiment("Why Shares of Chinese Electric Car Maker NIO Are Flying High Today,news, What s happenin Shares of Chinese electric car maker NIO  NYSE NIO  were sharply higher on Wednesday morning after a Chinese business news outlet reported that the cash strapped company had secured new financing from a major automaker  As of 12 p m  EST  NIO s American depositary shares  ADS  were up about 16  from Tuesday s closing price  So what If this report is accurate and if the deal closes  then it s extremely bullish for NIO" )

In [19]:
// Number of articles per (ticker, releaseDate) pair.
val df3 = correctDateDF
  .groupBy(col("ticker"), col("releaseDate"))
  .count()


In [20]:
// Render the per-ticker, per-date article counts.
z.show(df3)

In [21]:
// Print the schema of the aggregated frame.
df3.printSchema()

In [22]:
// Add the day-of-week label of each release date (pattern "E"); the
// weekend-shifting cell further down compares it against "Sat" / "Sun".
val dfWithWeekNumber = df3.withColumn(
  "dayOfWeek",
  date_format(df3("releaseDate"), "E")
)

z.show(dfWithWeekNumber)

In [23]:
// Drop every row that has a null in any column ("any" is the default mode).
val noNulls = dfWithWeekNumber.na.drop()

z.show(noNulls)
noNulls.printSchema()

Shift news published on Saturday or Sunday to the next business day (Monday), since the stock market is closed on weekends.

In [25]:
// Move weekend news onto the following Monday: Saturday is shifted by two
// days, Sunday by one, and every other day keeps its release date.
//
// The shift is applied to `noNulls` rather than `dfWithWeekNumber`: the
// previous cell builds the null-free frame, but it was never used, so rows
// with a null ticker/releaseDate/dayOfWeek leaked into the final features.
//
// NOTE(review): date_add yields a date while the fallback branch is the
// original releaseDate column; the resulting column type relies on Spark's
// implicit type coercion - confirm it is the type wanted downstream.
val df4 = noNulls.withColumn(
  "shiftedDate",
  when(col("dayOfWeek") === "Sat", date_add(col("releaseDate"), 2))
    .when(col("dayOfWeek") === "Sun", date_add(col("releaseDate"), 1))
    .otherwise(col("releaseDate"))
)


In [26]:
// Print the first rows of the shifted frame as a plain-text table.
df4.show()

In [27]:
// Rows whose date was actually moved by the weekend shift.
val checkiIfWork = df4.where(df4("releaseDate") =!= df4("shiftedDate"))

Check that shifting dates to the next business day works as intended.

In [29]:
// Render only the rows where shiftedDate differs from releaseDate.
z.show(checkiIfWork)

In [30]:
// Persist the final features as CSV with a header row.
// The save mode is set explicitly: with the default mode the write fails with
// "path already exists" as soon as this cell is executed a second time, so the
// notebook could not be re-run. Overwrite replaces the previous export.
df4.write
  .mode("overwrite")
  .option("header", true)
  .csv("/tmp/news_data_cleaned/news_features_final")

In [31]:
// Render the final feature frame in the notebook.
z.show(df4)

In [32]:
// Restrict the features to the TSLA ticker.
val tsla = df4.where(df4("ticker") === "TSLA")

In [33]:
// Keep only the two columns needed for plotting: date and article count.
val plottslaNews = tsla.select("shiftedDate", "count")

In [34]:
// Order the TSLA series chronologically before rendering it.
val plottslaNewsSorted = plottslaNews.orderBy("shiftedDate")
z.show(plottslaNewsSorted)