#0. Installing and Importing the packages

In [None]:
import pandas as pd
import numpy as np
import spacy
from datetime import datetime, timedelta
from nltk.stem.snowball import SnowballStemmer
import yfinance as yf 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

#1. Importing data

In [2]:
text_df=pd.read_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis_Final/us_equities_news_dataset.csv')
text_df.head(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249


In [4]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221513 entries, 0 to 221512
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            221513 non-null  int64 
 1   ticker        221513 non-null  object
 2   title         221513 non-null  object
 3   category      221513 non-null  object
 4   content       221505 non-null  object
 5   release_date  221513 non-null  object
 6   provider      221513 non-null  object
 7   url           221513 non-null  object
 8   article_id    221513 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 15.2+ MB


#2. Data Pre-Processing

##2.0 Data Types

In [5]:
#Get data types
text_df.dtypes

id               int64
ticker          object
title           object
category        object
content         object
release_date    object
provider        object
url             object
article_id       int64
dtype: object

In [6]:
#Change some data types
#Dates are parsed with an explicit ISO format; every textual column gets pandas' dedicated string dtype
text_df['release_date']=pd.to_datetime(text_df['release_date'],format='%Y-%m-%d')
text_df=text_df.astype({
    'title':'string',
    'content':'string',
    'category':'string',
    'ticker':'string',
    'provider':'string',
})


#Check
text_df.dtypes

id                       int64
ticker                  string
title                   string
category                string
content                 string
release_date    datetime64[ns]
provider                string
url                     object
article_id               int64
dtype: object

In [7]:
#Sort values by date (from oldest to newest)
text_df=text_df.sort_values(by='release_date')

In [8]:
#Check first and last date
#The frame was sorted by 'release_date' in the previous cell, so the first/last rows hold the oldest/newest dates.
#The column is addressed by name (not by position) so the check survives any column reordering.
first_df_date=text_df['release_date'].iloc[0].date()
last_df_date=text_df['release_date'].iloc[-1].date()
print(f'First day in the dataset:{first_df_date}')
print(f'Last day in the dataset:{last_df_date}')

First day in the dataset:2008-10-02
First day in the dataset:2020-02-13


##2.1 Text Cleaning

Articles and headlines

Since lemmatization and entity recognition are particularly time-consuming, we apply them only to headlines, while article words are "simply" stemmed.

In [9]:
#Cleaning articles text

nltk.download('stopwords')
nltk.download('punkt')
stemmer = SnowballStemmer(language='english')
stops = set(stopwords.words('english'))

def language_processing_simple(col):
  """Removes stop words, numbers and then applies a stemmer

     Each text is tokenized with word_tokenize; only purely alphabetic tokens are kept
     (this drops numbers and punctuation), English stop words are removed
     (case-insensitively) and the remaining words are lower-cased and stemmed.

     Args:
     col (pd.Series or list): series formed by strings; non-string entries
                              (e.g. missing values) are allowed

     Returns:
     list : one element per input, in the same order - the stemmed words joined by
            single spaces, or np.nan where the input was not a string
  """
  new_col=[]
  for n in col:
    #Missing texts (NaN / pd.NA / None) are flagged explicitly instead of being caught by a
    #blanket except, so real failures (missing NLTK data, a kernel interrupt) are no longer
    #silently turned into NaN
    if not isinstance(n, str):
      new_col.append(np.nan)
      continue
    lowered = [token.lower() for token in word_tokenize(n) if token.isalpha()]
    new_col.append(' '.join(stemmer.stem(w) for w in lowered if w not in stops))
  return new_col

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
text_df['Processed_Articles']=language_processing_simple(text_df['content'])

In [13]:
#Processing headlines

#First of all, we load the spacy "vocabulary"
nlp = spacy.load('en_core_web_sm')

def language_processing_lem(col):
  """Lemmatize and removes stop words, numbers and entities

     Every text is parsed with the module-level spaCy pipeline `nlp`. A token is kept
     only if it is not a stop word, is purely alphabetic and does not belong to a
     named entity; the kept tokens are replaced by their lower-cased lemma.

     Args:
     col (pd.Series or list): series formed by strings

     Returns:
     list : one string per input, in the same order, made of the kept lemmas
            joined by single spaces
  """
  new_col=[]
  for n in col:
    d=nlp(n)
    #ent_type_ is empty for tokens outside any named entity. Testing the token itself
    #(rather than comparing its text with whole-entity texts) also removes every word of
    #a multi-word entity, and avoids rebuilding the entity list for each token.
    l=[w.lemma_.lower() for w in d if (not w.is_stop) and w.is_alpha and (not w.ent_type_)]
    new_col.append(' '.join(l))
  return new_col

In [14]:
text_df['Processed_Titles']=language_processing_lem(text_df['title'])

text_df.head(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Processed_Articles,Processed_Titles
6235,227750,KYOCY,Nikkei down 1 4 pct on economy fears autos drop,news,Nikkei down 1 4 pct hits lowest point in ove...,2008-10-02,Reuters,https://www.investing.com/news/forex-news/nikk...,669,nikkei pct hit lowest point year fear global e...,nikkei pct economy fear auto drop
105598,327113,TGT,FOREX Dollar poised for biggest weekly gain in...,news,Money market squeeze ECB shift fuel dollar ...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/fore...,671,money market squeez ecb shift fuel dollar surg...,forex dollar poise big gain year
214228,435744,C,GLOBAL MARKETS Stocks rally on Wells Wachovia ...,news,U S stocks rally dollar gains on European w...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/glob...,682,u stock ralli dollar gain european weak well f...,global markets stock rally wells wachovia deal...


##2.2 Financial Returns

We will label news through financial returns

In [None]:
#Labelling news through returns requires first of all the prices
all_prices = yf.download(list(text_df['ticker'].unique()), '2008-10-01','2020-02-14',progress=False)['Close']

In [16]:
#Get the daily returns linked to every text

def get_single_returns(texts_df,prices_df,tickers_col='ticker',date_col='release_date'):
  """Recognizing date and ticker from the first dataset, this function gets the daily returns using the prices in the second dataset

     The return of a text is (P_t - P_prev)/P_prev, where P_t is the price of the text's
     ticker on the text's date and P_prev is the price on the previous row of prices_df.

     Args:
     texts_df (DataFrame): dataframe containing news, with the reference ticker in
                           `tickers_col` and the date in `date_col`
     prices_df (DataFrame): dataframe with the prices, indexed by trading day in
                            chronological order, one column per ticker
     tickers_col (str): name of the ticker column in texts_df
     date_col (str): name of the date column in texts_df

     Returns:
     list of returns, one per row of texts_df and in the same order. The value is np.nan
     when the date is not in prices_df, when the ticker is not in prices_df, or when the
     date is the first row of prices_df (no previous price available).
  """
  #Trading days come from the prices actually passed in (not from a notebook-level variable)
  trading_days=list(prices_df.index)
  #Day -> position lookup built once, instead of a linear list search for every text
  first_position={}
  for pos,trading_day in enumerate(trading_days):
    first_position.setdefault(trading_day,pos)

  rets=[]
  for _,row in texts_df.iterrows():
    ticker=row[tickers_col]
    day=row[date_col]
    pos=first_position.get(day)
    #pos==0 must be excluded explicitly: trading_days[-1] would silently pick the LAST day
    if pos is None or pos==0:
      rets.append(np.nan)
      continue
    try:
      p=prices_df.loc[day,ticker]
      previous_p=prices_df.loc[trading_days[pos-1],ticker]
    except (KeyError,TypeError):
      #Ticker without a price column (or an unusable label): no return can be computed
      rets.append(np.nan)
      continue
    rets.append((p-previous_p)/previous_p)
  return rets

In [17]:
text_df['Returns']=get_single_returns(text_df,all_prices)

text_df.tail(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Processed_Articles,Processed_Titles,Returns
217764,439280,WFC,Comerica taps Wells Fargo exec to head wealth ...,news,Comerica CMA 1 4 names Greg Carr as new he...,2020-02-13,Seeking Alpha,https://invst.ly/pu9ob,2083018,comerica cma name greg carr new head wealth ma...,comerica tap wells fargo exec head wealth mana...,0.006905
217766,439282,WFC,Wells Fargo ends forced arbitration for sexual...,news,By Imani Moise Reuters Wells Fargo NYSE...,2020-02-13,Reuters,https://www.investing.com/news/stock-market-ne...,2083147,imani mois reuter well fargo nyse wfc co wedne...,wells fargo end force arbitration sexual haras...,0.006905
217275,438791,TEVA,7 Stock Charts To Watch AVXL COLL FSLR LUN...,opinion,1 Anavex Life Sciences Corp AVXL Anavex...,2020-02-13,Harry Boxer,https://www.investing.com/analysis/7-stock-cha...,200507108,anavex life scienc corp avxl anavex life scien...,stock charts watch avxl coll fslr prpl tndm,-0.065428


In [18]:
#Due to tickers and dates mismatch there could be a lot of missing values in the 'Returns' column

text_df.isna().sum()

id                        0
ticker                    0
title                     0
category                  0
content                   8
release_date              0
provider                  0
url                       0
article_id                0
Processed_Articles        8
Processed_Titles          0
Returns               37980
dtype: int64

In [19]:
#Even if the missing values are a lot we will drop all the corresponding rows because the dataset will maintain significant dimensions 
text_df=text_df.dropna()
text_df.isna().sum()

id                    0
ticker                0
title                 0
category              0
content               0
release_date          0
provider              0
url                   0
article_id            0
Processed_Articles    0
Processed_Titles      0
Returns               0
dtype: int64

In this way we have a single headline (article) - return correspondence

On the other hand, we can aggregate news daily and use a stock index to extract the overall market sentiment. In this case we go for the S&P 500.

##2.3 Daily aggregation

In [20]:
#First, SP500 price time series

#We will get the time series with one more day before because we are interested in returns
#Download the index closes, then move the dates from the index into a 'date' column
#and replace the index with plain 0..n-1 positions
sp = yf.download('^GSPC', '2008-10-01','2020-02-14',progress=False)['Close']
sp = pd.DataFrame(sp)
sp['date']=sp.index
sp.index=list(range(len(sp)))
sp.head(3)

Unnamed: 0,Close,date
0,1161.060059,2008-10-01
1,1114.280029,2008-10-02
2,1099.22998,2008-10-03


In [21]:
#To Compute returns

def ret(series):
  """Input: Prices time series
     Output: Returns computed through the following formula  R_t=(P_t - P_t-1)/P_t-1

     Args:
     series (pd.Series or list): Prices time series, in chronological order

     Returns:
     list : same length as the input, starting with np.nan (we assume not to have
            the price before t0); an empty input gives an empty list
  """
  #Work on a plain array so prices are read by position, whatever the index labels of a Series are
  prices=np.asarray(series,dtype=float)
  if prices.size==0:
    return []
  #Consecutive price differences divided by the earlier price of each pair
  changes=np.diff(prices)/prices[:-1]
  return [np.nan]+changes.tolist()

In [22]:
#Calculating Returns

sp['Returns']=ret(sp['Close'])

sp.head(3)

Unnamed: 0,Close,date,Returns
0,1161.060059,2008-10-01,
1,1114.280029,2008-10-02,-0.040291
2,1099.22998,2008-10-03,-0.013507


In [23]:
#To aggregate news daily combined with pd.DataFrame.groupby

def string_paster(s):
  """Merge several strings (sentences for instance) into a single one

     Args:
     s (pd.Series or list): strings to merge, taken in order

     Returns:
     str : every input string preceded by one space and appended in order,
           so a non-empty result starts with a space and an empty input gives ''
  """
  pasted=''
  for chunk in s:
    pasted+=' '+chunk
  return pasted

In [24]:
#Aggregating news daily

#One row per release date: all processed headlines of that day pasted into one string
hls_by_day=(
    text_df.groupby('release_date')['Processed_Titles']
    .apply(string_paster)
    .to_frame()
)
#Keep the date as a regular column and fall back to a 0..n-1 index
hls_by_day['date']=hls_by_day.index
hls_by_day.index=list(range(len(hls_by_day)))

In [25]:
#The same with articles

#One row per release date: all processed articles of that day pasted into one string
art_by_day=(
    text_df.groupby('release_date')['Processed_Articles']
    .apply(string_paster)
    .to_frame()
)
#Keep the date as a regular column and fall back to a 0..n-1 index
art_by_day['date']=art_by_day.index
art_by_day.index=list(range(len(art_by_day)))


In [26]:
daily_df=art_by_day.merge(hls_by_day,how='inner',on='date').merge(sp,how='inner',on='date')

In [27]:
daily_df.head(3)

Unnamed: 0,Processed_Articles,date,Processed_Titles,Close,Returns
0,nikkei pct hit lowest point year fear global ...,2008-10-02,nikkei pct economy fear auto drop,1114.280029,-0.040291
1,money market squeez ecb shift fuel dollar sur...,2008-10-03,forex dollar poise big gain year global marke...,1099.22998,-0.013507
2,stock plung european bank action stir new fea...,2008-10-06,global markets stock worldwide oil fall crisi...,1056.890015,-0.038518


We proceed as before, but aggregate news only (removing titles/articles in the opinion category)

Are opinions able to impact the market? This question is the rationale behind this slicing.

In [28]:
news_only=text_df[text_df['category']=='news']

In [29]:
#One row per release date: all processed headlines (news category only) of that day pasted into one string
news_hl_by_day=(
    news_only.groupby('release_date')['Processed_Titles']
    .apply(string_paster)
    .to_frame()
)
#Keep the date as a regular column and fall back to a 0..n-1 index
news_hl_by_day['date']=news_hl_by_day.index
news_hl_by_day.index=list(range(len(news_hl_by_day)))

In [30]:
#One row per release date: all processed articles (news category only) of that day pasted into one string
news_art_by_day=(
    news_only.groupby('release_date')['Processed_Articles']
    .apply(string_paster)
    .to_frame()
)
#Keep the date as a regular column and fall back to a 0..n-1 index
news_art_by_day['date']=news_art_by_day.index
news_art_by_day.index=list(range(len(news_art_by_day)))

In [31]:
daily_news_df=news_art_by_day.merge(news_hl_by_day,how='inner',on='date').merge(sp,how='inner',on='date')

In [32]:
daily_news_df.head(3)

Unnamed: 0,Processed_Articles,date,Processed_Titles,Close,Returns
0,nikkei pct hit lowest point year fear global ...,2008-10-02,nikkei pct economy fear auto drop,1114.280029,-0.040291
1,money market squeez ecb shift fuel dollar sur...,2008-10-03,forex dollar poise big gain year global marke...,1099.22998,-0.013507
2,stock plung european bank action stir new fea...,2008-10-06,global markets stock worldwide oil fall crisi...,1056.890015,-0.038518


#4. Exporting

Exporting everything for further use

In [33]:
text_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis_Final/text_df.csv')
daily_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis_Final/daily.csv')
daily_news_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis_Final/daily_news.csv')

#5. Conclusion

We will end up with a series of texts to explore

In fact we have 2 (single,daily) * 2 (headlines,articles) * 2 (news+opinions, news only) = 8 possible textual inputs