#0. Installing and Importing the packages

Installing packages

In [None]:
!pip install yfinance
#After this command we need to restart the Runtime (Ctrl+M or >Runtime>Restart Runtime)

Importing packages

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from datetime import datetime, timedelta
from nltk.stem.snowball import SnowballStemmer
import yfinance as yf 
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

#1. Importing data

In [3]:
text_df=pd.read_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis/us_equities_news_dataset.csv')
text_df.head(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249


#2. Data Pre-Processing

Focus on datatypes

In [9]:
#Get data types
text_df.dtypes

id                       int64
ticker                  string
title                   string
category                string
content                 string
release_date    datetime64[ns]
provider                string
url                     object
article_id               int64
dtype: object

In [10]:
#Let's change some of them 
text_df['release_date']=pd.to_datetime(text_df['release_date'],format='%Y-%m-%d')
text_df['title']=text_df['title'].astype('string')
text_df['content']=text_df['content'].astype('string')
text_df['category']=text_df['category'].astype('string')
text_df['ticker']=text_df['ticker'].astype('string')
text_df['provider']=text_df['provider'].astype('string')


#Is everything ok?
text_df.dtypes

id                       int64
ticker                  string
title                   string
category                string
content                 string
release_date    datetime64[ns]
provider                string
url                     object
article_id               int64
dtype: object

Chronological order sorting

In [11]:
#Sort values by date (from oldest to newest)
text_df=text_df.sort_values(by='release_date')
text_df.head(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
6235,227750,KYOCY,Nikkei down 1 4 pct on economy fears autos drop,news,Nikkei down 1 4 pct hits lowest point in ove...,2008-10-02,Reuters,https://www.investing.com/news/forex-news/nikk...,669
105598,327113,TGT,FOREX Dollar poised for biggest weekly gain in...,news,Money market squeeze ECB shift fuel dollar ...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/fore...,671
214228,435744,C,GLOBAL MARKETS Stocks rally on Wells Wachovia ...,news,U S stocks rally dollar gains on European w...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/glob...,682


In [12]:
#Check first and last date
first_df_date=text_df.iloc[0,5].date()
last_df_date=text_df.iloc[-1,5].date()
print(f'First day in the dataset:{first_df_date}')
print(f'First day in the dataset:{last_df_date}')

First day in the dataset:2008-10-02
First day in the dataset:2020-02-13


**Text Cleaning**

Articles and headlines

Since lemmatization and entities recognition procedures are particularly time-consuming we apply them only on headlines, while articles words are "simply" stemmed.

In [13]:
#Cleaning articles text

nltk.download('stopwords')
nltk.download('punkt')
stemmer = SnowballStemmer(language='english')
stops = set(stopwords.words('english'))

def language_processing_simple(col):
  """Removes stop words, numbers and then applies a stemmer

     Args:
     series (pd.Series or list): series formed by strings

     Returns:
     list 
  """ 
  new_col=[]
  for n in col:
    try:
      tokens_list = word_tokenize(n)
      words_only = [word for word in tokens_list if word.isalpha()]
      stm = [stemmer.stem(w.lower()) for w in words_only if not w.lower() in stops]
      new_col.append(' '.join(stm))
    except:
      new_col.append(np.nan)
  return new_col


text_df['Processed_Articles']=language_processing_simple(text_df['content'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
#Processing headlines

#First of all, we load the spacy "vocabulary"
nlp = spacy.load('en_core_web_sm')

def language_processing_lem(col):
  """Lemmatize and removes stop words, numbers and entities 

     Args:
     series (pd.Series or list): series formed by strings

     Returns:
     list 
  """    
  stopwords = spacy.lang.en.stop_words.STOP_WORDS
  new_col=[]
  for n in col:
    d=nlp(n)
    l=[w.lemma_ for w in d if (not w.is_stop) and (w.is_alpha) and (w.text not in [e.text for e in d.ents])]
    new_col.append(' '.join(l))
  return new_col

#Applying the function to the news in "title" 

text_df['Processed_Titles']=language_processing_lem(text_df['title'])

text_df.head(3)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Processed_Articles,Processed_Titles
6235,227750,KYOCY,Nikkei down 1 4 pct on economy fears autos drop,news,Nikkei down 1 4 pct hits lowest point in ove...,2008-10-02,Reuters,https://www.investing.com/news/forex-news/nikk...,669,nikkei pct hit lowest point year fear global e...,Nikkei pct economy fear auto drop
105598,327113,TGT,FOREX Dollar poised for biggest weekly gain in...,news,Money market squeeze ECB shift fuel dollar ...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/fore...,671,money market squeez ecb shift fuel dollar surg...,FOREX Dollar poise big gain year
214228,435744,C,GLOBAL MARKETS Stocks rally on Wells Wachovia ...,news,U S stocks rally dollar gains on European w...,2008-10-03,Reuters,https://www.investing.com/news/forex-news/glob...,682,u stock ralli dollar gain european weak well f...,GLOBAL markets stock rally Wells Wachovia deal...


#2. Labelling



We will label news using financial returns

In [None]:
#Labelling news through returns requires first of all the prices
all_prices = yf.download(list(text_df['ticker'].unique()), '2008-10-01','2020-02-14',progress=False)['Close']

In [16]:
#Get the daily returns linked to every text

def get_single_returns(texts_df,prices_df):
  """Recognizing date and ticker from the first dataset, this function get the daily returns using the prices in the second dataset

     Args:
     (DataFrame): dataframe containing news, reference ticker in the 'ticker' column and date in the 'release_date' column
     (DataFrame): dataframe with the prices(all Close prices for every ticker in the entire news timespan)

     Returns:
     list of returns
  """      
  rets=[]
  trading_days=[d for d in all_prices.index]
  for index,row in texts_df.iterrows():
    try:
      ticker=row['ticker']
      day=row['release_date']
      p=prices_df.loc[day,ticker]
      index = trading_days.index(day)
      previous_day = trading_days[index-1]
      previous_p=prices_df.loc[previous_day,ticker]
      r=(p-previous_p)/previous_p
      rets.append(r)
    except:
      rets.append(np.nan)
  return rets


text_df['Returns']=get_single_returns(text_df,all_prices)

text_df.tail()

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Processed_Articles,Processed_Titles,Returns
220030,441546,T,Why T Mobile s deal with Sprint could be the w...,news,T Mobile s deal with Sprint may usher in the n...,2020-02-13,CNBC,https://invst.ly/pubux,2083207,mobil deal sprint may usher next wave major u ...,T Mobile s deal warm wild decade merger,0.007098
209186,430702,C,6 Low Price to Book Stocks With Great Growth P...,opinion,It s not an easy job to find value stocks Bei...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/6-low-price...,200506740,easi job find valu stock awar compani key fina...,Low price Book Stocks great Growth prospect,-0.00416
215825,437341,CSCO,Cisco Systems CSCO Q2 Earnings And Revenues ...,opinion,Cisco Systems NASDAQ CSCO came out with quar...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/cisco-syste...,200507053,cisco system nasdaq csco came quarter earn per...,Cisco Systems Earnings Revenues Beat Estimates,-0.052273
220143,441659,T,Improve Your Retirement Income With These 3 To...,opinion,Here s a revealing data point older Americans...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/improve-you...,200507294,reveal data point older american scare outliv ...,improve Retirement Income Ranked Dividend stoc...,0.007098
217275,438791,TEVA,7 Stock Charts To Watch AVXL COLL FSLR LUN...,opinion,1 Anavex Life Sciences Corp AVXL Anavex...,2020-02-13,Harry Boxer,https://www.investing.com/analysis/7-stock-cha...,200507108,anavex life scienc corp avxl anavex life scien...,Stock Charts watch AVXL COLL FSLR PRPL TNDM,-0.065428


In [17]:
#Due to tickers and dates mismatch there could be a lot of missing values in the 'Returns' column

text_df.isna().sum()

id                        0
ticker                    0
title                     0
category                  0
content                   8
release_date              0
provider                  0
url                       0
article_id                0
Processed_Articles        8
Processed_Titles          0
Returns               37980
dtype: int64

In [18]:
#Even if the missing values are a lot we will drop all the corresponding rows because the dataset will maintain significant dimensions 
print('Dataset rows',len(text_df))
text_df=text_df.dropna()
print('Dataset rows after dropping missing values',len(text_df))
text_df.isna().sum()

Dataset rows 221513
Dataset rows after dropping missing values 183525


id                    0
ticker                0
title                 0
category              0
content               0
release_date          0
provider              0
url                   0
article_id            0
Processed_Articles    0
Processed_Titles      0
Returns               0
dtype: int64

From returns to labels in three different ways

In [19]:
#Binary (Positive/Negative)
def binary_labeling(ret_series):
  """Input: Returns time series 
     Output: Corresponding Binary Labels (2=Positive/0=Negative)

     Args:
     series (pd.Series or list): Time series of financial returns

     Returns:
     list : list of 2/0/np.nan
  """  
  labels=[]
  for r in ret_series:
    if r>=0:
      labels.append(2)
    elif r<0:
      labels.append(0)
    else:
      labels.append(np.nan)
  return labels

#Ternary (Positive/Neutral/Negative)

def ternary_labeling_fixed(ret_series,lim=0.0015):
  """Input: Returns time series 
     Output: Corresponding Ternary Labels (2=Positive/1=Neutral/0=Negative) assuming the following range (-lim , +lim) for neutral

     Args:
     series (pd.Series or list): Time series of financial returns

     Returns:
     list : list composed of 1/0/-1/np.nan
  """  
  labels=[]
  for r in ret_series:
    if r>=lim:
      labels.append(2)
    elif r<0.0015 and r>-lim:
      labels.append(1)
    elif r<=-lim:
      labels.append(0)
    else:
      labels.append(np.nan)
  return labels

def ternary_labeling_parametrized(ret_series,d=10):
  """Input: Returns time series 
     Output: Corresponding Ternary Labels (2=Positive/1=Neutral/0=Negative) assuming the following range (- (mean return + (std of returns /d)) , +(mean return + (std of returns /d))) for neutral

     Args:
     series (pd.Series or list): Time series of financial returns
     d (int) : d divides the series standard deviation to determine the neutral range as describe above (Output: ...)

     Returns:
     list : list composed of 2/1/0/np.nan
  """  
  labels=[]
  m=np.mean(ret_series)
  std=np.std(ret_series)
  b=(std/d)+m
  for r in ret_series:
    if r>=b:
      labels.append(2)
    elif r<b and r>-b:
      labels.append(1)
    elif r<=-b:
      labels.append(0)
    else:
      labels.append(np.nan)
  return labels


In [20]:
text_df['Binary_Labels']=binary_labeling(text_df['Returns'])
text_df['Ternary_Labels_fixed']=ternary_labeling_fixed(text_df['Returns'])
text_df['Ternary_Labels_par']=ternary_labeling_parametrized(text_df['Returns'])

In [21]:
text_df.tail()

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Processed_Articles,Processed_Titles,Returns,Binary_Labels,Ternary_Labels_fixed,Ternary_Labels_par
220030,441546,T,Why T Mobile s deal with Sprint could be the w...,news,T Mobile s deal with Sprint may usher in the n...,2020-02-13,CNBC,https://invst.ly/pubux,2083207,mobil deal sprint may usher next wave major u ...,T Mobile s deal warm wild decade merger,0.007098,2,2,2
209186,430702,C,6 Low Price to Book Stocks With Great Growth P...,opinion,It s not an easy job to find value stocks Bei...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/6-low-price...,200506740,easi job find valu stock awar compani key fina...,Low price Book Stocks great Growth prospect,-0.00416,0,0,1
215825,437341,CSCO,Cisco Systems CSCO Q2 Earnings And Revenues ...,opinion,Cisco Systems NASDAQ CSCO came out with quar...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/cisco-syste...,200507053,cisco system nasdaq csco came quarter earn per...,Cisco Systems Earnings Revenues Beat Estimates,-0.052273,0,0,0
220143,441659,T,Improve Your Retirement Income With These 3 To...,opinion,Here s a revealing data point older Americans...,2020-02-13,Zacks Investment Research,https://www.investing.com/analysis/improve-you...,200507294,reveal data point older american scare outliv ...,improve Retirement Income Ranked Dividend stoc...,0.007098,2,2,2
217275,438791,TEVA,7 Stock Charts To Watch AVXL COLL FSLR LUN...,opinion,1 Anavex Life Sciences Corp AVXL Anavex...,2020-02-13,Harry Boxer,https://www.investing.com/analysis/7-stock-cha...,200507108,anavex life scienc corp avxl anavex life scien...,Stock Charts watch AVXL COLL FSLR PRPL TNDM,-0.065428,0,0,0


In this way we have a single headline(article) - label correspondance

On the other side we can aggregate news daily and use a stock index to extraxt the overall market sentiment. In this case we can use the SP500.

In [26]:
#First, SP500 price time series

#We will get the time series with one more day before because we are interested in returns
sp = pd.DataFrame(yf.download('^GSPC', '2008-10-01','2020-02-14',progress=False)['Close'])
sp['date']=sp.index
sp.index=[i for i in range(0,len(sp),1)]
sp.head(3)

Unnamed: 0,Close,date
0,1161.060059,2008-10-01
1,1114.280029,2008-10-02
2,1099.22998,2008-10-03


In [27]:
#To Compute returns

def ret(series):
  """Input: Prices time series 
     Output: Returns computed through the following formula  R_t=(P_t - P_t-1)/P_t-1

     Args:
     series (pd.Series or list): Prices time series

     Returns:
     list : list starting with np.nan (we do not to have the price before t0)
  """
  rets=[np.nan]
  for j in range(len(series)-1):
    r=(series[j+1]-series[j])/series[j]
    rets.append(r)
    j+=1
  return rets

In [28]:
#Calculating Returns

sp['Returns']=ret(sp['Close'])

sp.head(3)

Unnamed: 0,Close,date,Returns
0,1161.060059,2008-10-01,
1,1114.280029,2008-10-02,-0.040291
2,1099.22998,2008-10-03,-0.013507


In [29]:
#To aggregate news daily combined with pd.DataFrame.groupby

def string_paster(s):
  """Input: series of strings (sentences for instance)
     Output: One single string merging all the input ones

     Args:
     series (pd.Series or list): series formed by strings

     Returns:
     str : a single string formed by the inputs paste
  """    
  result=''
  for e in s:
        result=result+' '+e
  return result




In [30]:
#Aggregating news daily

hls_by_day=pd.DataFrame(text_df.groupby('release_date')['Processed_Titles'].apply(string_paster))
hls_by_day['date']=hls_by_day.index 
hls_by_day.index=[i for i in range(0,len(hls_by_day),1)] 

#Merging and obtaining the sentiment

daily_hls_df=hls_by_day.merge(sp,how='inner',on='date')
daily_hls_df['Binary_Labels']=binary_labeling(daily_hls_df['Returns'])
daily_hls_df['Ternary_Labels_fixed']=ternary_labeling_fixed(daily_hls_df['Returns'])
daily_hls_df['Ternary_Labels_par']=ternary_labeling_parametrized(daily_hls_df['Returns'],5)

In [31]:
#The same with articles

art_by_day=pd.DataFrame(text_df.groupby('release_date')['Processed_Articles'].apply(string_paster))
art_by_day['date']=art_by_day.index 
art_by_day.index=[i for i in range(0,len(art_by_day),1)] 

daily_art_df=art_by_day.merge(sp,how='inner',on='date')
daily_art_df['Binary_Labels']=binary_labeling(daily_art_df['Returns'])
daily_art_df['Ternary_Labels_fixed']=ternary_labeling_fixed(daily_art_df['Returns'])
daily_art_df['Ternary_Labels_par']=ternary_labeling_parametrized(daily_art_df['Returns'],5)

#4. Exporting

Exporting everything for further use

In [None]:
text_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis/text_df.csv')
daily_art_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis/daily_articles_df.csv')
daily_hls_df.to_csv('/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis/daily_headlines_df.csv')

#5. Conclusion

We will end up with a series of possible text - label combinations to explore

1) Daily headlines - positive/negative sentiment

2) Daily headlines - positive/negative/neutral sentiment

3) Daily articles - positive/negative sentiment

4) Daily articles - positive/negative/neutral sentiment

5) Single headlines - positive/negative sentiment

6) Single headlines - positive/negative/neutral sentiment

7) Single articles - positive/negative sentiment

8) Single articles - positive/negative/neutral sentiment

In addition, we could employ the same combinations as before but removing opinions and keeping news only.

