# 0. Packages Import 

In [1]:
import numpy as np
import pandas as pd
import spacy
import yfinance as yf
from datetime import timedelta
import os

# 1. Data Import

## 1.1 News DataFrame

In [2]:
# Establish useful paths

input_path='../data/input/'

intermediate_path='../data/intermediate/'

In [3]:
# Import News df

df_filename='us_equities_news_dataset.csv'

df=pd.read_csv(os.path.join(input_path,df_filename))

# Shape Check

display(df.shape)

#Visual check

df.head(1)

(221513, 9)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327


In [4]:
# Base level cleaning

# Drop useless columns

df.drop(columns=['id','url','article_id'],inplace=True)

# Change date datatype

df['release_date']=pd.to_datetime(df['release_date'], format='%Y-%m-%d')

# Sort DF by date 

df.sort_values(by='release_date',ascending=True,inplace=True)

# Check NaN

df.isna().sum().to_frame().T

Unnamed: 0,ticker,title,category,content,release_date,provider
0,0,0,0,8,0,0


## 1.2 Index Data

In [5]:
# Download SP500 values from Yahoo!

gspc=yf.download(['^GSPC'],df['release_date'].iloc[1]-timedelta(2),df['release_date'].iloc[-1]+timedelta(1)).reset_index()[['Date','Close']]

# Compute Returns

gspc['Return']=gspc['Close'].pct_change()

# Visual Check 

gspc.head(2)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Close,Return
0,2008-10-01,1161.060059,
1,2008-10-02,1114.280029,-0.040291


# 2. Preprocessing Merge

In [6]:
# Group news by day

daily_df=df.groupby(by='release_date')['title'].apply(lambda x : ' '.join(x)).reset_index()

# Check shape

print(f'The shape of the df with news grouped daily is {daily_df.shape} \n')

# Visual check

daily_df.head(2)

The shape of the df with news grouped daily is (3641, 2) 



Unnamed: 0,release_date,title
0,2008-10-02,Nikkei down 1 4 pct on economy fears autos drop
1,2008-10-03,FOREX Dollar poised for biggest weekly gain in...


In [7]:
# Merge News (grouped daily) and SP500 returns 

df_complete=daily_df.merge(gspc,how='inner',left_on='release_date',right_on='Date')

# Check shape

print(f'The shape of the df considering trading days only is {daily_df.shape} \n')

# Visual Check 

df_complete.head(2)

The shape of the df considering trading days only is (3641, 2) 



Unnamed: 0,release_date,title,Date,Close,Return
0,2008-10-02,Nikkei down 1 4 pct on economy fears autos drop,2008-10-02,1114.280029,-0.040291
1,2008-10-03,FOREX Dollar poised for biggest weekly gain in...,2008-10-03,1099.22998,-0.013507


# 3. Labelling

In [8]:
# Binary Labelling (Positive/Negative)

df_complete['binary_label']=df_complete['Return'].map(lambda x: 1 if x>=0 else 0)

# Ternary Labelling (Positive/Neutral/Negative)

# Neutral interval ( - 0.2% , + 0.2%)

df_complete['ternary_label']=df_complete['Return'].map(lambda x: 2 if x >= 0.002 else 1 if x > -0.002 else 0)

# Quick check

df_complete[df_complete['ternary_label']== 1]['Return'].describe()

count    716.000000
mean       0.000008
std        0.001122
min       -0.002000
25%       -0.000960
50%        0.000011
75%        0.000909
max        0.001986
Name: Return, dtype: float64

# 4. Text Cleaning

In [9]:
# Load the spacy "vocabulary"

nlp = spacy.load('en_core_web_sm')

# Remove words with financial meaning from stopwords

for word in ['above','below','up','down']:
  spacy.lang.en.stop_words.STOP_WORDS.remove(word)

# Define the function to clean text

def language_processing(news):
  """Lemmatize and removes stop words, numbers and entities 

     Args:
     news (string) : text documents

     Returns:
     result (string) : clean text  
  """ 
  text_doc=nlp(news) 
  entities_to_remove=[e.text for e in text_doc.ents]
  words_list=[word.lemma_.lower() for word in text_doc if (not word.is_stop) and (word.is_alpha) and (word.text not in entities_to_remove)]
  result=' '.join(words_list)
  return result

In [10]:
# Apply the function

df_complete['clean_text']=df_complete['title'].map(language_processing)

# Visual check

df_complete.head(1)

Unnamed: 0,release_date,title,Date,Close,Return,binary_label,ternary_label,clean_text
0,2008-10-02,Nikkei down 1 4 pct on economy fears autos drop,2008-10-02,1114.280029,-0.040291,0,0,nikkei down pct economy fear auto drop


# 5. Export

In [11]:
df_complete_filename='df_clean_text.parquet'

df_complete.to_parquet(os.path.join(intermediate_path,df_complete_filename))