# 0. Introduction

This notebook contains the following operations:

- Data Import
- Labelling
- Text Cleaning

# 1. Packages & Basic Settings

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import timedelta
import os

In [2]:
from text_processing import TextCleaner

In [3]:
data_path = '../data'

# 2. Data Import

## 2.1. News

In [4]:
# Import News df

df_filename='us_equities_news_dataset.csv'

df=pd.read_csv(os.path.join(data_path, 'input', df_filename))

# Visual check

df.head(2)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196


In [5]:
# Change date datatype

df['release_date']=pd.to_datetime(df['release_date'], format='%Y-%m-%d')

# Sort DF by date 

df.sort_values(by='release_date', ascending=True, inplace=True)


## 2.2. Stock Market Index Data

Since the large majority of US stocks involved the index choosen is the SP500

In [6]:
# Download SP500 values from Yahoo!

gspc=yf.download(['^GSPC'],df['release_date'].iloc[1]-timedelta(2),df['release_date'].iloc[-1]+timedelta(1)).reset_index()[['Date','Close']]

# Compute Returns

gspc['return']=gspc['Close'].pct_change()

# Visual Check 

gspc.head(2)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,Close,return
0,2008-10-01,1161.060059,
1,2008-10-02,1114.280029,-0.040291


# 3. Preprocessing

### Assumption

News and headlines in non-trading days are assigned to the the next trading day. Since 'Close' prices are used to compute returns this assumption rely on the fact that non-trading days news impact the following Close price

In [7]:
drop_duplicates_subset = [ 'title', 'category', 'release_date', 'provider']

df_complete = pd.merge_asof(df.drop_duplicates(subset=drop_duplicates_subset), gspc, right_on='Date', left_on='release_date', direction='forward').rename(columns={'Date':'trading_day'})

In [8]:
# Group news by the trading day (price) they have impact on  

daily_df=df_complete.groupby(by='trading_day').agg({'title':list, 'return':'max'}).reset_index().rename(columns={'title':'daily_headlines'})

daily_df.head(2)

Unnamed: 0,trading_day,daily_headlines,return
0,2008-10-02,[Nikkei down 1 4 pct on economy fears autos d...,-0.040291
1,2008-10-03,[FOREX Dollar poised for biggest weekly gain i...,-0.013507


# 4. Labelling

Financial returns based labelling

#### Binary

binary_label_t = 1 ('Positive') if ret_t >= 0 else 0 ('Negative')

#### Ternary

What return can be defined as "neutral"? 

This is a tricky question, in this case we assumed ( - 0.2% , + 0.2% ) as neutral interval

ternary_label_t = 2 ('Positive') if ret_t >= 0.002 / 1 ('Neutral') if -0.002< ret_t < 0.002 / 0 ('Negative) if ret_t < 0.002

In [9]:
# Binary Labelling (Positive/Negative)

daily_df['binary_label'] = daily_df['return'].map(lambda x: 1 if x>=0 else 0)

# Ternary Labelling (Positive/Neutral/Negative)

# Neutral interval ( - 0.2% , + 0.2% )

daily_df['ternary_label'] = daily_df['return'].map(lambda x: 2 if x >= 0.002 else 1 if x > -0.002 else 0)

# Quick check

daily_df[daily_df['ternary_label']== 1]['return'].max(), daily_df[daily_df['ternary_label']== 1]['return'].min()

(0.0019856558213586784, -0.0019995427986508885)

# 5. Text cleaning

This phase relies on an object designed to:

- remove stopwords
- remove punctuation
- remove numbers
- remove entities
- stem words

The result should be a cleaned bag-of-words containing all the (stemmed) words coming from all the headlines that affected the trading day outcome

In [10]:
text_cleaner = TextCleaner(spacy_vocab='en_core_web_sm',
                           stopwords_adjustments={'-':['above','below','up','down']}, # words having clear financial significance and thus should not be removed
                           remove_stopwords=True,
                           remove_numbers=True,
                           remove_recognized_entities=True,
                           remove_punct=True,
                           word_red='stem')


daily_df['clean_text'] = daily_df['daily_headlines'].map(text_cleaner.clean_ls)

# 6. Export

In [11]:
export_cols = ['trading_day', 'clean_text', 'return', 'binary_label', 'ternary_label']

preprocessed_df_filename = 'df_preprocessed.parquet'

In [12]:
daily_df[export_cols].to_parquet(os.path.join(data_path, 'intermediate', preprocessed_df_filename))