In [1]:
# Mount Google Drive into the Colab runtime; the next cell changes into a folder
# under /content/drive, so this has to succeed first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder so that the relative paths used below
# ('datasets/Predictors/...', './clean/...') resolve.
# NOTE(review): presumably this is also where sentiment_analysis.py lives — confirm.
%cd /content/drive/MyDrive/ML4FINANCE

/content/drive/MyDrive/ML4FINANCE


In [3]:
import torch
from transformers import pipeline
from sentiment_analysis import *

# Sentiment Analysis Example

In [None]:
# Toy three-row frame used to sanity-check the sentiment pipeline end to end
# before running it on real data. The first text carries trailing noise
# ("@salut #") and the last one a trailing newline — presumably there to
# exercise the cleaning step.
df = pd.DataFrame(
    {
        'time': ['2021-07-15', '2020-07-12', '2000-08-15'],
        'gvkey': [312, 12, 90],
        'text': [
            "Stock markets rallied after the company reported better-than-expected quarterly earnings. @salut #",
            "Apple stock is expected to be stable",
            "Amazon stock will highly fall\n",
        ],
    }
).assign(time=lambda frame: pd.to_datetime(frame['time']))  # parse the date strings into datetimes
df

Unnamed: 0,time,gvkey,text
0,2021-07-15,312,Stock markets rallied after the company report...
1,2020-07-12,12,Apple stock is expected to be stable
2,2000-08-15,90,Amazon stock will highly fall\n


In [None]:
# ReportsProcessing is constructed from a parquet path (see the next cell), so the
# toy frame is persisted to disk first.
# NOTE(review): this writes next to the real predictor files, and the
# 'datasets/Predictors' directory must already exist.
df.to_parquet('datasets/Predictors/df.parquet')

In [None]:
# Build the processor from the toy parquet file and show the frame it loaded.
# NOTE(review): the second argument ('clean') looks like the output folder later
# read back via './clean/...' — confirm against sentiment_analysis.py.
reports = ReportsProcessing('datasets/Predictors/df.parquet', 'clean')
reports.data

Unnamed: 0,time,gvkey,text
0,2021-07-15,312,Stock markets rallied after the company report...
1,2020-07-12,12,Apple stock is expected to be stable
2,2000-08-15,90,Amazon stock will highly fall\n


In [None]:
# Run the text-cleaning step. Judging by the frame displayed in the next cell, it
# replaces `text` with a lower-cased `cleaned_text` column and strips the trailing newline.
reports.clean_reports()

Cleaning Reports: 100%|██████████| 3/3 [00:00<00:00, 3175.10reports/s]


In [None]:
# Inspect the frame after cleaning (the `text` column is now `cleaned_text`).
reports.data

Unnamed: 0,time,gvkey,cleaned_text
0,2021-07-15,312,stock markets rallied after the company report...
1,2020-07-12,12,apple stock is expected to be stable
2,2000-08-15,90,amazon stock will highly fall


In [None]:
# Perform sentiment analysis on the cleaned toy frame. The call prints a summary
# with the number of rows and the average score per class (positive / neutral / negative).
# NOTE(review): the printed summary says "tweets" although the inputs here are
# reports; that wording comes from the sentiment_analysis module.
reports.reports_sentiment_analysis()

Device set to use cpu
Detecting sentiments: 100%|██████████| 1/1 [00:00<00:00,  3.16it/s]

Sentiment Analysis Report:
	- Positive: 1 tweets (Avg. score: 0.93)
	- Neutral: 1 tweets (Avg. score: 0.94)
	- Negative: 1 tweets (Avg. score: 0.75)





In [None]:
# Persist the processed toy frame.
# NOTE(review): output location and file name are decided inside ReportsProcessing — confirm there.
reports.save_clean_data()

# 10-K Reports

## Preprocessing

In [None]:
# Load the raw 10-K text extracts; the displayed rows all start with the
# "Item 7 – Management's Discussion and Analysis" heading.
tenkreports = pd.read_parquet('datasets/Predictors/mda_text.parquet')

In [None]:
# Inspect the raw frame: the dates are still compact YYYYMMDD values at this point.
tenkreports

Unnamed: 0,submission_type,filing_date,report_date,report_year,cik,company_conformed_name,text
0,10-K,20160311,20151231,2015,0000002178,"ADAMS RESOURCES & ENERGY, INC.",Item 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
1,10-K,20160218,20151226,2015,0000002488,ADVANCED MICRO DEVICES INC,ITEM 7.\n\nMANAGEMENT'S DISCUSSION AND ANALYSI...
2,10-K,20160216,20151231,2015,0000003499,ALEXANDERS INC,ITEM 7. management's discussion and analys...
3,10-K,20151210,20150930,2015,0000003545,ALICO INC,Item 7. Management's Discussion and Analysis o...
4,10-K,20161206,20160930,2016,0000003545,ALICO INC,Item 7. Management's Discussion and Analysis o...
...,...,...,...,...,...,...,...
8317,10-K,20161108,20160731,2016,0001550053,"Grand Perfecta, Inc.",ITEM 7. MANAGEMENT'S DISCUSSION AND ANAL...
8318,10-K,20161114,20160930,2016,0001023459,SIMULATIONS PLUS INC,ITEM 7 – MANAGEMENT'S\nDISCUSSION AND ANALYSIS...
8319,10-K,20161213,20160831,2016,0001343009,Cannabics Pharmaceuticals Inc.,Item 7. Management's\nDiscussion and Analysis ...
8320,10-K,20161214,20160831,2016,0001313938,"Sibannac, Inc.",ITEM 7. MANAGEMENT'S DISCUSSION AND\nANALYSIS ...


In [None]:
# Number of reports per fiscal year, to see which years have enough coverage.
tenkreports['report_year'].value_counts()

report_year
2015    6467
2016    1684
2014     131
2013      24
2012      10
2011       2
2010       2
2008       1
2009       1
Name: count, dtype: int64

In [None]:
# Normalise the raw 10-K frame and persist it as the input of the sentiment step.
#
# The two date columns hold compact YYYYMMDD values (e.g. 20151231). The format is
# given explicitly so that parsing never depends on inference: without it, a
# numeric column would be interpreted as an epoch offset and silently yield 1970 dates.
tenkreports['report_date'] = pd.to_datetime(tenkreports['report_date'], format='%Y%m%d')
tenkreports['filing_date'] = pd.to_datetime(tenkreports['filing_date'], format='%Y%m%d')
# Parse the year through datetime (rejects anything that is not a %Y value), then keep the integer year.
tenkreports['report_year'] = pd.to_datetime(tenkreports['report_year'], format='%Y').dt.year
# Remove the 'submission_type' column as it is 10K for every row.
# errors='ignore' keeps the cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
tenkreports.drop(columns=['submission_type'], inplace=True, errors='ignore')
tenkreports.to_parquet('datasets/Predictors/tenkreports_clean.parquet')

## Sentiment Analysis

In [4]:
# Rebind `reports` to a processor over the preprocessed 10-K file (this replaces
# the toy instance created in the example section) and show the loaded frame.
reports = ReportsProcessing('datasets/Predictors/tenkreports_clean.parquet', 'clean')
reports.data

Unnamed: 0,filing_date,report_date,report_year,cik,company_conformed_name,text
0,2016-03-11,2015-12-31,2015,0000002178,"ADAMS RESOURCES & ENERGY, INC.",Item 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
1,2016-02-18,2015-12-26,2015,0000002488,ADVANCED MICRO DEVICES INC,ITEM 7.\n\nMANAGEMENT'S DISCUSSION AND ANALYSI...
2,2016-02-16,2015-12-31,2015,0000003499,ALEXANDERS INC,ITEM 7. management's discussion and analys...
3,2015-12-10,2015-09-30,2015,0000003545,ALICO INC,Item 7. Management's Discussion and Analysis o...
4,2016-12-06,2016-09-30,2016,0000003545,ALICO INC,Item 7. Management's Discussion and Analysis o...
...,...,...,...,...,...,...
8317,2016-11-08,2016-07-31,2016,0001550053,"Grand Perfecta, Inc.",ITEM 7. MANAGEMENT'S DISCUSSION AND ANAL...
8318,2016-11-14,2016-09-30,2016,0001023459,SIMULATIONS PLUS INC,ITEM 7 – MANAGEMENT'S\nDISCUSSION AND ANALYSIS...
8319,2016-12-13,2016-08-31,2016,0001343009,Cannabics Pharmaceuticals Inc.,Item 7. Management's\nDiscussion and Analysis ...
8320,2016-12-14,2016-08-31,2016,0001313938,"Sibannac, Inc.",ITEM 7. MANAGEMENT'S DISCUSSION AND\nANALYSIS ...


In [5]:
# Clean the 10-K texts (about 30 seconds for the 8,322 reports, per the progress bar below).
reports.clean_reports()

Cleaning Reports: 100%|██████████| 8322/8322 [00:29<00:00, 284.83reports/s]


In [6]:
# Inspect the cleaned frame: `text` is now `cleaned_text`, lower-cased and mostly
# without the leading "item 7." heading (row 8318 still keeps "item 7 –").
reports.data

Unnamed: 0,filing_date,report_date,report_year,cik,company_conformed_name,cleaned_text
0,2016-03-11,2015-12-31,2015,0000002178,"ADAMS RESOURCES & ENERGY, INC.",management's discussion and analysis of financ...
1,2016-02-18,2015-12-26,2015,0000002488,ADVANCED MICRO DEVICES INC,management's discussion and analysis of financ...
2,2016-02-16,2015-12-31,2015,0000003499,ALEXANDERS INC,management's discussion and analysis of financ...
3,2015-12-10,2015-09-30,2015,0000003545,ALICO INC,management's discussion and analysis of financ...
4,2016-12-06,2016-09-30,2016,0000003545,ALICO INC,management's discussion and analysis of financ...
...,...,...,...,...,...,...
8317,2016-11-08,2016-07-31,2016,0001550053,"Grand Perfecta, Inc.",management's discussion and analysis of financ...
8318,2016-11-14,2016-09-30,2016,0001023459,SIMULATIONS PLUS INC,item 7 – management's discussion and analysis ...
8319,2016-12-13,2016-08-31,2016,0001343009,Cannabics Pharmaceuticals Inc.,management's discussion and analysis of financ...
8320,2016-12-14,2016-08-31,2016,0001313938,"Sibannac, Inc.",management's discussion and analysis of financ...


In [8]:
# Score every 10-K text. This is slow on CPU: the recorded run took about 16 minutes
# for 66 batches. The printed summary shows the result is dominated by the
# neutral class (7,946 of 8,322 reports).
reports.reports_sentiment_analysis()

Device set to use cpu
Detecting sentiments: 100%|██████████| 66/66 [16:17<00:00, 14.81s/it]


Sentiment Analysis Report:
	- Positive: 220 tweets (Avg. score: 0.75)
	- Neutral: 7946 tweets (Avg. score: 0.91)
	- Negative: 156 tweets (Avg. score: 0.81)


In [13]:
# Order the scored reports chronologically by fiscal period end (in place).
# NOTE(review): `filing_date` can be years later than `report_date` (the saved
# output shows a 2008-12-31 report filed on 2016-08-24). If these scores feed a
# predictive model, the filing date is when the text became public — confirm
# which date should be used as the timestamp to avoid look-ahead bias.
reports.data.sort_values(by='report_date', inplace=True)

In [15]:
# Persist the scored and sorted 10-K frame.
# NOTE(review): output location and file name are decided inside ReportsProcessing — confirm there.
reports.save_clean_data()

In [5]:
# Read the saved 10-K sentiment file back to check its contents; it carries the
# extra `sentiment` and `sentiment_score` columns.
# NOTE(review): this rebinds `df` (previously the toy frame), and the execution
# counter shows the cell was run in a different kernel session from the cells above.
df = pd.read_parquet('./clean/sentiment_tenkreports.parquet')
df

Unnamed: 0,filing_date,report_date,report_year,cik,company_conformed_name,cleaned_text,sentiment,sentiment_score
0,2016-08-24,2008-12-31,2008,0001110648,"IMAGE PROTECT, INC.",financial statements financial statements inde...,neutral,0.936196
1,2016-07-28,2009-09-30,2009,0001316854,Uranium Hunter CORP,financial statements. see the financial statem...,neutral,0.939092
2,2016-07-28,2010-09-30,2010,0001316854,Uranium Hunter CORP,financial statements. see the financial statem...,neutral,0.939092
3,2016-08-19,2010-10-31,2010,0001334325,"Coda Octopus Group, Inc.",item 7. management's discussion and analysis o...,neutral,0.944394
4,2016-03-25,2011-05-31,2011,0001438672,BAKKEN ENERGY CORP.,management's discussion and analysis of financ...,neutral,0.937800
...,...,...,...,...,...,...,...,...
8317,2016-12-12,2016-10-31,2016,0000883241,SYNOPSYS INC,management's discussion and analysis of financ...,neutral,0.947523
8318,2016-12-07,2016-10-31,2016,0000715446,ITUS Corp,management's discussion and analysis of financ...,neutral,0.824702
8319,2016-12-27,2016-10-31,2016,0001312073,"VERIFONE SYSTEMS, INC.",management's discussion and analysis of financ...,neutral,0.937117
8320,2016-12-19,2016-10-31,2016,0000027673,DEERE JOHN CAPITAL CORP,management's discussion and analysis of financ...,neutral,0.942376


We only have reports dated from 2008 to 2016, and only 2015 and 2016 contain a sufficient number of reports.

Idée : le sentiment analysis ne retourne qu'un seul sentiment (positif, négatif ou neutre), donc il n'y a pas vraiment de data leakage. Dans les faits, certains datasets parlent des performances, mais on n'est pas dans le data leakage car notre sentiment analysis ne ressort pas de chiffres, non ?

# Earnings Calls

## Preprocessing

In [None]:
# Load the raw earnings-call transcripts: one row per transcript component
# (operator message, speech, question, answer, ...), about 12.5 million rows.
earnings_calls = pd.read_parquet('datasets/Predictors/earnings_calls.parquet')

In [None]:
# Inspect the raw frame; the rows are not in chronological or component order.
earnings_calls

Unnamed: 0,transcriptid,componentorder,componenttext,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename
0,3285682,0,"Good morning, everyone, and welcome to the LXI...",2020-11-23,323562,LXI REIT plc,Presentation Operator Message
1,50630,5,"Thanks, Bob. In summary 2009 was a pivotal yea...",2010-02-18,063083,Endologix LLC,Presenter Speech
2,840499,103,"Just one thing left to ask, actually. If we ta...",2015-07-28,002410,BP p.l.c.,Question
3,47300,58,"I said, I think it certainly has been a rare o...",2010-01-27,009317,SEI Investments Company,Answer
4,49778,1,<strong>Operator</strong>\nLadies and gentleme...,2010-02-12,176660,3SBio Inc.,Presentation Section
...,...,...,...,...,...,...,...
12475891,1220890,45,So a key driver of the improvement last year a...,2017-05-11,220426,Zurich Insurance Group AG,Answer
12475892,1220890,43,"On the rate increase, I mean if you look at th...",2017-05-11,220426,Zurich Insurance Group AG,Answer
12475893,1220890,40,"So I'm going to apologize in advance, Ralph. I...",2017-05-11,220426,Zurich Insurance Group AG,Answer
12475894,1220890,37,"Yes, thanks. So there -- I mean, given what I ...",2017-05-11,220426,Zurich Insurance Group AG,Answer


In [None]:
# Check column types: the call date is already a datetime and `gvkey` is stored as a string.
earnings_calls.dtypes

transcriptid                            int64
componentorder                          int64
componenttext                          object
mostimportantdateutc           datetime64[us]
gvkey                                  object
companyname                            object
transcriptcomponenttypename            object
dtype: object

In [None]:
# One-off step, kept commented out: sort by call date, then transcript, then
# component order. Presumably this produced the "sorted" file loaded further down — confirm.
#earnings_calls.sort_values(by=['mostimportantdateutc','transcriptid', 'componentorder'], inplace = True)

In [None]:
# One-off step, kept commented out: persist the sorted frame to the path that is read back below.
#earnings_calls.to_parquet('datasets/Predictors/earnings_calls_sorted.parquet')

In [None]:
# Load the pre-sorted transcripts instead of re-sorting the 12.5M rows on every run.
earnings_calls_sorted = pd.read_parquet('datasets/Predictors/earnings_calls_sorted.parquet')

In [None]:
# Inspect the sorted frame: rows run from 2010-01-04 to 2020-12-30 and keep their original index labels.
earnings_calls_sorted

Unnamed: 0,transcriptid,componentorder,componenttext,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename
198438,46682,1,"Good afternoon, everyone, and welcome to Merix...",2010-01-04,030247,Viasystems Corporation,Presentation Operator Message
1610990,46682,2,"Thank you. Happy New Year, everyone. In Octobe...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
1610986,46682,3,"Thanks, Mike, and good afternoon, everyone. As...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
1610982,46682,4,"Thanks, Kelly. As you can tell by our comments...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
4020381,46682,5,[Operator Instructions] And our first question...,2010-01-04,030247,Viasystems Corporation,Question and Answer Operator Message
...,...,...,...,...,...,...,...
3620930,2164922,40,[Foreign Language],2020-12-30,161925,China Finance Online Co. Limited,Answer
3620861,2164922,41,Although the Chinese economy is leading the gr...,2020-12-30,161925,China Finance Online Co. Limited,Answer
1281137,2164922,42,[Operator Instructions] As there are no furthe...,2020-12-30,161925,China Finance Online Co. Limited,Question and Answer Operator Message
3620852,2164922,43,"Thank you, everybody, for attending China Fina...",2020-12-30,161925,China Finance Online Co. Limited,Answer


In [None]:
# Count transcript components per calendar year to check coverage.
# The to_datetime call is a safeguard; it should be a no-op if the column is
# already datetime, as it is in the unsorted frame — confirm for the sorted file.
earnings_calls_sorted['mostimportantdateutc'] = pd.to_datetime(earnings_calls_sorted['mostimportantdateutc'])
years = earnings_calls_sorted['mostimportantdateutc'].dt.year
years.value_counts()

mostimportantdateutc
2020    1393501
2019    1240878
2018    1220285
2011    1183355
2012    1147828
2017    1098526
2013    1095738
2014    1073695
2015    1060881
2016    1014520
2010     946689
Name: count, dtype: int64

We have data for every year from 2010 to 2020, with roughly one million transcript components or more per year.

## Sentiment Analysis

In [None]:
# Rebind `reports` to a processor over the sorted earnings-call file (this replaces
# the 10-K instance) and show the loaded frame.
reports = ReportsProcessing('datasets/Predictors/earnings_calls_sorted.parquet', 'clean')
reports.data

Unnamed: 0,transcriptid,componentorder,componenttext,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename
198438,46682,1,"Good afternoon, everyone, and welcome to Merix...",2010-01-04,030247,Viasystems Corporation,Presentation Operator Message
1610990,46682,2,"Thank you. Happy New Year, everyone. In Octobe...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
1610986,46682,3,"Thanks, Mike, and good afternoon, everyone. As...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
1610982,46682,4,"Thanks, Kelly. As you can tell by our comments...",2010-01-04,030247,Viasystems Corporation,Presenter Speech
4020381,46682,5,[Operator Instructions] And our first question...,2010-01-04,030247,Viasystems Corporation,Question and Answer Operator Message
...,...,...,...,...,...,...,...
3620930,2164922,40,[Foreign Language],2020-12-30,161925,China Finance Online Co. Limited,Answer
3620861,2164922,41,Although the Chinese economy is leading the gr...,2020-12-30,161925,China Finance Online Co. Limited,Answer
1281137,2164922,42,[Operator Instructions] As there are no furthe...,2020-12-30,161925,China Finance Online Co. Limited,Question and Answer Operator Message
3620852,2164922,43,"Thank you, everybody, for attending China Fina...",2020-12-30,161925,China Finance Online Co. Limited,Answer


In [None]:
# Clean all transcript components (about 12 minutes for the 12.5M rows, per the progress bar below).
reports.clean_reports()

Cleaning Reports: 100%|██████████| 12475896/12475896 [12:20<00:00, 16845.02reports/s]


In [None]:
# Inspect the cleaned frame: `componenttext` is now `cleaned_text`.
# NOTE(review): components that were only a bracketed marker (e.g. "[Foreign Language]")
# end up with an empty `cleaned_text`; consider dropping them before scoring.
reports.data

Unnamed: 0,transcriptid,componentorder,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename,cleaned_text
198438,46682,1,2010-01-04,030247,Viasystems Corporation,Presentation Operator Message,"good afternoon, everyone, and welcome to merix..."
1610990,46682,2,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thank you. happy new year, everyone. in octobe..."
1610986,46682,3,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thanks, mike, and good afternoon, everyone. as..."
1610982,46682,4,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thanks, kelly. as you can tell by our comments..."
4020381,46682,5,2010-01-04,030247,Viasystems Corporation,Question and Answer Operator Message,and our first question comes from matt sheerin...
...,...,...,...,...,...,...,...
3620930,2164922,40,2020-12-30,161925,China Finance Online Co. Limited,Answer,
3620861,2164922,41,2020-12-30,161925,China Finance Online Co. Limited,Answer,although the chinese economy is leading the gr...
1281137,2164922,42,2020-12-30,161925,China Finance Online Co. Limited,Question and Answer Operator Message,as there are no further questions at this time...
3620852,2164922,43,2020-12-30,161925,China Finance Online Co. Limited,Answer,"thank you, everybody, for attending china fina..."


In [None]:
# Score every transcript component. This is a very long run: the log shows more
# than 6.5 hours elapsed at 95% of 97,468 batches, and the progress output is long
# enough to be truncated by Colab.
# NOTE(review): rows with empty `cleaned_text` are still scored (the saved output
# shows one labelled neutral with score 0.42) — confirm whether they should be excluded.
reports.reports_sentiment_analysis()

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Detecting sentiments:  95%|█████████▍| 92470/97468 [6:30:51<21:37,  3.85it/s][A
Detecting sentiments:  95%|█████████▍| 92471/97468 [6:30:51<21:45,  3.83it/s][A
Detecting sentiments:  95%|█████████▍| 92472/97468 [6:30:52<21:49,  3.81it/s][A
Detecting sentiments:  95%|█████████▍| 92473/97468 [6:30:52<21:40,  3.84it/s][A
Detecting sentiments:  95%|█████████▍| 92474/97468 [6:30:52<21:24,  3.89it/s][A
Detecting sentiments:  95%|█████████▍| 92475/97468 [6:30:52<21:18,  3.91it/s][A
Detecting sentiments:  95%|█████████▍| 92476/97468 [6:30:53<21:20,  3.90it/s][A
Detecting sentiments:  95%|█████████▍| 92477/97468 [6:30:53<21:25,  3.88it/s][A
Detecting sentiments:  95%|█████████▍| 92478/97468 [6:30:53<21:29,  3.87it/s][A
Detecting sentiments:  95%|█████████▍| 92479/97468 [6:30:53<21:43,  3.83it/s][A
Detecting sentiments:  95%|█████████▍| 92480/97468 [6:30:54<21:27,  3.87it/s][A
Detecting sentime

Sentiment Analysis Report:
	- Positive: 1728592 tweets (Avg. score: 0.76)
	- Neutral: 9875256 tweets (Avg. score: 0.84)
	- Negative: 872048 tweets (Avg. score: 0.76)


In [None]:
# Persist the scored earnings-call frame.
# NOTE(review): output location and file name are decided inside ReportsProcessing — confirm there.
reports.save_clean_data()

In [None]:
# Read the saved earnings-call sentiment file back to check its contents (rebinds `df`).
df = pd.read_parquet('./clean/sentiment_earning_calls.parquet')

In [None]:
# Inspect the saved result: the index has been reset to 0..N-1, and `sentiment` /
# `sentiment_score` are present for every component.
df

Unnamed: 0,transcriptid,componentorder,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename,cleaned_text,sentiment,sentiment_score
0,46682,1,2010-01-04,030247,Viasystems Corporation,Presentation Operator Message,"good afternoon, everyone, and welcome to merix...",neutral,0.798825
1,46682,2,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thank you. happy new year, everyone. in octobe...",negative,0.792872
2,46682,3,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thanks, mike, and good afternoon, everyone. as...",positive,0.952724
3,46682,4,2010-01-04,030247,Viasystems Corporation,Presenter Speech,"thanks, kelly. as you can tell by our comments...",positive,0.891024
4,46682,5,2010-01-04,030247,Viasystems Corporation,Question and Answer Operator Message,and our first question comes from matt sheerin...,neutral,0.922422
...,...,...,...,...,...,...,...,...,...
12475891,2164922,40,2020-12-30,161925,China Finance Online Co. Limited,Answer,,neutral,0.424185
12475892,2164922,41,2020-12-30,161925,China Finance Online Co. Limited,Answer,although the chinese economy is leading the gr...,positive,0.906605
12475893,2164922,42,2020-12-30,161925,China Finance Online Co. Limited,Question and Answer Operator Message,as there are no further questions at this time...,neutral,0.937188
12475894,2164922,43,2020-12-30,161925,China Finance Online Co. Limited,Answer,"thank you, everybody, for attending china fina...",neutral,0.730117
