# word2vec Sentiment Analysis

## Section 1: Load data

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from joblib import dump
import os
from joblib import load
import gensim
import nltk


In [None]:
import matplotlib as mpl
import matplotlib.ticker as plticker

# Edit the font, font size, and axes width
mpl.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 14
plt.rcParams['axes.linewidth'] = 2

mpl.rcParams['figure.figsize'] = [8.0, 8.0]
mpl.rcParams['figure.dpi'] = 120
mpl.rcParams['savefig.dpi'] = 120

In [None]:
# Load Semeval data
df_semeval = pd.read_json("../01_Data/05_Semeval/Headline_Trainingdata.json")
df_semeval_val = pd.read_json("../01_Data/05_Semeval/Headline_Trialdata.json")
#df_semeval_test = pd.read_json("../01_Data/05_Semeval/Headline_Testdata.json")
df_semeval.head()

In [None]:
print(f"train: {df_semeval.shape},val: {df_semeval_val.shape}")


In [None]:
# Append validation to semeval training data.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0; like append, it keeps the original row labels.
df_semeval = pd.concat([df_semeval, df_semeval_val])
print(f"train: {df_semeval.shape}")

In [None]:
df_semeval["Sentiment_cat"]=["positive" if x>0 else "negative" for x in df_semeval["sentiment"]]
df_semeval.head()

In [None]:
df_semeval["Sentiment_cat"].value_counts()

In [None]:
# Load the rise and Fall dataset
df_RiseFall =pd.read_excel("../01_Data/06_Rise_Fall_News/1_News_Fullset_Risefall.xlsx",index_col=0)

df_RiseFall.head()

In [None]:
# Check how many records are positive and negative
df_RiseFall["target_sentiment"].value_counts(normalize=True)

## Section 2: Train - Test set split

In [None]:
# split the enlarged dataset 
docs_train, docs_test, y_train, y_test = train_test_split(df_semeval.title, df_semeval["Sentiment_cat"], 
                                                          test_size=0.2, random_state=7)

In [None]:
y_test.value_counts()

In [None]:
# create copies later used for entity recognition
docs_train_orig = docs_train.copy()
docs_test_orig = docs_test.copy()

In [None]:
"""# uncomment to use Rise and Fall News sample 
docs_train, docs_test, y_train, y_test = train_test_split(df_RiseFall.Title, df_RiseFall.target_sentiment, 
                                                          test_size=0.2, random_state=7)"""

In [None]:
# Implement stratified sampling !?
"""from sklearn.model_selection import StratifiedShuffleSplit

stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=7)

train_index, test_index = list(stratified_splitter.split(df_RiseFall, df_RiseFall["target_sentiment"]))[0]
trainset = df.loc[train_index]
testset = df.loc[test_index]"""

In [None]:
print(f"instances: train: {len(docs_train)}; test: {len(docs_test)}")

## Section 3: Data transformation

### Section 3.1:Tokenization


In [None]:
#nltk.download('punkt')
#nltk.download('stopwords')

# only tokenize without lower casing
docs_train = [nltk.word_tokenize(line)for line in docs_train.values]
print(docs_train)

In [None]:
# transform test data
docs_test = [nltk.word_tokenize(line) for line in docs_test.values]
docs_test[:5]

In [None]:
"""from gensim.utils import simple_preprocess

# Tokenize the title of each articles, including lower casing
docs_train = [simple_preprocess(line, deacc=True) for line in docs_train.values]
print(docs_train)"""

Skipping lower casing yields better results.

In [None]:
"""# transform test data
docs_test = [simple_preprocess(line, deacc=True) for line in docs_test.values]
docs_test"""

### Section 3.2: Preprocessing
Removal of special characters, numbers, Stop-words etc. 

In [None]:
# https://mlwhiz.com/blog/2019/01/17/deeplearning_nlp_preprocess/
# https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

import re
from nltk.corpus import stopwords

def preprocessor(token):
    """Normalise a single token before the embedding look-up.

    Steps, in order:
    1. Delete every character that is not an ASCII letter or digit.
    2. Mask digit runs with '#': a run of five or more digits becomes
       '#####', a shorter run becomes one '#' per digit.
    3. Blank out the token if it is one of the manually chosen stop words.

    :param token: one token (string) as produced by the tokenizer
    :return: the cleaned token; '' if nothing is left or it is a stop word
    """
    # The class has to be spelled 'A-Za-z': the range 'A-z' also spans the
    # ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would
    # otherwise survive the removal of special characters.
    text = re.sub(r'[^A-Za-z0-9]', '', token)

    # mask numbers: one '#' per digit, capped at five
    text = re.sub(r'[0-9]+', lambda run: '#' * min(len(run.group()), 5), text)

    # remove stop words (small manual list; the comparison is case-sensitive)
    # alternative: set(stopwords.words('english'))
    stop_words = {"to", "of", "and", "a"}
    if text in stop_words:
        return ''

    return text

NLTK Stopwords removal reduces accuracy.

In [None]:
# apply preprocessor
docs_train = [[preprocessor(word) for word in tokens] for tokens in docs_train]

# drop empty strings
docs_train = [[(word) for word in tokens if word] for tokens in docs_train]
print(docs_train[:5])

In [None]:
# clean every test token and keep only those that are still non-empty afterwards
docs_test = [
    [cleaned for cleaned in map(preprocessor, headline) if cleaned]
    for headline in docs_test
]

### Section 3.3: Entity recognition and removal


In [None]:
#import spacy
#nlp = spacy.load('en_core_web_sm')

In [None]:
"""# remove entities

train_entitites=[]

# loop through train documents and create nlp objects
for sentence in docs_train_orig:
    sen = nlp(sentence)
    
    # create list of entity strings
    train_entitites.append([word.text for word in sen.ents])

# flatten entity list of lists
flat_train_entities = [item for sublist in train_entitites for item in sublist]
    
#print(flat_train_entities[:10])

# filter out entities
docs_train = [[word if word not in flat_train_entities else '' for word in sentences] for sentences in docs_train]
docs_train[:2]  

# drop empty strings
docs_train = [[(word) for word in tokens if word] for tokens in docs_train]
print(docs_train[:3])"""

Only marginally improves accuracy and is computationally very expensive to process on real data.
Thus, named entity recognition is not included in the final implementation.

In [None]:
"""import spacy
nlp = spacy.load('en_core_web_sm')

# loop through train documents and create nlp objects
for sentence in docs_train[:3]:
    sen = nlp(sentence)
    
    # create list of entity strings
    entitites = [word.text for word in sen.ents]
    
    print(entitites)

"""

"""#nltk.download('averaged_perceptron_tagger')

# Tag each token
docs_train2 = [nltk.pos_tag(word)for word in docs_train]
docs_train2[0:5]

#nltk.download('maxent_ne_chunker')
nltk.download('words')

docs_train3 = [nltk.chunk.ne_chunk(sentence) for sentence in docs_train2]
docs_train3"""

In [None]:
"""# apply for test

# remove entities

test_entitites=[]

# loop through train documents and create nlp objects
for sentence in docs_test_orig:
    sen = nlp(sentence)
    
    # create list of entity strings
    test_entitites.append([word.text for word in sen.ents])

# flatten entity list of lists
flat_test_entities = [item for sublist in test_entitites for item in sublist]
    

# filter out entities
docs_test = [[word if word not in flat_train_entities else '' for word in sentences] for sentences in docs_test]
docs_test[:2]  

# drop empty strings
docs_test = [[(word) for word in tokens if word] for tokens in docs_test]"""

Only marginal improvement from removing named entities. The reason for this is probably that entities are searched for in the original doc, which contains "noise" that is subsequently removed in the preprocessing. This could mean that the filtering does not work very well.

### Section 3.4: Stemming / Lemmatisation

Performs worse with stemming.

In [None]:
"""from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Get the stemmed_tokens
docs_train = [[porter_stemmer.stem(word) for word in tokens] for tokens in docs_train]
docs_train"""

In [None]:
"""# transform test data 2
docs_test = [[porter_stemmer.stem(word) for word in tokens] for tokens in docs_test]
docs_test"""

### Section 3.6 Train word2vec model

In [None]:
from gensim.models import Word2Vec

In [None]:
"""# run to train own model

# adopted, taken from from https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca
path_word2vec_model = '0_models/31_word2vec_.model'


# Train the Word2Vec Model
model = Word2Vec(docs_train, min_count = 1, size = 1000,\
                     workers = 3, window = 3, sg = 1)

# store the model
model.save(path_word2vec_model)

# Load the model from the model file
model = Word2Vec.load(path_word2vec_model)"""

In [None]:
# run to use the trained model by Google

# https://code.google.com/archive/p/word2vec/
# https://github.com/RaRe-Technologies/gensim-data
# Tutorial: https://towardsdatascience.com/using-word2vec-to-analyze-news-headlines-and-predict-article-success-cdeda5f14751

# Load word2vec model (trained on an Google news corpus)
model = gensim.models.KeyedVectors.load_word2vec_format('0_models/31_word2vec_GoogleNews-vectors-negative300.bin', binary = True) 

# Check dimension of word vectors
model.vector_size

In [None]:
# create dictionary of vocabulary that counts occurrences
# source: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: accepted but not used by this function
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    for tokens in sentences:
        for token in tokens:
            # start at 0 for a word seen for the first time
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
vocab= build_vocab(docs_train)
vocab

In [None]:
# check % of words included in embedding
# source: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

import operator 

def check_coverage(vocab,model):
    """Report how much of the vocabulary is covered by the embedding.

    Prints the share of distinct words and the share of all word
    occurrences for which the look-up ``model[word]`` succeeds.

    :param vocab: dictionary mapping each word to its occurrence count
    :param model: embedding that supports ``model[word]`` and raises
                  KeyError for words it does not contain
    :return: list of (word, count) pairs of the words without an embedding,
             sorted by count in descending order
    """
    oov = {}
    known_words = 0
    known_tokens = 0
    unknown_tokens = 0
    for word, count in vocab.items():
        try:
            model[word]
        except KeyError:
            # only a missing key means "out of vocabulary"; any other
            # error is a real problem and must not be swallowed
            oov[word] = count
            unknown_tokens += count
        else:
            known_words += 1
            known_tokens += count

    # guard the ratios so that an empty vocabulary reports 0% instead of
    # raising ZeroDivisionError
    total_tokens = known_tokens + unknown_tokens
    vocab_share = known_words / len(vocab) if vocab else 0.0
    text_share = known_tokens / total_tokens if total_tokens else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # ascending sort followed by a reversal (keeps the original tie order)
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [None]:
oov = check_coverage(vocab,model)

without stop words removal:
94.3% of vocab
91.5% of words

with stop words removal:
92.8% of vocab
96.0% of words


with manual list of stop words removal:
93.0% of vocab
96.5% of words

with manual list of stop words removal and entity removal:
93.5% of vocab
96.6% of words

In [None]:
# print words missing in the embedding sorted by frequency
oov

In [None]:
# Unique ID of the word
word="rise"
print("Index of the word:"+ word)
# `model` is a KeyedVectors object, so its vocabulary is read directly
# (as in the averaging cells below) instead of through the `.wv` alias
print(model.vocab[word].index)

# Total number of the words 
print(len(model.vocab))

# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")

print(len(model[word]))

# Get the mean for the vectors for an example review
#print("Print the length after taking average of all word vectors in a sentence:")
#print(np.mean([model[token] for token in docs_train[24]], axis=0))

In [None]:
# look-up some similar words
#model.most_similar(word)

In [None]:
docs_train[0]

In [None]:
# Transform training data

# one entry per news headline: the mean over the embeddings of all tokens
# that are part of the model's vocabulary
X_train = [
    np.mean([model[token] for token in headline if token in model.vocab], axis=0)
    for headline in docs_train
]

In [None]:
len(X_train)

In [None]:
# flag the documents for which no token had an embedding: np.mean over an
# empty list yields a single NaN scalar (size 1) instead of a full vector,
# so size == 1 is what marks a document as empty
is_empty = [x.size == 1 for x in X_train]
is_empty[0:5]

In [None]:
len(is_empty)

In [None]:
y_train.shape

In [None]:
# https://github.com/linanqiu/word2vec-sentiments/blob/master/word2vec-sentiment.ipynb
# source for doc2vec

In [None]:
# Transform test data

# one entry per test headline: the mean over the embeddings of all tokens
# that are part of the model's vocabulary
X_test = [
    np.mean([model[token] for token in headline if token in model.vocab], axis=0)
    for headline in docs_test
]

## Section 4: Train models

### Section 4.1 Baseline model

In [None]:
# Majority baseline
y_train.value_counts()

In [None]:
# Generate majority baseline predictions: one label per training instance.
# The label is derived from the data instead of being hard-coded, so the
# baseline stays valid when the training set (or its label names) changes;
# the array is 1-D to match the shape of y_train.
majority_label = y_train.value_counts().idxmax()
y_pred_basel = np.full(len(y_train), majority_label)
y_pred_basel

In [None]:
# Calculate f-score
f1_score(y_train,y_pred_basel,average='macro')

### Section 4.2 SVM - Linear

#### Section 4.2.1  Gridsearch

In [None]:
lsvm = LinearSVC()

# specify the hyperparameters and their values
# 8 combinations in the grid
param_grid = {
    'C': [0.5,0.6,0.65,0.69,0.7,0.71,0.75,0.8],
    #    'C': [0.05,0.01,0.02,0.1,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.5,2.0,5,10],
    #'max_iter': [5000],
    'random_state': [7]
}

# we'll use 5-fold cross-validation
grid_search_LSVC = GridSearchCV(lsvm, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search_LSVC.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search_LSVC.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search_LSVC.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search_LSVC.cv_results_["mean_test_score"]
train_scores = grid_search_LSVC.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search_LSVC.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

In [None]:
# obtain the per-split f-scores of the best model

# The cv_results_ arrays hold one entry per parameter combination in grid
# order, so the winning combination sits at best_index_ and not at
# position 0 (which is merely the first combination of the grid).
svm_lin_split_test_scores = [
    grid_search_LSVC.cv_results_[f"split{fold}_test_score"][grid_search_LSVC.best_index_]
    for fold in range(grid_search_LSVC.n_splits_)
]

**Notes:**

** With stemming: **

Best score semeval val set: C=0.1, val: 74.06% and train: 81.6%

Best score semeval own val set: C=0.1, val: 74.2% and train: 81.4%

Best score Rise and Fall: C=10, val: 58.7% and train: 60.16%

** Without stemming: **

Best score semeval own val set: C=0.1, val: 76.0% and train: 82.7%

** Without lower casing and stemming: **

Best score semeval own val set: C=0.1, val: 76.0% and train: 90.5%

** Without lower casing, stemming with removal of numbers and special cases: **

Best score semeval own val set: C=1, val: 76.15% and train: 90.5%

** Without lower casing, stemming with removal of numbers, special cases, stop words : **

Best score semeval own val set: C=0.1, val: 75.5% and train: 83.4%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: C=0.7, val: 76.9% and train: 89.6%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

Best score semeval own val set: C=1, val: 76.32% and train: 91.6%

#### Section 4.2.2 SVM -Linear - Store the best model


In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search_LSVC.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Linear_SVM model.joblib')

#### Section 4.2.3 SVM -Linear - Plot results of SVM classifier on reduced dimensions

In [None]:
# adjusted visualisation from https://towardsdatascience.com/a-practical-guide-to-interpreting-and-visualising-support-vector-machines-97d2a5b0564e
# reduce dimensions
tsvd = TruncatedSVD(n_components=2).fit(X_train)
tsvd_2d = tsvd.transform(X_train)

In [None]:
"""import pylab as pl

for i in range(0, tsvd_2d.shape[0]):
    if y_train[i] == "negative":
        c1 = pl.scatter(tsvd_2d[i,0],tsvd_2d[i,1],c='r',    marker='o')
    elif y_train[i] == "positive":
        c2 = pl.scatter(tsvd_2d[i,0],tsvd_2d[i,1],c='g',    marker='+')

pl.legend([c1, c2], ['negative', 'positive'])
pl.title('SVM illustration')
pl.show()"""

### Section 4.3 SVM - Poly 

#### Section 4.3.1 Gridsearch

In [None]:
poly_svm = SVC(kernel='poly')

# specify the hyperparameters and their values
# 9 x 10 x 1 = 90 combinations in the grid
param_grid = {
    'C': [0.01, 0.1, 1,10,15,18,19,20,25],
    #'C': [1.6,1.65,1.7,1.725,1.75,1.775,1.8,3,4,5,6,7,8,9,10],
    #  'C': [0.01,0.05,0.1,1,1.7,1.8,1.9,2,2.1,2.2,2.3,2.5,3,4,5,6,7,8,9,10],
    'gamma': ["scale", "auto",0.01,0.05, 0.1,0.14,0.15,0.16,0.2,0.5],
    'degree': [2],
    #'degree': [2,3],
    #'max_iter': [5000],
    'random_state': [7]
}

# we'll use 5-fold cross-validation
grid_search_poly_SVC = GridSearchCV(poly_svm, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search_poly_SVC.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search_poly_SVC.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search_poly_SVC.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search_poly_SVC.cv_results_["mean_test_score"]
train_scores = grid_search_poly_SVC.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search_poly_SVC.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: C=19,degree="2", gamma="0.15", val: 76.4% and train: 94.6%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

#### Section 4.3.2 SVM -Poly - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search_poly_SVC.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Poly_SVM model.joblib')

### Section 4.4 SVM - Rbf 

#### Section 4.4.1 Gridsearch

In [None]:
rbf_svm =  SVC(kernel='rbf')

# specify the hyperparameters and their values
# 10 x 10 = 100 combinations in the grid
param_grid = {
    'C': [0.01, 0.1, 1,5, 10,15,19,20,21,25],
    #'C': [1.6,1.65,1.7,1.725,1.75,1.775,1.8,3,4,5,6,7,8,9,10],
    #  'C': [0.01,0.05,0.1,1,1.7,1.8,1.9,2,2.1,2.2,2.3,2.5,3,4,5,6,7,8,9,10],
    'gamma': ["scale", "auto",0.01,0.05,0.09, 0.1,0.11,0.12,0.4,0.5],
    #'max_iter': [5000],
    'random_state': [7]
}

# we'll use 5-fold cross-validation
grid_search_rbf_SVC = GridSearchCV(rbf_svm, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search_rbf_SVC.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search_rbf_SVC.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search_rbf_SVC.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search_rbf_SVC.cv_results_["mean_test_score"]
train_scores = grid_search_rbf_SVC.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search_rbf_SVC.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

In [None]:
# obtain the per-split f-scores of the best model

# The cv_results_ arrays hold one entry per parameter combination in grid
# order, so the winning combination sits at best_index_ and not at
# position 0 (which is merely the first combination of the grid).
svm_rbf_split_test_scores = [
    grid_search_rbf_SVC.cv_results_[f"split{fold}_test_score"][grid_search_rbf_SVC.best_index_]
    for fold in range(grid_search_rbf_SVC.n_splits_)
]

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: C=20,gamma=0.1, val: 77.4% and train: 94.9%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

#### Section 4.4.2 SVM - Rbf - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search_rbf_SVC.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Rbf_SVM model.joblib')

### Section 4.5 SVM - Sigmoid 

#### Section 4.5.1 Gridsearch

In [None]:
sigm_svm =  SVC(kernel='sigmoid')

# specify the hyperparameters and their values
# 14 x 13 = 182 combinations in the grid
param_grid = {
    'C': [0.01, 0.1, 1,5, 10,15,20,30,35,40,41,42,45,50],
    #'C': [1.6,1.65,1.7,1.725,1.75,1.775,1.8,3,4,5,6,7,8,9,10],
    #  'C': [0.01,0.05,0.1,1,1.7,1.8,1.9,2,2.1,2.2,2.3,2.5,3,4,5,6,7,8,9,10],
    'gamma': ["scale", "auto",0.01,0.05,0.09, 0.1,0.11,0.12,0.13,0.2,0.3,0.4,0.5],
    #'max_iter': [5000],
    'random_state': [7]
    }

# we'll use 5-fold cross-validation
grid_search_sigm_SVC = GridSearchCV(sigm_svm, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search_sigm_SVC.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search_sigm_SVC.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search_sigm_SVC.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search_sigm_SVC.cv_results_["mean_test_score"]
train_scores = grid_search_sigm_SVC.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search_sigm_SVC.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: C=26,gamma=, val: 75.4% and train: 88.5%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

#### Section 4.5.2 SVM - Sigmoid - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search_sigm_SVC.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Sigmoid_SVM model.joblib')

### Results: Summary SVM
Linear: 0.768800703862267 0.8958871182923149 {'C': 0.7, 'random_state': 7}

Poly:   0.7635545031027572 0.9458602940788126 {'C': 19, 'degree': 2, 'gamma': 0.15, 'random_state': 7}

Rbf:    0.7738981131931123 0.9491613701359167 {'C': 20, 'gamma': 0.1, 'random_state': 7}

Sigmoid: 0.7589640372372386 0.898002240325126 {'C': 41, 'gamma': 0.1, 'random_state': 7}

Best performance rbf kernel, followed by linear

### Section 4.6 Random Forest 

#### Section 4.6.1 Gridsearch

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

# specify the hyperparameters and their values
# 7 x 4 x 3 = 84 combinations in the grid
param_grid = {
    'n_estimators': [10,50,85,89,90,91,95],
    #'n_estimators': [10,20,30,40,50,60,70,80,85,90,95,300],
    #'n_estimators': [10, 100, 250,275,300],
   # 'max_depth': [3, 5, 15,18,19,20],
    'max_depth': [10,14, 15,16],
    #'min_samples_split': [2,3,4,5,6,10],
    'min_samples_split': [3,4,5],
    'random_state': [7]
}

# we'll use 5-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search.cv_results_["mean_test_score"]
train_scores = grid_search.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

#### Section 4.6.2  Random Forest - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Random Forest_model.joblib')

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: ;max_depth=20, min samples split: 5, n estimators 300, val: 70.8% and train: 98.5%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

### Section 4.7 Decision Tree 

#### Section 4.7.1  Gridsearch

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(random_state=7)

# specify the hyperparameters and their values
# 2 x 11 x 7 x 4 = 616 combinations in the grid
param_grid = {
    
    'criterion':['gini', 'entropy'],
    'max_depth': [5,10,13,14,15,16,17,18,19,20, 30],
    'min_samples_split': [5,7,8,9,10, 20, 50],
    'min_samples_leaf': [1,2,3,5]
}

# we'll use 5-fold cross-validation
grid_search = GridSearchCV(dtree, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search.cv_results_["mean_test_score"]
train_scores = grid_search.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

#### Section 4.7.2  Decision Tree - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Decision Tree model.joblib')

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: max depth > 15 --> overfitting, val:65.3% and train: 96.3%*

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

### Section 4.8 Naive Bayes

#### Section 4.8.1  Gridsearch

In [None]:
from sklearn.naive_bayes import GaussianNB

# no hyperparameters to tune, but use GridSearch for comparability of results

gaussNB = GaussianNB()

# specify the hyperparameters and their values
param_grid = {}

# we'll use 5-fold cross-validation
grid_search = GridSearchCV(gaussNB, param_grid, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True) 

start = time.time()
grid_search.fit(X_train, y_train)
end = time.time() - start
print(f"Took {end} seconds")

In [None]:
# Print the set of best hyperparameters
grid_search.best_estimator_

In [None]:
# Print the corresponding f-score
grid_search.best_score_

In [None]:
# Print the results of all tested models
val_scores = grid_search.cv_results_["mean_test_score"]
train_scores = grid_search.cv_results_["mean_train_score"]
params = [str(x) for x in grid_search.cv_results_["params"]]

for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print(val_score, train_score, param)

#### Section 4.8.2  Naive Bayes - Store the best model

In [None]:
# Store model
# create the folder where all trained models are kept (no-op if it already exists)
os.makedirs("0_models", exist_ok=True)

dump(grid_search.best_estimator_, '0_models/32_word2vec_Sentiment Analysis_Gaussian NB model.joblib')

**Notes:**

** Without lower casing, stemming with removal of numbers, special cases, manual stop words : **

Best score semeval own val set: standard settings, val: 68.1% and train: 74.4%

** Without lower casing, stemming with removal of numbers, special cases, manual stop words and named entity removal: **

Rand. Forest: 0.728244519135879 0.9851823501222633 {'max_depth': 15, 'min_samples_split': 4, 'n_estimators': 90, 'random_state': 7}

Decision Tree: 0.6542735047016043 0.9595464543787842 {'criterion': 'gini', 'max_depth': 13, 'min_samples_leaf': 2, 'min_samples_split': 8}

Naive Bayes: 0.6812015083736931 0.7435447202399154 {}

### Section 4.9 Test performance differences of best performing models


In [None]:
from scipy.stats import ttest_ind
# return the t-score and a two-tailed p-value
ttest_ind(svm_lin_split_test_scores, svm_rbf_split_test_scores)

## Section 5: Evaluate the two most promising models on the test data

### Section 5.1 Load the best models and calculate F1-Scores

In [None]:
# load the best model
best_linear_svm = load("0_models/32_word2vec_Sentiment Analysis_Linear_SVM model.joblib")

# use the best model to make predictions on the test set
y_hat = best_linear_svm.predict(X_test)

# Print macro-averaged precision, recall and f-score
p, r, f, s = precision_recall_fscore_support(y_test, y_hat, average="macro")
print("Support Vector Machines: Linear")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F score: {f}")

In [None]:
# load the best model
best_model = load("0_models/32_word2vec_Sentiment Analysis_Rbf_SVM model.joblib")

# use the best model to make predictions on the test set
y_hat = best_model.predict(X_test)

# Print macro-averaged precision, recall and f-score
p, r, f, s = precision_recall_fscore_support(y_test, y_hat, average="macro")
print("Support Vector Machines: Rbf")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F score: {f}")

In [None]:
# Print averaged precision, recall and f-score for each class
p, r, f, s = precision_recall_fscore_support(y_test, y_hat, average=None)
print("Support Vector Machines: Rbf: Classes")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F score: {f}")
print(f"Support: {s}")

### Section 5.2 Print confusion matrix

In [None]:
conf_matrix=plot_confusion_matrix(best_model, X_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize='true',
                                 values_format=".1%"
                     )

In [None]:
# store figure to image
conf_matrix.figure_.savefig('Confusion_matrix_test.png', bbox_inches='tight')

**Results:**

Without lower casing, stemming and with removal of numbers and special characters:

Google Corpus News + Semeval own val
Val:neg-neg: 0.65, pos-pos:0.89
Train: neg-neg: 0.91, pos-pos:0.97

Without lower casing, stemming and with removal of numbers, special characters and named entities:



## Section 6: Classify the real data 

### Section 6.1: Load data, do all steps of preprocessing and apply the trained model

In [None]:
# Load word2vec model (trained on an Google news corpus)
model = gensim.models.KeyedVectors.load_word2vec_format('0_models/31_word2vec_GoogleNews-vectors-negative300.bin', binary = True) 

# load the best model
best_model = load("0_models/32_word2vec_Sentiment Analysis_Rbf_SVM model.joblib")

# https://mlwhiz.com/blog/2019/01/17/deeplearning_nlp_preprocess/
# https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

import re
from nltk.corpus import stopwords

# Everything except ASCII letters and digits is stripped from a token.
# The range must be written "A-Za-z": the former "A-z" also covered the
# characters [ \ ] ^ _ ` which sit between "Z" and "a" in ASCII.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')

# A maximal run of digits; used to mask numbers with '#'.
_DIGIT_RUN_RE = re.compile(r'[0-9]+')

# Small hand-picked stop word list (the full NLTK list is deliberately not used).
#stopWords = set(stopwords.words('english'))
_STOP_WORDS = frozenset(["to", "of", "and", "a"])


def preprocessor(token):
    """Clean a single token before it is looked up in the word2vec model.

    The steps are applied in this order:

    1. Remove every character that is not an ASCII letter or digit.
    2. Mask numbers: each run of digits is replaced by one '#' per digit,
       capped at five ('7' -> '#', '2019' -> '####', '1234567' -> '#####').
    3. Drop the stop words "to", "of", "and" and "a" (case-sensitive match).

    Spelling correction and contraction handling are intentionally not done
    (misspellings are unlikely in newswire headlines).

    NOTE(review): any preprocessing applied to the training data should use
    the same rules - confirm the training-time preprocessor matches.

    Args:
        token: The raw token (str).

    Returns:
        The cleaned token, or '' when nothing is left or the token is a
        stop word. Callers are expected to drop empty strings.
    """
    text = _NON_ALNUM_RE.sub('', token)

    # One pass replaces the former cascade of five substitutions: runs of
    # five or more digits collapse to '#####', shorter runs keep their length.
    text = _DIGIT_RUN_RE.sub(lambda match: '#' * min(len(match.group()), 5), text)

    if text in _STOP_WORDS:
        return ''

    return text

In [None]:
#path="../01_Data/01_Eikon/1_Headlines/3_WRDS_SP 500 Headlines/2019_03_01_to_2020_04_30_Headlines_SP500.csv"
#path="../01_Data/01_Eikon/1_Headlines/4_WRDS_SP 500 Headlines completed/2019_03_01_to_2020_06_30_Headlines_SP500.csv"

path="../01_Data/01_Eikon/1_Headlines/4_WRDS_SP 500 Headlines completed/2019_03_01_to_2020_06_30_Headlines_SP500_filt.csv"


df_sp500hl = pd.read_csv(path)
df_sp500hl.head(3)

In [None]:
df_sp500hl.shape

In [None]:
# Rename columns

# Dictionary of columns to replace
dict_rep= {"versionCreated": "DateTime",
           "versionCreated.1": "Date"}

# Replace column names 
df_sp500hl.rename(columns = dict_rep, inplace = True) 

#df_sp500_scores.Date= pd.to_datetime(df_sp500_scores.Date).date
df_sp500hl.set_index("Date",inplace=True)


df_sp500hl.head(3)

In [None]:
df_sp500hl.shape

In [None]:
# load relevant columns into own dataframe
docs_real=df_sp500hl["text"][:]
docs_real_orig = docs_real.copy()

In [None]:
# transform real data
docs_real = [nltk.word_tokenize(line) for line in docs_real.values]
docs_real[:2]

In [None]:
# count words per headline
list_words_per_hl=[len(x) for x in docs_real]
list_words_per_hl

In [None]:
# return average number of word per headline
np.mean(list_words_per_hl)

In [None]:
# return stdev 
np.std(list_words_per_hl)

In [None]:
# Flag every headline that contains at least one stock-related keyword
# (one boolean per headline; summing the list gives the number of such headlines)
count_stock_rel_news = [
    not {"share", "shares", "stock", "equity", "equities"}.isdisjoint(tokens)
    for tokens in docs_real
]

In [None]:
# display number of headlines containin one of the above words
sum(count_stock_rel_news)

In [None]:
# Sanity check of the keyword filter on a small hand-made corpus
z = [["moin","eikon","mama","test"],["equity"],["Hallo", "wie","gehts"],["shares", "wie","gehts"]]

# A document is flagged when it shares at least one token with the keyword set
test = [
    not {"share", "shares", "stock", "equity", "equities"}.isdisjoint(tokens)
    for tokens in z
]
test

In [None]:
# Run every token through the preprocessor and keep only the tokens that
# are still non-empty afterwards (the preprocessor returns '' for dropped tokens)
docs_real = [
    [cleaned for cleaned in map(preprocessor, tokens) if cleaned]
    for tokens in docs_real
]
print(docs_real[:2])

In [None]:
# count words per headline
list_words_per_hl=[len(x) for x in docs_real]
list_words_per_hl

In [None]:
np.mean(list_words_per_hl)

In [None]:
"""# remove entities

real_entitites=[]

# loop through train documents and create nlp objects
for sentence in docs_real_orig:
    sen = nlp(sentence)
    
    # create list of entity strings
    real_entitites.append([word.text for word in sen.ents])

# flatten entity list of lists
flat_real_entitites = [item for sublist in real_entitites for item in sublist]
    
#print(flat_train_entities[:10])

# filter out entities
docs_real = [[word if word not in flat_real_entitites else '' for word in sentences] for sentences in docs_real]
docs_real[:2]  

# drop empty strings
docs_real = [[(word) for word in tokens if word] for tokens in docs_real]
print(docs_real[:3])"""

In [None]:
# Represent each headline by the element-wise mean of the word vectors of
# those tokens that are part of the word2vec vocabulary.
# A headline without any known token results in np.mean([]), i.e. a scalar nan.
X_real = [
    np.mean([model[token] for token in tokens if token in model.vocab], axis=0)
    for tokens in docs_real
]

In [None]:
len(X_real)

In [None]:
# Build a keep-mask over the document vectors.
# NOTE: despite its name, is_empty is True for documents that DO have a vector
# and False for entries whose size is 1 (the scalar produced when no token of
# the headline was found in the model). It is used as a "keep" mask afterwards.
is_empty=[False if x.size==1 else True for i,x in enumerate(X_real)]
is_empty[2864:2867]

In [None]:
# filter out the documents without words in the model
from itertools import compress
X_real=list(compress(X_real, is_empty))
len(X_real)

In [None]:
# Keep only the headlines for which a document vector exists
# (is_empty is True for exactly those rows).
# .copy() makes the result an independent frame, so the sentiment columns that
# are assigned to it afterwards do not trigger pandas' SettingWithCopyWarning.
df_sp500hl=df_sp500hl[is_empty].copy()
df_sp500hl.shape

In [None]:
len(df_sp500hl)

In [None]:
# use the best model to make predictions on the real set
df_sp500hl["Sentiment"] = best_model.predict(X_real)
df_sp500hl.head()

In [None]:
# Create separate indicator columns from the predicted label:
#   Sent_pos:  1 for "positive", otherwise 0
#   Sent_neg: -1 for "negative", otherwise 0
#   Sent_abs:  1 for "positive", otherwise -1
df_sp500hl["Sent_pos"] = [1 if x=="positive" else 0  for x in df_sp500hl["Sentiment"]]
df_sp500hl["Sent_neg"] = [-1 if x=="negative" else 0 for x in df_sp500hl["Sentiment"]]
df_sp500hl["Sent_abs"] = [1 if x=="positive" else -1 for x in df_sp500hl["Sentiment"]]

df_sp500hl.head()

In [None]:
# Source about switching from LinearSVC to SVC to obtain actual probabilities
# https://stackoverflow.com/questions/26478000/converting-linearsvcs-decision-function-to-probabilities-scikit-learn-python
# The decision-function value is used as a confidence score instead, because probabilities
# are computationally intensive and their calculation has drawbacks / is criticised in the documentation

df_sp500hl["Sent_conf_abs"]=best_model.decision_function(X_real)
df_sp500hl.head()

In [None]:
# Split the signed confidence into a positive and a negative part:
# each column keeps the confidence when it has the respective sign, otherwise 0
df_sp500hl["Sent_conf_pos"] = [x if x>0 else 0  for x in df_sp500hl["Sent_conf_abs"]]
df_sp500hl["Sent_conf_neg"] = [x if x<0 else 0 for x in df_sp500hl["Sent_conf_abs"]]
df_sp500hl

In [None]:
# Print most confident positive news
df_sp500hl.nlargest(10,"Sent_conf_abs")

In [None]:
# Print most confident negative news
df_sp500hl.nsmallest(10,"Sent_conf_abs")

In [None]:
# create sentiment scores for confident classifications only

threshold= 1

df_sp500hl["Sent_pos_filt"] = [1 if x>=threshold else 0 for x in df_sp500hl["Sent_conf_abs"]]
df_sp500hl["Sent_neg_filt"] = [-1 if x<=-threshold else 0 for x in df_sp500hl["Sent_conf_abs"]]
df_sp500hl["Sent_abs_filt"] = df_sp500hl["Sent_pos_filt"]+df_sp500hl["Sent_neg_filt"]

df_sp500hl.head()

In [None]:
df_sp500hl.Sent_pos.sum()

In [None]:
abs(df_sp500hl.Sent_neg.sum())

In [None]:
df_sp500hl["Sent_pos_filt"].sum()

In [None]:
# Store
df_sp500hl.nlargest(500,"Sent_conf_abs").to_excel("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval_Headlines most postive.xlsx")
df_sp500hl.nsmallest(500,"Sent_conf_abs").to_excel("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis__Semeval_Headlines most negative.xlsx")
df_sp500hl.to_csv("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval__Headlines complete incl score and confidence.csv")

### Summary Statistics / Analysis of Headlines

In [None]:
# Load classified headlines from csv
df_sp500hl=pd.read_csv("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval__Headlines complete incl score and confidence.csv")

In [None]:
# show header
df_sp500hl.head()

In [None]:
# show descriptive statistics of confidence
df_sp500hl.Sent_conf_abs.describe()

In [None]:
df_sp500hl.Sent_conf_abs.kurtosis()

In [None]:
df_sp500hl.Sent_conf_abs.skew()

In [None]:
# show histogram of confidence
df_sp500hl.Sent_conf_abs.hist()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize':(10,5), 'figure.dpi':100})

fig, ax = plt.subplots()


# Hide the top and right spines of the axis
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.xticks(np.arange(-6,#min(df_sp500hl.Sent_conf_abs), 
                    9,# max(df_sp500hl.Sent_conf_abs)+1
                      1.0))

ax.get_yaxis().set_major_formatter(
mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

# Plot Histogram on x
x = df_sp500hl.Sent_conf_abs
plt.hist(x, bins=50)
plt.gca().set(#title='Histogram', 
              ylabel='Frequency',xlabel="Sentiment Confidence")


# store to image
plt.savefig('Hist_sentiment_Real.png',dpi=300, transparent=False, bbox_inches='tight')

In [None]:
# show density curve /distribution of confidence
#df_sp500hl.Sent_conf_abs.plot.kde()

In [None]:
from scipy.stats import gaussian_kde
density = gaussian_kde(df_sp500hl.Sent_conf_abs)
xs = np.linspace(-5,5,200)
#density.covariance_factor = lambda : .25
#density._compute_covariance()
plt.figure(figsize=(10,5))
plt.plot(xs,density(xs))

# Add the x and y-axis labels
plt.xlabel('Sentiment Confidence')
plt.ylabel('Density')


# Hide the top and right spines of the axis
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

#loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals
#ax.xaxis.set_major_locator(loc)
plt.xticks(np.arange(-4,#min(df_sp500hl.Sent_conf_abs), 
                    5,# max(df_sp500hl.Sent_conf_abs)+1
                      1.0))

# store to image
plt.savefig('KDE_plot_sentiment_Real.png',dpi=300, transparent=False, bbox_inches='tight')

# display
plt.show()

In [None]:
# filter out irrelevant news, duplicated news (same score, same company, same day)

### Section 6.2: Transform news sentiment to daily measures of firm-specific sentiment

In [None]:
# Aggregate the classified headlines into one row per date and company (RIC)

# Define fields for grouping
group_list= ["Date",
             "RIC"
             ]

# Define components to compute target variable
dict_aggregations= {"Sent_abs": "sum", # = overall sentiment score
                    "Sent_neg": "sum", # = minus the number of negative news per day
                    "Sent_pos": "sum",  # = number of positive news per day
                    "Sent_abs_filt": "sum", # = overall sentiment score filtered
                    "Sent_neg_filt": "sum", # = minus the number of negative news per day filtered
                    "Sent_pos_filt": "sum",  # = number of positive news per day filtered
                    "Sent_conf_abs": "sum", # = sentiment score based on confidence
                    "Sent_conf_pos": "sum",
                    "Sent_conf_neg": "sum",
                    "Sentiment": "count" # = number of headlines published
                   }

# Calculate daily scores for each company.
# groupby() already returns the group keys ("Date", "RIC") as a MultiIndex, so
# the former reset_index()/set_index() round trip was redundant.
df_sp500_scores = df_sp500hl.groupby(group_list).agg(dict_aggregations)

# "Sentiment" was aggregated with "count", so it now holds the news volume
df_sp500_scores.rename(columns = {"Sentiment": "News_vol"}, inplace = True)

df_sp500_scores.head(30)

In [None]:
# Calculate average sentiment
df_sp500_scores["Sent_avg"]=df_sp500_scores["Sent_abs"]/df_sp500_scores["News_vol"]
df_sp500_scores["Sent_avg_filt"]=df_sp500_scores["Sent_abs_filt"]/df_sp500_scores["News_vol"]
df_sp500_scores["Sent_avg_conf"]=df_sp500_scores["Sent_conf_abs"]/df_sp500_scores["News_vol"]
df_sp500_scores["Sent_avg_conf_neg"]=df_sp500_scores["Sent_conf_neg"]/df_sp500_scores["News_vol"]
df_sp500_scores.head()

In [None]:
# calculate ratios between positive and negative
df_sp500_scores["Sent_ratio"]=np.log((1+df_sp500_scores["Sent_pos"])/(1+df_sp500_scores["Sent_neg"]*-1))
df_sp500_scores["Sent_ratio_filt"]=np.log((1+df_sp500_scores["Sent_pos_filt"])/(1+df_sp500_scores["Sent_neg_filt"]*-1))
df_sp500_scores["Sent_ratio_conf"]=np.log((1+df_sp500_scores["Sent_conf_pos"])/(1+df_sp500_scores["Sent_conf_neg"]*-1))


df_sp500_scores.head()

In [None]:
# create a copy to manipulate / for visualisations
df_sp500_scores_vis=df_sp500_scores.copy()
df_sp500_scores_vis=df_sp500_scores_vis.reset_index()
df_sp500_scores_vis

In [None]:
df_sp500_scores_vis.to_csv("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval__Headlines for visualisations.csv")

### Transform data to format needed for regression

In [None]:
# Unstack table
df_sp500_scores=df_sp500_scores.unstack()
df_sp500_scores

In [None]:
df_sp500_scores

In [None]:
# Reduce multi index to single column index
level_one = df_sp500_scores.columns.get_level_values(0).astype(str)
level_two = df_sp500_scores.columns.get_level_values(1).astype(str)
df_sp500_scores.columns = level_one+"_" + level_two 
df_sp500_scores

In [None]:
# Convert index to datetime
df_sp500_scores.index= pd.to_datetime(df_sp500_scores.index).date
df_sp500_scores

In [None]:
# Keep the dates as the index and label that index "Date".
# The previous code called reset_index() without storing the result and then
# renamed a non-existent "index" column, so neither statement had any effect;
# the frame has always kept its date index at this point.
df_sp500_scores.index.name = "Date"
df_sp500_scores

In [None]:
df_sp500_scores.index.name = 'Date'
df_sp500_scores

In [None]:
# store to excel 
df_sp500_scores.to_excel("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval_Daily_firm_specific_sentiment_scores.xlsx")

### Create Visualisation and descriptive statistics

In [None]:
import matplotlib as mpl

# Edit the font, font size, and axes width
mpl.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 14
plt.rcParams['axes.linewidth'] = 2

mpl.rcParams['figure.figsize'] = [8.0, 8.0]
mpl.rcParams['figure.dpi'] = 120
mpl.rcParams['savefig.dpi'] = 120

In [None]:
df_sp500_scores_vis=pd.read_csv("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval__Headlines for visualisations.csv", index_col="Unnamed: 0")
df_sp500_scores_vis

In [None]:
#Filter out Moodys
df_sp500_scores_vis = df_sp500_scores_vis[df_sp500_scores_vis.RIC!="MCO"]

In [None]:
df_sp500_scores_vis

In [None]:
# sort by date
df_sp500_scores_vis=df_sp500_scores_vis.sort_values(by="Date")
df_sp500_scores_vis

In [None]:
# Show data for Apple
#df_sp500_scores_vis[df_sp500_scores_vis.RIC=="AAPL.O"]

### Calculate average sentiment by company

In [None]:
# Calculate average sentiment by company
df_comp_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["RIC"],\
               #values=["Sent_conf_neg"],\
               aggfunc=(np.mean, min, max,sum),fill_value=np.NaN)
df_comp_avg_sent.head()

In [None]:
# store in excel
df_comp_avg_sent.to_excel("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval_Headlines AVG Sentiment by company.xlsx")

### Plot weekly average sentiment overall

In [None]:
df_sp500_scores_vis.columns

In [None]:
df_sp500_scores_vis

In [None]:
df_sp500_scores_vis["Date"]

In [None]:
df_sp500_scores_vis.dtypes

In [None]:
df_sp500_scores_vis["Date"]=pd.to_datetime(df_sp500_scores_vis["Date"])

In [None]:
df_sp500_scores_vis.dtypes

In [None]:
df_sp500_scores_vis['weeknum'] = df_sp500_scores_vis["Date"].apply(lambda x:x.isocalendar()[1])
df_sp500_scores_vis['year'] = df_sp500_scores_vis["Date"].apply(lambda x:x.isocalendar()[0])
df_sp500_scores_vis["Week_display"]=df_sp500_scores_vis['year'].astype(str)+"-"+df_sp500_scores_vis['weeknum'].astype(str)
df_sp500_scores_vis["Week"]=df_sp500_scores_vis['weeknum'].astype(str)
df_sp500_scores_vis

In [None]:
import datetime

In [None]:
df_sp500_scores_vis['ISO'] = df_sp500_scores_vis['year'].astype(str) + '-W' + df_sp500_scores_vis['Week'].astype(str) + '-1'

# Create column that shows first day of week as "Week"
df_sp500_scores_vis['Week'] = df_sp500_scores_vis['ISO'].map(lambda x: datetime.datetime.strptime(x, "%G-W%V-%u"))
df_sp500_scores_vis

In [None]:
#df_sp500_scores_vis["Week"]=df_sp500_scores_vis["Date"].dt.strftime('%Y-%V')
df_sp500_scores_vis["Month"]=df_sp500_scores_vis["Date"].dt.strftime('%Y-%m')
df_sp500_scores_vis.sort_values(by="Week")
df_sp500_scores_vis

In [None]:
df_sp500_scores_vis["Week"].min()

In [None]:
# Show average sentiment per day (the pivot is indexed by "Date"):
# mean of "Sent_avg" over all companies for each date
df_daily_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["Date"],\
               #values=["Sent_avg"],\
              # values=["Sent_avg_filt"],\
               values=["Sent_avg"],\
               aggfunc=[np.mean],fill_value=np.NaN)
# display only: the sorted copy is shown but not stored
df_daily_avg_sent.sort_index()

In [None]:
# Plot the daily average from 3 March 2019 to 29 June 2020 (both inclusive).
# ISO dates are used for the slice: "03.03.2019" / "29.06.2020" relied on
# pandas guessing the day/month order of each string individually.
df_daily_avg_sent.loc["2019-03-03":"2020-06-29"].plot(figsize=(15,5))

## Plot overall sentiment by week

In [None]:
# Show average sentiment per week
df_weekly_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["Week"],\
               values=["Sent_avg_conf"],\
              # values=["Sent_avg"],\
               #values=["Sent_neg_filt","Sent_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

# drop 2 level columns
df_weekly_avg_sent.columns = df_weekly_avg_sent.columns.map('_'.join)


#df_daily_avg_sent.plot(figsize=(15,5))
df_weekly_avg_sent

In [None]:
# Filter out first and last week
#df_weekly_avg_sent=df_weekly_avg_sent.loc["2019-03-03":"2020-06-28"]
df_weekly_avg_sent

In [None]:
#colourWheel =['#00008b','#a6e9ff']
colourWheel =['#00008b','#000000']

#plt.close('all')
fig, ax = plt.subplots(figsize=(12,5))
alphaVal = 1
linethick=2
ax.plot(df_weekly_avg_sent.index,
        df_weekly_avg_sent,
        color="#00008b",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label="AVG Sent",
        alpha=alphaVal)

ax.set_xlabel('')
#ax.yaxis.set_major_formatter(ScalarFormatter())
#ax.yaxis.major.formatter._useMathText = True
#ax.yaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.xaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.yaxis.set_label_coords(0.63,1.01)
#ax.yaxis.tick_right()
plt.xticks(rotation=60)
#fig.autofmt_xdate()

import matplotlib.ticker as plticker
#loc = plticker.MultipleLocator(base=25.0) # this locator puts ticks at regular intervals
#ax.xaxis.set_major_locator(loc)


#nameOfPlot = 'GDP per hour (constant prices, indexed to 2007)'
plt.xlabel("First day of the week",rotation=0)
plt.ylabel("Average Sentiment Confidence",rotation=90)
#ax.legend(frameon=False, title="Decil",loc='lower left',ncol=2,handlelength=2)

# Hide the top and right spines of the axis
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# store to image
plt.savefig('Average Sentiment by week.png',dpi=300, transparent=False, bbox_inches='tight')


plt.show()

In [None]:
# Show average sentiment per week
df_weekly_avg_sent_2=pd.pivot_table(df_sp500_scores_vis,index=["Week"],\
               values=["Sent_avg_conf","Sent_conf_pos","Sent_conf_neg","News_vol"],\
              # values=["Sent_avg"],\
               #values=["Sent_neg_filt","Sent_neg"],\
               aggfunc=[np.mean,"sum"],fill_value=np.NaN)

# drop 2 level columns
df_weekly_avg_sent_2.columns = df_weekly_avg_sent_2.columns.map('_'.join)


#df_daily_avg_sent.plot(figsize=(15,5))
df_weekly_avg_sent_2

In [None]:
# Filter out first and last week
#df_weekly_avg_sent_2=df_weekly_avg_sent_2.loc["2019-03-03":"2020-06-28"]
df_weekly_avg_sent_2

In [None]:
df_weekly_avg_sent_2[df_weekly_avg_sent_2.index=="2019-12-23"]

In [None]:
#df_mean_percentiles_trans[:"29.06.2020"].plot(figsize=(12,8),rot=60)
#colourWheel =['#00008b','#a6e9ff']
colourWheel =['#00008b','#000000']

#plt.close('all')
fig, ax = plt.subplots(figsize=(12,5))
ax2 = ax.twinx()  # set up the 2nd axis

alphaVal = 1
linethick=2

ax2.bar(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.sum_News_vol,
        color="grey",
        alpha=0.2, 
        width=5,
        #color="blue",
        label="News Vol",
        #alpha=alphaVal
       )

ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_avg_conf,
        color="#00008b",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label="Sent AVG Conf",
       # alpha=alphaVal
       )


"""ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_conf_neg,
        color="#000000",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label=series,
        alpha=alphaVal)

ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_conf_pos,
        color="#00000b",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label=series,
        alpha=alphaVal)
"""



#df_weekly_avg_sent_2.sum_News_vol.plot(secondary_y=True)

ax.set_xlabel('')
#ax.yaxis.set_major_formatter(ScalarFormatter())
#ax.yaxis.major.formatter._useMathText = True
#ax.yaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.xaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.yaxis.set_label_coords(0.63,1.01)
#ax.yaxis.tick_right()

#fig.autofmt_xdate()

#nameOfPlot = 'GDP per hour (constant prices, indexed to 2007)'
ax.set_xlabel("Month",rotation=0)
#plt.ylabel("Average Sentiment Confidence",rotation=90)
ax.set_ylabel('Average Sentiment Confidence')
ax2.set_ylabel('Number of News Headlines')

# Set number format for second y axis
ax2.get_yaxis().set_major_formatter(
mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

# store to image
plt.savefig('Average Sentiment and News Vol. by week.png',dpi=300, transparent=False, bbox_inches='tight')


plt.show()

### Add S&P index to chart

In [None]:
df_sp=pd.read_excel("../01_Data/01_Eikon/2_Prices/2_Full Stock Prices data/2_WRDS_SP 500 Full stock price.xlsx",usecols=["Date",".SPX"])
df_sp.head()

In [None]:
df_sp["Date"]=pd.to_datetime(df_sp["Date"])
df_sp['weeknum'] = df_sp["Date"].apply(lambda x:x.isocalendar()[1])
df_sp["Week"]=df_sp['weeknum'].astype(str)
df_sp['year'] = df_sp["Date"].apply(lambda x:x.isocalendar()[0])
df_sp['ISO'] = df_sp['year'].astype(str) + '-W' + df_sp['Week'].astype(str) + '-1'
df_sp

In [None]:
# Create column that shows first day of week as "Week"
df_sp['Week'] = df_sp['ISO'].map(lambda x: datetime.datetime.strptime(x, "%G-W%V-%u"))

# Sort by date
df_sp=df_sp.sort_values(by="Date")
df_sp

# set index
df_sp=df_sp.set_index("Date")
df_sp

In [None]:
# Filter out first and last week
#df_sp=df_sp.loc["2019-03-03":"2020-06-28"]
df_sp

#### Show weekly data

In [None]:
# Weekly mean of the numeric columns, one row per week (first day of the week).
# numeric_only=True skips the string column "ISO" explicitly; current pandas
# versions raise a TypeError instead of silently dropping it.
df_sp2=df_sp.groupby("Week").mean(numeric_only=True)
df_sp2

In [None]:
#df_mean_percentiles_trans[:"29.06.2020"].plot(figsize=(12,8),rot=60)
#colourWheel =['#00008b','#a6e9ff']
colourWheel =['#00008b','#000000']

#plt.close('all')
fig, ax = plt.subplots(figsize=(12,5))
ax2 = ax.twinx()  # set up the 2nd axis

alphaVal = 1
linethick=2

ax2.bar(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.sum_News_vol,
        color="grey",
        alpha=0.2, 
        width=5,
        #color="blue",
        label="News Vol.",
        #alpha=alphaVal
       )

lns1=ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_avg_conf,
        color="#00008b",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label="Average Sentiment",
   
       # alpha=alphaVal
       )

ax3 = ax.twinx() 

lns2=ax3.plot(df_sp2.index,
        df_sp2[".SPX"],
        color="green",
        #color="blue",
        linestyle = '-',
         alpha=.7,
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label="S&P 500 Index",
       # alpha=alphaVal
       )
# hide axis three
ax3.get_yaxis().set_visible(False)


"""ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_conf_neg,
        color="#000000",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label=series,
        alpha=alphaVal)

ax.plot(df_weekly_avg_sent_2.index,
        df_weekly_avg_sent_2.mean_Sent_conf_pos,
        color="#00000b",
        #color="blue",
        linestyle = '-',
        #dashes=dashesStyles[j%len(dashesStyles)],
        lw=linethick,
        label=series,
        alpha=alphaVal)
"""



#df_weekly_avg_sent_2.sum_News_vol.plot(secondary_y=True)

ax.set_xlabel('')
#ax.yaxis.set_major_formatter(ScalarFormatter())
#ax.yaxis.major.formatter._useMathText = True
#ax.yaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.xaxis.set_minor_locator(  AutoMinorLocator(5))
#ax.yaxis.set_label_coords(0.63,1.01)
#ax.yaxis.tick_right()

#fig.autofmt_xdate()

#nameOfPlot = 'GDP per hour (constant prices, indexed to 2007)'
ax.set_xlabel("Month",rotation=0)
#plt.ylabel("Average Sentiment Confidence",rotation=90)
ax.set_ylabel('Average Sentiment Confidence')
ax2.set_ylabel('Number of News Headlines')

# Set number format for second y axis
ax2.get_yaxis().set_major_formatter(
mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

# insert legend
#ax.legend(frameon=True, title="Legend",loc='lower left',ncol=2,handlelength=2)
#plt.legend(loc='lower left')
#fig.legend(loc="lower left")
lns = lns1+lns2
labs = [l.get_label() for l in lns]
ax.legend(lns, labs, frameon=False,
          #title="Legend",\
          loc='lower left',ncol=1,handlelength=1,framealpha=1,facecolor="w")

# store to image
plt.savefig('Average Sentiment and News Vol. and SP Index by week.png',dpi=300, transparent=False, bbox_inches='tight')


plt.show()

### Sentiment per day

In [None]:
# Show average negative sentiment per day
df_daily_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["Date"],\
               values=["Sent_neg_filt"],\
               #values=["Sent_neg"],\
               #values=["Sent_neg_filt","Sent_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

df_daily_avg_sent.plot(figsize=(15,5))

In [None]:
# Show average sentiment confidence per week: mean of "Sent_avg_conf" for each "Week"
# NOTE: the variable is still called df_daily_avg_sent although it is indexed by week here
df_daily_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["Week"],\
               values=["Sent_avg_conf"],\
              # values=["Sent_avg"],\
               #values=["Sent_neg_filt","Sent_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

df_daily_avg_sent.plot(figsize=(15,5))

In [None]:
# Show average negative sentiment per day
df_daily_avg_sent=pd.pivot_table(df_sp500_scores_vis,index=["Date"],\
               values=["Sent_conf_neg"],\
               #values=["Sent_neg"],\
               #values=["Sent_neg_filt","Sent_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

df_daily_avg_sent.plot(figsize=(15,5))

### Plot weekly average sentiment by firm

In [None]:
df_sp500_scores_vis.sort_values(by="Week")

In [None]:
# Show average sentiment per week
df_week_avg_sent_by_comp=pd.pivot_table(df_sp500_scores_vis,index=["RIC"],columns=["Week"],\
               #values=["Sent_avg"],\
               #values=["Sent_avg_filt"],\
               values=["Sent_avg_conf"],\
               #values=["Sent_conf_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

df_week_avg_sent_by_comp.head()

In [None]:
# flatten columns
level_three = df_week_avg_sent_by_comp.columns.get_level_values(2).astype(str)
df_week_avg_sent_by_comp.columns = level_three

In [None]:
labels=[1,2,3,4,5,6,7,8,9,10]
#labels=[1,2,3]

col_list=df_week_avg_sent_by_comp.columns

# Calculate percentiles by week and company
for i in col_list:
    df_week_avg_sent_by_comp[str(i)+"_percentile"]=pd.qcut(df_week_avg_sent_by_comp[i].rank(method='first'),\
                                                      10,
                                                      #[0,0.33,0.66,1.0],\
                                                      #duplicates="drop",\
                                                      labels=labels
                                                     )

df_week_avg_sent_by_comp

In [None]:
df_week_avg_sent_by_comp.to_excel("Test_avg_sent_score_by_week.xlsx")

In [None]:
# For every week, average the sentiment of the companies inside each decile.
# The week's column is selected BEFORE aggregating: the former
# groupby(...).mean()[i] averaged every column of the frame for every week
# (quadratic work) and also tried to average the categorical "_percentile"
# helper columns, which current pandas versions reject.
list_dfs = [
    df_week_avg_sent_by_comp.groupby(str(i) + '_percentile')[i].mean()
    for i in col_list
]

# Kept for backward compatibility: the series of the last processed week
# (an empty DataFrame when there are no week columns at all)
df_week_avg_sent_perc = list_dfs[-1] if list_dfs else pd.DataFrame()

#list_dfs[5]

In [None]:
# append dataframes
df_mean_percentiles = pd.concat(list_dfs,axis=1)

df_mean_percentiles.head()

In [None]:
df_mean_percentiles_trans=df_mean_percentiles.transpose()
df_mean_percentiles_trans


In [None]:
# Filter out first and last week
df_mean_percentiles_trans=df_mean_percentiles_trans.loc["2019-03-03":"2020-06-28"]
df_mean_percentiles_trans

In [None]:
df_mean_percentiles_trans.to_excel("Weekly_average_Sent_conf by Percentile.xlsx")

In [None]:
# Line chart of the weekly average sentiment confidence, one line per decile.
# Deciles 1 and 10 are emphasised (thick, opaque, coloured); all other
# deciles are drawn as thin, semi-transparent grey lines.
colourWheel = ['#00008b', '#000000']

fig, ax = plt.subplots(figsize=(12, 5))
for j, series in enumerate(df_mean_percentiles_trans):
    is_extreme_decile = series == 1 or series == 10
    if is_extreme_decile:
        line_colour = colourWheel[j % len(colourWheel)]
        linethick, alphaVal = 3, 1
    else:
        line_colour = "grey"
        linethick, alphaVal = 1.5, 0.6

    decile_values = df_mean_percentiles_trans[series]
    ax.plot(decile_values.index,
            decile_values,
            color=line_colour,
            linestyle='-',
            lw=linethick,
            label=series,
            alpha=alphaVal)

plt.xticks(rotation=90)

import matplotlib.ticker as plticker

# Hide the top and right spines of the axis
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

plt.ylabel("Average Sentiment Confidence", rotation=90)
ax.legend(frameon=False, title="Decil", loc='lower center', ncol=10, handlelength=1)

# store to image
plt.savefig('Average Sentiment by decil and week.png', dpi=300, transparent=False, bbox_inches='tight')

plt.show()

### Plot monthly average sentiment by percentile and month

In [None]:
# Average sentiment confidence per company (rows: RIC) and month (columns: "Month")
df_month_avg_sent_by_comp=pd.pivot_table(df_sp500_scores_vis,index=["RIC"],columns=["Month"],\
               #values=["Sent_avg"],\
               #values=["Sent_avg_filt"],\
               values=["Sent_avg_conf"],\
               #values=["Sent_conf_neg"],\
               aggfunc=[np.mean],fill_value=np.NaN)

# flatten columns: keep only the third column level as plain strings
# (presumably the month labels, given columns=["Month"] - verify on the output)
level_three = df_month_avg_sent_by_comp.columns.get_level_values(2).astype(str)
df_month_avg_sent_by_comp.columns = level_three

df_month_avg_sent_by_comp.head()

In [None]:
labels=[1,2,3,4,5,6,7,8,9,10]

col_list=df_month_avg_sent_by_comp.columns

# Calculate percentiles by week and company
for i in col_list:
    df_month_avg_sent_by_comp[str(i)+"_decil"]=pd.qcut(df_month_avg_sent_by_comp[i].rank(method='first'),10,\
                                                      #duplicates="drop",\
                                                      labels=labels)

df_month_avg_sent_by_comp.head()

In [None]:
# For every month, average the sentiment of the companies inside each decile.
# The month's column is selected BEFORE aggregating: the former
# groupby(...).mean()[i] averaged every column of the frame for every month
# (quadratic work) and also tried to average the categorical "_decil" helper
# columns, which current pandas versions reject.
list_dfs = [
    df_month_avg_sent_by_comp.groupby(str(i) + '_decil')[i].mean()
    for i in col_list
]

# Kept for backward compatibility: the series of the last processed month
# (an empty DataFrame when there are no month columns at all)
df_month_avg_sent_perc = list_dfs[-1] if list_dfs else pd.DataFrame()

#list_dfs[5]

In [None]:
# append dataframes
df_mean_percentiles = pd.concat(list_dfs,axis=1)

df_mean_percentiles.head()

In [None]:
df_mean_percentiles_trans=df_mean_percentiles.transpose()

In [None]:
# Line chart of the monthly average sentiment confidence, one line per decile.
# Deciles 1 and 10 are emphasised (thick, opaque, coloured); all other
# deciles are drawn as thin, semi-transparent grey lines.
colourWheel = ['#00008b', '#000000']

fig, ax = plt.subplots(figsize=(12, 5))
for j, series in enumerate(df_mean_percentiles_trans[:]):
    is_extreme_decile = series == 1 or series == 10
    if is_extreme_decile:
        line_colour = colourWheel[j % len(colourWheel)]
        linethick, alphaVal = 3, 1
    else:
        line_colour = "grey"
        linethick, alphaVal = 1.5, 0.6

    decile_values = df_mean_percentiles_trans[series]
    ax.plot(decile_values.index,
            decile_values,
            color=line_colour,
            linestyle='-',
            lw=linethick,
            label=series,
            alpha=alphaVal)

ax.set_xlabel('')
plt.xticks(rotation=60)

import matplotlib.ticker as plticker

# Put a major tick at every unit step of the x axis
loc = plticker.MultipleLocator(base=1.0)
ax.xaxis.set_major_locator(loc)

plt.xlabel("Year-Month", rotation=0)
plt.ylabel("Average Sentiment Confidence", rotation=90)
ax.legend(frameon=False, title="Decil", loc='lower center', ncol=10, handlelength=1)
plt.show()

In [None]:
# Mean `Sent_neg_filt` per Date, plotted as a line chart.
df_daily_avg_sent = pd.pivot_table(
    df_sp500_scores_vis,
    index=["Date"],
    # Alternatives that were tried: "Sent_neg", or both "Sent_neg_filt" and "Sent_neg".
    values=["Sent_neg_filt"],
    # String alias instead of np.mean (pandas >= 2.1 warns on NumPy callables).
    aggfunc=["mean"],
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    fill_value=np.nan,
)

df_daily_avg_sent.plot(figsize=(15, 5))

In [None]:
# Mean `Sent_avg_conf` per Week, plotted as a line chart.
# NOTE: the result reuses the name df_daily_avg_sent although it is grouped by Week.
df_daily_avg_sent = pd.pivot_table(
    df_sp500_scores_vis,
    index=["Week"],
    # Alternatives that were tried: "Sent_avg", or both "Sent_neg_filt" and "Sent_neg".
    values=["Sent_avg_conf"],
    # String alias instead of np.mean (pandas >= 2.1 warns on NumPy callables).
    aggfunc=["mean"],
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    fill_value=np.nan,
)

df_daily_avg_sent.plot(figsize=(15, 5))

In [None]:
# Mean `News_vol` per Date, displayed as a table.
# NOTE: the result reuses the name df_daily_avg_sent although it holds news volume.
df_daily_avg_sent = pd.pivot_table(
    df_sp500_scores_vis,
    index=["Date"],
    # Alternatives that were tried: "Sent_ratio", or both "Sent_ratio" and "Sent_ratio_filt".
    values=["News_vol"],
    # String alias instead of np.mean (pandas >= 2.1 warns on NumPy callables).
    aggfunc=["mean"],
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    fill_value=np.nan,
)

# Optional bar chart: df_daily_avg_sent.plot(kind="bar", figsize=(15, 5))
df_daily_avg_sent

### Analyse average daily sentiment 

In [None]:
# Read the news-quantile lookup from Excel, keep the first row per Company,
# and retain only the join key and the quantile column.
df_Quantiles = (
    pd.read_excel("RIC and News Quantiles.xlsx")
    .drop_duplicates(subset="Company")
    .loc[:, ["Company", "News_Quantile"]]
)
df_Quantiles

In [None]:
# List the columns of the score table to check which keys are available for the merge below.
df_sp500_scores_vis.columns

In [None]:
# Attach each row's news quantile. The left join keeps every score row;
# the score table's "RIC" is matched against the lookup's "Company" column.
df_sp500_scores_vis_inc_dec = pd.merge(
    df_sp500_scores_vis,
    df_Quantiles,
    how="left",
    left_on="RIC",
    right_on="Company",
)

df_sp500_scores_vis_inc_dec.head()

In [None]:
# Summary statistics of `Sent_avg_conf` for each news quantile.
df_sp500_scores_vis_inc_dec_piv = pd.pivot_table(
    df_sp500_scores_vis_inc_dec,
    index=["News_Quantile"],
    values=["Sent_avg_conf"],
    # String aliases throughout: passing np.mean triggers a FutureWarning on
    # pandas >= 2.1, and "mean" yields the same column label.
    aggfunc=["mean", "median", "max", "std", "skew"],
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    fill_value=np.nan,
)
# Displayed sorted by quantile; the stored frame itself is not reassigned.
df_sp500_scores_vis_inc_dec_piv.sort_index()

In [None]:
# Write the per-quantile sentiment statistics to an Excel workbook in the working directory.
df_sp500_scores_vis_inc_dec_piv.to_excel(
    excel_writer="Average daily sentiment by decil of media coverage.xlsx"
)

### Analyse industries

In [None]:
# Read the RIC-to-industry lookup from Excel and keep the first row per RIC,
# so the later left join cannot multiply score rows.
df_Industry = pd.read_excel("RIC and Industry Sector 2.xlsx").drop_duplicates(
    subset="RIC"
)
df_Industry

In [None]:
# Preview the score table before joining the industry lookup onto it.
df_sp500_scores_vis.head()

In [None]:
# Attach the industry columns to every score row. The left join on "RIC"
# keeps rows that have no match in the lookup.
df_sp500_scores_vis_inc_ind = pd.merge(
    df_sp500_scores_vis,
    df_Industry,
    how="left",
    left_on="RIC",
    right_on="RIC",
)

df_sp500_scores_vis_inc_ind.head()

In [None]:
# Summary statistics of `Sent_avg_conf` for each "ICB Sector".
df_sp500_scores_vis_inc_ind_piv = pd.pivot_table(
    df_sp500_scores_vis_inc_ind,
    index=["ICB Sector"],
    values=["Sent_avg_conf"],
    # String aliases throughout: passing np.mean triggers a FutureWarning on
    # pandas >= 2.1, and "mean" yields the same column label.
    aggfunc=["mean", "median", "max", "std", "skew"],
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    fill_value=np.nan,
)

In [None]:
# Write the per-sector sentiment statistics to an Excel workbook in the working directory.
df_sp500_scores_vis_inc_ind_piv.to_excel(
    excel_writer="Average daily sentiment by industry.xlsx"
)