#0. Importing packages and preprocessed data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pandas_datareader.data as web
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef, f1_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from datetime import datetime, timedelta
from nltk.stem.snowball import SnowballStemmer
from sklearn.svm import SVC
from xgboost import XGBClassifier
from numpy.random import uniform
from statistics import mean
from random import choice

In [4]:
# Folder holding the preprocessed CSV files; change the location here only.
# NOTE(review): the path looks like a Google Drive mount on Colab - confirm before running elsewhere
DATA_DIR='/content/drive/MyDrive/NLP_Financial_Sentiment_Analysis_Final'

# Single texts: rows with any missing value are dropped right after loading
text_df=pd.read_csv(f'{DATA_DIR}/text_df.csv',index_col=0).dropna()
# Files daily.csv / daily_news.csv (presumably the per-day aggregations of all texts / of news only)
daily_df=pd.read_csv(f'{DATA_DIR}/daily.csv',index_col=0)
daily_news_df=pd.read_csv(f'{DATA_DIR}/daily_news.csv',index_col=0)
# Rows of text_df whose category is 'news'; text_df is already NaN-free, so no second dropna is needed
news_df=text_df[text_df['category']=='news'].copy()

#1. Additional Information

To be able to filter out days/rows that carry too little information, we add to each dataframe the length (number of words) of the processed titles and articles

In [7]:
def get_words_number(lan_series):
  """Count the whitespace-separated words of every text in a collection.

     Args:
     lan_series (pd.Series or list): processed texts (each element must be a string)

     Returns:
     list : one integer per text, in the same order as the input
  """
  # str.split() without arguments splits on any run of whitespace, so an empty text counts 0 words
  return [len(sentence.split()) for sentence in lan_series]

In [8]:
# Add the word counts of processed titles and articles to every dataframe (columns are added in place).
# astype(str) lets get_words_number split every value, whatever its original type
for df in (text_df,news_df,daily_df,daily_news_df):
  for length_column,text_column in (('Title_Len','Processed_Titles'),('Article_Len','Processed_Articles')):
    df[length_column]=get_words_number(df[text_column].astype(str))

  

#2. Labelling

In order to apply Machine Learning supervised classification models we have to:

- Extract Features from processed text

- Label the resulting feature vectors. For this purpose we discretize the financial returns associated with each text in two ways: binary (Positive/Negative) and ternary (Positive/Neutral/Negative). Since treating only an exactly zero return (r=0) as "neutral" would in practice coincide with the binary classification, we try several neutral ranges for the ternary labelling

Proceeding with labelling

In [5]:
#Binary (Positive/Negative)

def binary_labeling(ret_series):
  """Turn a series of returns into binary labels.

     A return greater than or equal to zero is labelled 2 (Positive), a return
     below zero is labelled 0 (Negative). A value for which neither comparison
     holds (e.g. NaN) is labelled np.nan.

     Args:
     ret_series (pd.Series or list): time series of financial returns

     Returns:
     list : list of 2/0/np.nan, one element per return
  """
  def _sign_label(ret):
    if ret>=0:
      return 2
    if ret<0:
      return 0
    # Neither comparison is true: the value is not comparable with zero (NaN)
    return np.nan

  return [_sign_label(ret) for ret in ret_series]

#Ternary (Positive/Neutral/Negative)

def ternary_labeling_fixed(ret_series,lim=0.0015):
  """Turn a series of returns into ternary labels using a fixed neutral band.

     Labels: 2 = Positive (return >= lim), 1 = Neutral (return strictly inside
     (-lim , +lim)), 0 = Negative (return <= -lim). A value matching none of
     the three conditions (e.g. NaN) is labelled np.nan.

     Args:
     ret_series (pd.Series or list): time series of financial returns
     lim (float): half-width of the neutral band

     Returns:
     list : list composed of 2/1/0/np.nan, one element per return
  """
  def _band_label(ret):
    # The checks are tried in this exact order: positive, neutral, negative
    if ret>=lim:
      return 2
    if -lim<ret<lim:
      return 1
    if ret<=-lim:
      return 0
    return np.nan

  return [_band_label(ret) for ret in ret_series]

def ternary_labeling_parametrized(ret_series,d=10):
  """Turn a series of returns into ternary labels using a data-driven neutral band.

     The band limit is b = mean(returns) + std(returns)/d. Labels: 2 = Positive
     (return >= b), 1 = Neutral (return strictly inside (-b , +b)), 0 = Negative
     (return <= -b). A value matching none of the conditions (e.g. NaN) is
     labelled np.nan.

     NOTE(review): the band is symmetric around zero and widened/narrowed by the
     mean, it is not centred on the mean - confirm this is the intended definition.

     Args:
     ret_series (pd.Series or list): time series of financial returns
     d (int) : divisor applied to the standard deviation of the series when computing b

     Returns:
     list : list composed of 2/1/0/np.nan, one element per return
  """
  bound=(np.std(ret_series)/d)+np.mean(ret_series)

  def _band_label(ret):
    # The checks are tried in this exact order: positive, neutral, negative
    if ret>=bound:
      return 2
    if -bound<ret<bound:
      return 1
    if ret<=-bound:
      return 0
    return np.nan

  return [_band_label(ret) for ret in ret_series]

In [6]:
#For further uses we build a dictionary mapping labels into identifying strings 

Labels_Dictionary={}

# One entry per labelling scheme, in the order the columns are created:
# (new column name, identifying string, function turning a returns series into a list of labels)
label_recipes=[
  ('Binary_Labels','Binary',binary_labeling),
  ('Ternary_Labels_0.1%','Ternary_f_0.1%',lambda ret: ternary_labeling_fixed(ret,0.001)),
  ('Ternary_Labels_0.15%','Ternary_f_0.15%',lambda ret: ternary_labeling_fixed(ret,0.0015)),
  ('Ternary_Labels_0.2%','Ternary_f_0.2%',lambda ret: ternary_labeling_fixed(ret,0.002)),
  ('Ternary_Labels_0.35%','Ternary_f_0.35%',lambda ret: ternary_labeling_fixed(ret,0.0035)),
  ('Ternary_Labels_0.5%','Ternary_f_0.5%',lambda ret: ternary_labeling_fixed(ret,0.005)),
  ('Ternary_Labels_par_10','Ternary_par_10',lambda ret: ternary_labeling_parametrized(ret,10)),
  # Identifier spelled like 'Ternary_par_10' (single underscore) so both parametrized schemes share one naming pattern
  ('Ternary_Labels_par_5','Ternary_par_5',lambda ret: ternary_labeling_parametrized(ret,5)),
]

for df in [text_df,news_df,daily_df,daily_news_df]:
  for label_column,label_name,make_labels in label_recipes:
    df[label_column]=make_labels(df['Returns'])
    # The key is the printed form of the new column: results are later looked up with str(series)
    Labels_Dictionary[str(df[label_column])]=label_name

In [9]:
#We define the following dictionaries to map text and models into identifying strings

# Printed form of each text column -> identifying string (the printed form is what the lookup uses as key)
Text_Dictionary={}
for frame,headlines_name,articles_name in [
    (text_df,'Single_Headlines','Single_Articles'),
    (news_df,'Single_News_Headlines','Single_News_Articles'),
    (daily_df,'Daily_Headlines','Daily_Articles'),
    (daily_news_df,'Daily_News_Headlines','Daily_News_Articles'),
]:
  Text_Dictionary[str(frame['Processed_Titles'])]=headlines_name
  Text_Dictionary[str(frame['Processed_Articles'])]=articles_name

# Printed form of each estimator -> short model name
Models_Dictionary={
  str(MultinomialNB()):'NB',
  str(LogisticRegression(max_iter=20000)):'LR',
  str(DecisionTreeClassifier()):'DT',
  str(RandomForestClassifier()):'RF',
  str(GradientBoostingClassifier()):'GrB',
}


#3. ML Models First Outlook

Since we have a huge number of possible text-label-model combinations, we start with a preliminary analysis using default models to get an idea of which cases deserve further attention

In [11]:
# Every label column is paired with the processed headlines of each dataframe
label_columns=['Binary_Labels','Ternary_Labels_0.1%','Ternary_Labels_0.15%','Ternary_Labels_0.2%','Ternary_Labels_0.35%','Ternary_Labels_0.5%','Ternary_Labels_par_10','Ternary_Labels_par_5']

headlines_combinations=[]
for df in [text_df,news_df,daily_df,daily_news_df]:
  hlc=[(df['Processed_Titles'],df[label_column]) for label_column in label_columns]
  headlines_combinations.extend(hlc)


In [10]:
def BOW_vects_default_models_preliminary_outlook(comb_list,models_dict,text_input_dict,labels_dict,vzer='cv',max_ngram=3,max_ft=500,train_size=0.8,models=None): 
  """
     Function trying some models on a list of texts-labels combinations

     For every combination the first `train_size` share of the rows is used to fit
     the vectorizer and the models, the remaining rows are used as test set
     (no shuffling: the original row order is kept).

     Args:
     comb_list (list) : list of texts(series) - labels(series) couples
     models_dict (dictionary) : dictionary providing a model - string correspondance, keyed by str(model)
     text_input_dict (dictionary) : dictionary providing a text input - string correspondance, keyed by str(text series)
     labels_dict (dictionary) : dictionary providing a labels input - string correspondance, keyed by str(labels series)
     vzer (string) : string identifying the vectorizer , accepted only 'cv' (CountVectorizer) and 'tfidf' (TfidfVectorizer)
     max_ngram (int) : maximum ngrams length to be included in the vectorizer
     max_ft (int) : maximum number of words/ngrams to be taken into account in the vectorizer
     train_size (float) : relative size of the train set
     models (list or None) : list of models to apply; None (default) uses freshly created MultinomialNB,
                             LogisticRegression(max_iter=20000), DecisionTreeClassifier, RandomForestClassifier
                             and GradientBoostingClassifier with default settings

     Returns:
     (pd.DataFrame) : DataFrame including all the results of the models applied to every texts-labels combination,
                      sorted by decreasing test accuracy

     Raises:
     ValueError : if vzer is neither 'cv' nor 'tfidf'
     KeyError : if a text, a labels series or a model has no entry in the corresponding dictionary

  """
  if vzer not in ('cv','tfidf'):
    raise ValueError(f"vzer must be 'cv' or 'tfidf', got {vzer!r}")
  if models is None:
    # Built at call time: a default list in the signature would be created once and shared by every call
    models=[MultinomialNB(),LogisticRegression(max_iter=20000),DecisionTreeClassifier(),RandomForestClassifier(),GradientBoostingClassifier()]
  vectorizer_class=CountVectorizer if vzer=='cv' else TfidfVectorizer
  vectorizer=vectorizer_class(strip_accents='ascii', stop_words='english',ngram_range=(1,max_ngram),max_features=max_ft)
  columns=['Text_Data','Labels','Model','Accuracy_Train','Accuracy_Test']
  rows=[]
  for text,lab in comb_list:
    # Resolve the identifying strings once per combination (str(series) is the same for every model)
    text_name=text_input_dict[str(text)]
    labels_name=labels_dict[str(lab)]
    N=len(text)
    tt_split=int(round(N*train_size,0))
    # The vocabulary is learnt on the train rows only and then applied to the test rows
    X_train=vectorizer.fit_transform(text[0:tt_split])
    X_test=vectorizer.transform(text[tt_split:N])
    y_train=lab[0:tt_split]
    y_test=lab[tt_split:N]
    for mdl in models:
      # fit() re-trains the estimator in place, so the same object can be reused for every combination
      mdl.fit(X_train,y_train)
      preds_train=mdl.predict(X_train)
      preds_test=mdl.predict(X_test)
      rows.append((text_name,labels_name,models_dict[str(mdl)],accuracy_score(y_train,preds_train),accuracy_score(y_test,preds_test)))
  # Passing the column names explicitly keeps the expected columns even when there are no rows
  results=pd.DataFrame(rows,columns=columns)
  results=results.sort_values(by='Accuracy_Test',ascending=False)
  return results



##3.1 Features Extraction Method: CountVectorizer

In [12]:
# Default models on every headlines/labels combination, with the default CountVectorizer settings
HL_def_models_res=BOW_vects_default_models_preliminary_outlook(
    comb_list=headlines_combinations,
    models_dict=Models_Dictionary,
    text_input_dict=Text_Dictionary,
    labels_dict=Labels_Dictionary,
)

HL_def_models_res.head(20)

Unnamed: 0,Text_Data,Labels,Model,Accuracy_Train,Accuracy_Test
124,Daily_News_Headlines,Binary,GrB,0.864221,0.885714
123,Daily_News_Headlines,Binary,RF,0.999524,0.878095
84,Daily_Headlines,Binary,GrB,0.84572,0.86629
83,Daily_Headlines,Binary,RF,0.99953,0.847458
120,Daily_News_Headlines,Binary,NB,0.794188,0.841905
82,Daily_Headlines,Binary,DT,0.99953,0.838041
122,Daily_News_Headlines,Binary,DT,0.999524,0.834286
128,Daily_News_Headlines,Ternary_f_0.1%,RF,0.999524,0.805714
88,Daily_Headlines,Ternary_f_0.1%,RF,0.99953,0.800377
133,Daily_News_Headlines,Ternary_f_0.15%,RF,0.999524,0.8


To notice:

- Results from daily aggregated headlines occupy all the first 20 rows
- Binary Classification results are superior on average
- GradientBoosting and RandomForest classifiers show the best performance



In [14]:
# Best ten results among the ternary labellings only
HL_def_models_res.loc[HL_def_models_res['Labels'].ne('Binary')].head(10)

Unnamed: 0,Text_Data,Labels,Model,Accuracy_Train,Accuracy_Test
128,Daily_News_Headlines,Ternary_f_0.1%,RF,0.999524,0.805714
88,Daily_Headlines,Ternary_f_0.1%,RF,0.99953,0.800377
133,Daily_News_Headlines,Ternary_f_0.15%,RF,0.999524,0.8
153,Daily_News_Headlines,Ternary_par_10,RF,1.0,0.792381
129,Daily_News_Headlines,Ternary_f_0.1%,GrB,0.839924,0.782857
89,Daily_Headlines,Ternary_f_0.1%,GrB,0.821731,0.779661
114,Daily_Headlines,Ternary_par_10,GrB,0.819379,0.762712
94,Daily_Headlines,Ternary_f_0.15%,GrB,0.817027,0.755179
138,Daily_News_Headlines,Ternary_f_0.2%,RF,1.0,0.748571
134,Daily_News_Headlines,Ternary_f_0.15%,GrB,0.825631,0.748571


- On average, the narrower the neutral range, the better the accuracy

In [15]:
# Best ten results for the binary labelling only
HL_def_models_res.loc[HL_def_models_res['Labels'].eq('Binary')].head(10)

Unnamed: 0,Text_Data,Labels,Model,Accuracy_Train,Accuracy_Test
124,Daily_News_Headlines,Binary,GrB,0.864221,0.885714
123,Daily_News_Headlines,Binary,RF,0.999524,0.878095
84,Daily_Headlines,Binary,GrB,0.84572,0.86629
83,Daily_Headlines,Binary,RF,0.99953,0.847458
120,Daily_News_Headlines,Binary,NB,0.794188,0.841905
82,Daily_Headlines,Binary,DT,0.99953,0.838041
122,Daily_News_Headlines,Binary,DT,0.999524,0.834286
80,Daily_Headlines,Binary,NB,0.762935,0.789077
81,Daily_Headlines,Binary,LR,0.860301,0.770245
121,Daily_News_Headlines,Binary,LR,0.880896,0.746667


- For almost every model, restricting the data to the news category improves performance

#4. Model Tuning

In [33]:
#Preliminary function necessary for a sort of "customized" model validation

def time_cross_val_sampling(total_set_length):
  """
     Draw a random train window immediately followed by a test window, keeping the row order

     The train share is drawn uniformly in [0.48, 0.75), the test share uniformly between
     one quarter and one third of the train share, and the starting point uniformly among
     the positions that leave room for both windows.

     Args:
     total_set_length (int) : total length of the sample data

     Returns:
     starting_idx (int) : first index (of the whole dataset) of the train set
     splitting_idx (int) : end of the train set and first index of the test set
     test_end_idx (int) : end of the test set

  """
  def _to_index(fraction):
    # Fraction of the dataset -> integer position
    return int(round(fraction*total_set_length,0))

  train_share=uniform(0.48,0.75)
  test_share=uniform(train_share/4,train_share/3)
  start_fraction=uniform(0,1-(train_share+test_share))
  split_fraction=start_fraction+train_share
  end_fraction=split_fraction+test_share
  return _to_index(start_fraction),_to_index(split_fraction),_to_index(end_fraction)

In [34]:
#Preliminary function to train/test split the sample

def tt_split(ds,txt_col,lab_col,length_column,minimum_length,vect,max_ngram,max_ft,scaling):
  """
     Function returning train/test features (vectorized text) and labels under some parameter input

     Rows with fewer than `minimum_length` words are discarded, then a random train window
     followed by a test window is drawn with time_cross_val_sampling. The vectorizer and the
     optional scaler are fitted on the train window only.

     Args:
     ds (pd.DataFrame) : DataFrame including text,text length and corresponding labels
     txt_col (string) : string identifying the column containing text
     lab_col (string) : string identifying the column containing labels
     length_column (string) : string identifying the column containing text number of words
     minimum_length (int) : minimum number of words a row must have to be kept
     vect (string) : string to identify a vectorizer through the following correspondance {'cv':CountVectorizer,'tfidf':TfidfVectorizer}
     max_ngram (int) : maximum ngrams length to be included in the vectorizer
     max_ft (int) : maximum number of features to be included in the vectorizer
     scaling (string) : string to identify a scaling technique through the following correspondance {'standard':StandardScaler,'robust':RobustScaler,'minmax':MinMaxScaler,else: No scaler applied}
     
     Returns:
     X_train (np.array) : X Train set (dense array)
     X_test (np.array) : X Test set (dense array)
     y_train (pd.Series) : y Train set
     y_test (pd.Series) : y Test set

     Raises:
     ValueError : if vect is neither 'cv' nor 'tfidf'

  """
  # The identifier is `vect`; checking it first makes a wrong value fail before any work is done
  if vect=='cv':
    vectorizer_class=CountVectorizer
  elif vect=='tfidf':
    vectorizer_class=TfidfVectorizer
  else:
    raise ValueError(f"vect must be 'cv' or 'tfidf', got {vect!r}")
  dataset_sel=ds[ds[length_column]>=minimum_length]
  train_starting_time,test_starting_time,test_end_time=time_cross_val_sampling(len(dataset_sel))
  text=dataset_sel[txt_col]
  lab=dataset_sel[lab_col]
  # The sampled values are row positions, hence the explicit positional indexer
  train=text.iloc[train_starting_time:test_starting_time]
  test=text.iloc[test_starting_time:test_end_time]
  y_train=lab.iloc[train_starting_time:test_starting_time]
  y_test=lab.iloc[test_starting_time:test_end_time]
  vectorizer=vectorizer_class(strip_accents='ascii', stop_words='english',ngram_range=(1,max_ngram),max_features=max_ft)
  # Vocabulary learnt on the train window only; dense arrays are needed by the scalers
  X_train=vectorizer.fit_transform(train).toarray()
  X_test=vectorizer.transform(test).toarray()
  scaler_class={'standard':StandardScaler,'robust':RobustScaler,'minmax':MinMaxScaler}.get(scaling)
  if scaler_class is not None:
    # Any other string means "no scaling": the vectorized features are returned as they are
    scaler=scaler_class()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)
  return X_train,X_test,y_train,y_test

    

##4.1 Gradient Boosting Classifier

Parameters under analysis and questions to answer to improve the model results:
- Minimum length to filter out rows with lower text amount
- Maximum ngrams length to be included in the vectorizer
- Maximum number of features length to be included in the vectorizer
- Type of Scaler (including None)
- GradientBoostingClassifier learning rate
- GradientBoostingClassifier number of estimators
- GradientBoostingClassifier max depth


In [35]:
def custom_RandomizedSearchCV_for_GrB(dataset,text_col,labels_col,len_col,len_filter,vzer,max_ngram_l,max_ft_l,scalers_l,learning_rate_list,n_estimators_list,max_dpt_l,attempts=80,val_samples=5):
  """
     Random search over features extraction settings and GradientBoostingClassifier parameters

     At every attempt one value is picked at random from each candidate list; the resulting
     configuration is evaluated on `val_samples` random train/test windows (see tt_split)
     and the train/test accuracies are averaged.

     Args:
     dataset (pd.DataFrame) : DataFrame including text,text length and corresponding labels
     text_col (string) : string identifying the column containing text
     labels_col (string) : string identifying the column containing labels
     len_col (string) : string identifying the column containing text number of words
     len_filter (list) : candidate minimum numbers of words required to keep a row. List of integers
     vzer (string) : string to identify a vectorizer through the following correspondance {'cv':CountVectorizer,'tfidf':TfidfVectorizer}
     max_ngram_l (list) : candidate maximum ngrams lengths to be included in the vectorizer. List of integers
     max_ft_l (list) : candidate maximum numbers of features to be included in the vectorizer. List of integers
     scalers_l (list) : candidate scaling techniques, as strings following the correspondance {'standard':StandardScaler,'robust':RobustScaler,'minmax':MinMaxScaler,else: No scaler applied}
     learning_rate_list (list) : candidate model learning rates. List of floats
     n_estimators_list (list) : candidate model n_estimators. List of integers
     max_dpt_l (list) : candidate model maximum depths. List of integers
     attempts (int) : number of attempts (times to randomly select parameters)
     val_samples (int) : number of random train/test windows used to compute the (average) performances


     Returns:
     results (pd.DataFrame) : one row per attempt, sorted by decreasing average test accuracy

  """
  log={'Model':[],'Minimum_Length':[],'Vectorizer':[],'Vect_Max_ngram':[],'Vect_Max_features':[],'Scaler':[],'GrB_Learning_Rate':[],'GrB_n_estimators':[],'GrB_max_depth':[],'Accuracy_Train':[],'Accuracy_Test':[]}
  for attempt in range(attempts):
    # One random pick per parameter
    min_len=choice(len_filter)
    mng=choice(max_ngram_l)
    mft=choice(max_ft_l)
    scl=choice(scalers_l)
    lr=choice(learning_rate_list)
    ne=choice(n_estimators_list)
    md=choice(max_dpt_l)
    train_scores=[]
    test_scores=[]
    for sample in range(val_samples):
      # Every sample is a new random train/test window of the filtered dataset
      X_train,X_test,y_train,y_test=tt_split(dataset,text_col,labels_col,len_col,min_len,vzer,mng,mft,scl)
      model=GradientBoostingClassifier(learning_rate=lr,max_depth=md,n_estimators=ne)
      model.fit(X_train,y_train)
      train_scores.append(accuracy_score(y_train,model.predict(X_train)))
      test_scores.append(accuracy_score(y_test,model.predict(X_test)))
    trial={'Model':'GrB','Minimum_Length':min_len,'Vectorizer':vzer,'Vect_Max_ngram':mng,'Vect_Max_features':mft,'Scaler':scl,'GrB_Learning_Rate':lr,'GrB_n_estimators':ne,'GrB_max_depth':md,'Accuracy_Train':mean(train_scores),'Accuracy_Test':mean(test_scores)}
    for column,value in trial.items():
      log[column].append(value)
  return pd.DataFrame(log).sort_values(by='Accuracy_Test',ascending=False)


In [118]:
# Random search for Gradient Boosting on the daily news headlines with binary labels
grB_randCV_results=custom_RandomizedSearchCV_for_GrB(
    dataset=daily_news_df,
    text_col='Processed_Titles',
    labels_col='Binary_Labels',
    len_col='Title_Len',
    len_filter=[0,50,100],
    vzer='cv',
    max_ngram_l=[2,3,4,5,6],
    max_ft_l=[300,500,600,800,1500,3000,6000],
    scalers_l=['standard','robust','NO'],
    learning_rate_list=[0.08,0.1,0.15,0.2,0.25],
    n_estimators_list=[50,80,100,120,150],
    max_dpt_l=[3,4,5],
    attempts=80,
    val_samples=4,
)

grB_randCV_results.head()

Unnamed: 0,Model,Minimum_Length,Vectorizer,Vect_Max_ngram,Vect_Max_features,Scaler,GrB_Learning_Rate,GrB_n_estimators,GrB_max_depth,Accuracy_Train,Accuracy_Test
61,GrB,50,cv,6,300,robust,0.08,100,3,0.939995,0.889332
65,GrB,100,cv,6,600,standard,0.25,100,3,1.0,0.888079
7,GrB,50,cv,2,500,NO,0.1,120,5,0.999793,0.886588
59,GrB,50,cv,4,6000,NO,0.08,120,4,0.995598,0.886314
2,GrB,100,cv,5,600,robust,0.2,100,5,1.0,0.886166


Taking into account the overfitting risk, we keep only the parameter combinations with a relatively small difference between train and test accuracy.

In [132]:
# Absolute gap between average train and test accuracy of every attempt
grB_randCV_results['TT_difference']=(grB_randCV_results['Accuracy_Train']-grB_randCV_results['Accuracy_Test']).abs()

# Keep the attempts whose gap is below 0.05 (the existing sort by test accuracy is preserved)
grB_randCV_results_avoiding_OF=grB_randCV_results.loc[grB_randCV_results['TT_difference']<0.05]

grB_randCV_results_avoiding_OF.head()

Unnamed: 0,Model,Minimum_Length,Vectorizer,Vect_Max_ngram,Vect_Max_features,Scaler,GrB_Learning_Rate,GrB_n_estimators,GrB_max_depth,Accuracy_Train,Accuracy_Test,TT_difference
52,GrB,0,cv,4,3000,robust,0.08,80,4,0.923196,0.882267,0.040929
15,GrB,0,cv,3,1500,NO,0.1,50,3,0.84759,0.874235,0.026646
10,GrB,0,cv,4,3000,standard,0.08,120,3,0.887264,0.863018,0.024246
46,GrB,0,cv,4,3000,NO,0.1,50,4,0.881614,0.859678,0.021936
6,GrB,0,cv,6,300,NO,0.08,150,3,0.883812,0.856572,0.02724


Best RandomizedSearch GradientBoostingClassifier Model

In [133]:
#Best parameters = first row of the filtered random search results, read by column name

top_min_len=grB_randCV_results_avoiding_OF['Minimum_Length'].iloc[0]
top_max_ng=grB_randCV_results_avoiding_OF['Vect_Max_ngram'].iloc[0]
top_max_ft=grB_randCV_results_avoiding_OF['Vect_Max_features'].iloc[0]
top_lr=grB_randCV_results_avoiding_OF['GrB_Learning_Rate'].iloc[0]
top_md=grB_randCV_results_avoiding_OF['GrB_max_depth'].iloc[0]
top_ne=grB_randCV_results_avoiding_OF['GrB_n_estimators'].iloc[0]

print(f'Minimum Length: {top_min_len}')
print(f'Max ngrams: {top_max_ng}')
print(f'Max features: {top_max_ft}')
print(f'Learning Rate: {top_lr}')
print(f'Max depth: {top_md}')
print(f'N_estimators: {top_ne}')


#Keep only the rows with at least the selected number of words

GB_daily_news_df_selection=daily_news_df.loc[daily_news_df['Title_Len']>=top_min_len].copy()

GB_text=GB_daily_news_df_selection['Processed_Titles']
GB_lab=GB_daily_news_df_selection['Binary_Labels']

#Train = first 80% of the rows, test = remaining rows (row order preserved)

Size_train=0.8

train_test_splitting_number=int(round(len(GB_daily_news_df_selection)*Size_train,0))
GB_train_text=GB_text[:train_test_splitting_number]
GB_test_text=GB_text[train_test_splitting_number:]
GB_y_train=GB_lab[:train_test_splitting_number]
GB_y_test=GB_lab[train_test_splitting_number:]

#Vectorizing (vocabulary learnt on the train texts only) and Scaling

GB_cv_vectorizer=CountVectorizer(strip_accents='ascii', stop_words='english',ngram_range=(1,top_max_ng),max_features=top_max_ft)

X_train_GB=GB_cv_vectorizer.fit_transform(GB_train_text).toarray()
X_test_GB=GB_cv_vectorizer.transform(GB_test_text).toarray()

GB_scaler_name=grB_randCV_results_avoiding_OF['Scaler'].iloc[0]
if GB_scaler_name in ('standard','robust'):
  # The scaler is fitted on the train features and then applied to the test features
  GB_scaler=StandardScaler() if GB_scaler_name=='standard' else RobustScaler()
  X_train_GB=GB_scaler.fit_transform(X_train_GB)
  X_test_GB=GB_scaler.transform(X_test_GB)
else:
  print('No scaling')


#Model

GB=GradientBoostingClassifier(learning_rate=top_lr,max_depth=top_md,n_estimators=top_ne )
GB.fit(X_train_GB,GB_y_train)

GB_predictions_train=GB.predict(X_train_GB)
GB_predictions=GB.predict(X_test_GB)


#Results (ROC AUC and MCC are computed on the predicted classes)
print(f'Accuracy Train : {accuracy_score(GB_y_train,GB_predictions_train)}')
print(f'Accuracy Test : {accuracy_score(GB_y_test,GB_predictions)}')
print(f'ROC AUC Score Test : {roc_auc_score(GB_y_test,GB_predictions)}')
print(f'MCC Test : {matthews_corrcoef(GB_y_test,GB_predictions)}')

Minimum Length: 0
Max ngrams: 4
Max features: 3000
Learning Rate: 0.08
Max depth: 4
N_estimators: 80
Accuracy Train : 0.8761314911862792
Accuracy Test : 0.8914285714285715
ROC AUC Score Test : 0.8877551020408163
MCC Test : 0.7792366493720679


##4.2 Random Forest Classifier

Parameters under analysis and questions to answer to improve the model results:
- Minimum length to filter out rows with lower text amount
- Maximum ngrams length to be included in the vectorizer
- Maximum number of features length to be included in the vectorizer
- Type of Scaler (including None)
- RandomForestClassifier criterion
- RandomForestClassifier number of estimators
- RandomForestClassifier max depth

In [135]:
def revised_RandomizedSearchCV_for_RF(dataset,text_col,labels_col,len_col,len_filter,vzer,max_ngram_l,max_ft_l,scalers_l,criterion_list,n_estimators_list,max_dpt_l,attempts=80,val_samples=5):
  """
     Random search over features extraction settings and RandomForestClassifier parameters

     At every attempt one value is picked at random from each candidate list; the resulting
     configuration is evaluated on `val_samples` random train/test windows (see tt_split)
     and the train/test accuracies are averaged.

     Args:
     dataset (pd.DataFrame) : DataFrame including text,text length and corresponding labels
     text_col (string) : string identifying the column containing text
     labels_col (string) : string identifying the column containing labels
     len_col (string) : string identifying the column containing text number of words
     len_filter (list) : candidate minimum numbers of words required to keep a row. List of integers
     vzer (string) : string to identify a vectorizer through the following correspondance {'cv':CountVectorizer,'tfidf':TfidfVectorizer}
     max_ngram_l (list) : candidate maximum ngrams lengths to be included in the vectorizer. List of integers
     max_ft_l (list) : candidate maximum numbers of features to be included in the vectorizer. List of integers
     scalers_l (list) : candidate scaling techniques, as strings following the correspondance {'standard':StandardScaler,'robust':RobustScaler,'minmax':MinMaxScaler,else: No scaler applied}
     criterion_list (list) : candidate split criteria of the model. List of strings
     n_estimators_list (list) : candidate model n_estimators. List of integers
     max_dpt_l (list) : candidate model maximum depths. List of integers
     attempts (int) : number of attempts (times to randomly select parameters)
     val_samples (int) : number of random train/test windows used to compute the (average) performances


     Returns:
     results (pd.DataFrame) : one row per attempt, sorted by decreasing average test accuracy

  """
  log={'Model':[],'Minimum_Length':[],'Vectorizer':[],'Vect_Max_ngram':[],'Vect_Max_features':[],'Scaler':[],'RF_Criterion':[],'RF_n_estimators':[],'RF_max_depth':[],'Accuracy_Train':[],'Accuracy_Test':[]}
  for attempt in range(attempts):
    # One random pick per parameter
    min_len=choice(len_filter)
    mng=choice(max_ngram_l)
    mft=choice(max_ft_l)
    scl=choice(scalers_l)
    cr=choice(criterion_list)
    ne=choice(n_estimators_list)
    md=choice(max_dpt_l)
    train_scores=[]
    test_scores=[]
    for sample in range(val_samples):
      # Every sample is a new random train/test window of the filtered dataset
      X_train,X_test,y_train,y_test=tt_split(dataset,text_col,labels_col,len_col,min_len,vzer,mng,mft,scl)
      model=RandomForestClassifier(criterion=cr,max_depth=md,n_estimators=ne)
      model.fit(X_train,y_train)
      train_scores.append(accuracy_score(y_train,model.predict(X_train)))
      test_scores.append(accuracy_score(y_test,model.predict(X_test)))
    trial={'Model':'RF','Minimum_Length':min_len,'Vectorizer':vzer,'Vect_Max_ngram':mng,'Vect_Max_features':mft,'Scaler':scl,'RF_Criterion':cr,'RF_n_estimators':ne,'RF_max_depth':md,'Accuracy_Train':mean(train_scores),'Accuracy_Test':mean(test_scores)}
    for column,value in trial.items():
      log[column].append(value)
  return pd.DataFrame(log).sort_values(by='Accuracy_Test',ascending=False)


In [136]:
# Random search for Random Forest on the daily news headlines with binary labels
RF_randCV_results=revised_RandomizedSearchCV_for_RF(
    dataset=daily_news_df,
    text_col='Processed_Titles',
    labels_col='Binary_Labels',
    len_col='Title_Len',
    len_filter=[0,50,100],
    vzer='cv',
    max_ngram_l=[3,4,5,6],
    max_ft_l=[300,500,600,800,1500,3000,6000],
    scalers_l=['standard','robust','NO'],
    criterion_list=['gini','entropy'],
    n_estimators_list=[60,80,100,120,150],
    max_dpt_l=[2,3,4,5,6,8],
    attempts=40,
    val_samples=4,
)

RF_randCV_results.head(5)

Unnamed: 0,Model,Minimum_Length,Vectorizer,Vect_Max_ngram,Vect_Max_features,Scaler,RF_Criterion,RF_n_estimators,RF_max_depth,Accuracy_Train,Accuracy_Test
32,RF,100,cv,4,3000,NO,entropy,60,4,0.882926,0.895408
34,RF,100,cv,5,6000,robust,gini,100,6,0.925109,0.893113
27,RF,100,cv,4,300,robust,entropy,150,8,0.989038,0.889696
21,RF,50,cv,5,6000,standard,entropy,120,3,0.816385,0.889445
12,RF,100,cv,5,3000,NO,gini,120,8,0.974359,0.886362


In [138]:
#Get the best parameters from the customized random search above
#(first row of the results; positions follow the column order of the results DataFrame)

RF_top_min_len=RF_randCV_results.iloc[0,1]   # Minimum_Length
RF_top_max_ng=RF_randCV_results.iloc[0,3]    # Vect_Max_ngram
RF_top_max_ft=RF_randCV_results.iloc[0,4]    # Vect_Max_features

print(f'Minimum Length: {RF_top_min_len}')
print(f'Max ngrams: {RF_top_max_ng}')
print(f'Max features: {RF_top_max_ft}')

RF_top_cr=RF_randCV_results.iloc[0,6]        # RF_Criterion
RF_top_md=RF_randCV_results.iloc[0,8]        # RF_max_depth
RF_top_ne=RF_randCV_results.iloc[0,7]        # RF_n_estimators


print(f'Criterion: {RF_top_cr}')
print(f'Max depth: {RF_top_md}')
print(f'N_estimators: {RF_top_ne}')


#Slice through minimum length

RF_daily_news_df_selection=daily_news_df[daily_news_df['Title_Len']>=RF_top_min_len].copy()

RF_text=RF_daily_news_df_selection['Processed_Titles']
RF_lab=RF_daily_news_df_selection['Binary_Labels']

#Define train/test (first 80% of the rows for training, remaining rows for testing)

Size_Train=0.8

train_test_splitting_n=int(round(len(RF_daily_news_df_selection)*Size_Train,0))
RF_train_text=RF_text[0:train_test_splitting_n]
RF_test_text=RF_text[train_test_splitting_n:]
RF_y_train=RF_lab[0:train_test_splitting_n]
RF_y_test=RF_lab[train_test_splitting_n:]


#Vectorizing and Scaling

RF_cv_vectorizer=CountVectorizer(strip_accents='ascii', stop_words='english',ngram_range=(1,RF_top_max_ng),max_features=RF_top_max_ft)


X_train_RF=RF_cv_vectorizer.fit_transform(RF_train_text).toarray()
X_test_RF=RF_cv_vectorizer.transform(RF_test_text).toarray()

# The scaler must be fitted on the Random Forest features built just above (X_train_RF / X_test_RF),
# which come from this section's own row selection and vectorizer
if RF_randCV_results.iloc[0,5]=='standard':   # Scaler
  RF_scaler=StandardScaler()  
  X_train_RF=RF_scaler.fit_transform(X_train_RF)
  X_test_RF=RF_scaler.transform(X_test_RF)
elif RF_randCV_results.iloc[0,5]=='robust':
  RF_scaler=RobustScaler()  
  X_train_RF=RF_scaler.fit_transform(X_train_RF)
  X_test_RF=RF_scaler.transform(X_test_RF)
else:
  print('No scaling')


#Model

RF=RandomForestClassifier(criterion=RF_top_cr,max_depth=RF_top_md,n_estimators=RF_top_ne)  
RF.fit(X_train_RF,RF_y_train)

RF_predictions_train=RF.predict(X_train_RF)
RF_predictions=RF.predict(X_test_RF)


#Results (ROC AUC and MCC are computed on the predicted classes)
print(f'Accuracy Train : {accuracy_score(RF_y_train,RF_predictions_train)}')
print(f'Accuracy Test : {accuracy_score(RF_y_test,RF_predictions)}')
print(f'ROC AUC Score Test : {roc_auc_score(RF_y_test,RF_predictions)}')
print(f'MCC Test : {matthews_corrcoef(RF_y_test,RF_predictions)}')

Minimum Length: 100
Max ngrams: 4
Max features: 3000
Criterion: entropy
Max depth: 4
N_estimators: 60
No scaling
Accuracy Train : 0.8534554537885096
Accuracy Test : 0.9033333333333333
ROC AUC Score Test : 0.9034285714285715
MCC Test : 0.8027393307271766


Under the constraint of having at least one hundred words of processed text as input, the Random Forest Classifier delivers impressive performance