# 0. Intro

This notebook deploys the optimized pipelines (fits each one on the training split) and evaluates their predictions on the held-out test split.

# 1. Packages & Basic Settings

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt
import shap
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
from evaluation import CustomEval

In [3]:
# Root data directory, relative to this notebook; the load cells join
# sub-folders and file names onto it.
data_path = "../data"

# 2. Data & Pipelines instructions

In [4]:
# Read the preprocessed DataFrame from <data_path>/intermediate.
preprocessed_df_filename = 'df_preprocessed.parquet'
preprocessed_df_path = os.path.join(data_path, 'intermediate', preprocessed_df_filename)

df = pd.read_parquet(preprocessed_df_path)

In [5]:
# Load the stored pipeline definitions from <data_path>/output.
# The evaluation loop in this notebook iterates the loaded object as
# {classification key: {(vectorizer key, model key): pipeline steps}}.
pipelines_instructions_path = os.path.join(data_path, 'output', 'NLP_FSA_pipelines_instructions.pkl')

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files produced by a trusted run of this project.
# The context manager guarantees the file handle is closed after loading
# (the bare open(...) it replaces left the handle open).
with open(pipelines_instructions_path, 'rb') as pkl_file:
    pipelines_instructions = pickle.load(pkl_file)

# 3. Data Segmentation

In [7]:
# Fraction of rows (taken from the end of df) reserved for testing.
overall_test_size = 0.2
train_fraction = 1 - overall_test_size

# Positional index of the first test row, rounded to the nearest integer.
split_point = int(round(len(df) * train_fraction))

In [8]:
# Ordered (unshuffled) positional split: rows before split_point are used for
# training, rows from split_point onwards for testing. Copies are taken so the
# two frames are independent of df.
train_rows = slice(None, split_point)
test_rows = slice(split_point, None)

df_train = df.iloc[train_rows].copy()
df_test = df.iloc[test_rows].copy()


# 4. Deploy & Evaluate 

In [9]:
# Metric names requested from CustomEval for every pipeline.
selected_metrics = [
    'MCC', 'Accuracy', 'F1', 'Roc_Auc',
    'PnL_sum', 'MDD',
]

# Values of the 'return' column for the test rows; handed to CustomEval.eval
# together with the true and predicted labels.
test_rets = df_test['return'].values



In [10]:
# Short key -> display name, used to label rows of the results table.
extended_names = dict(
    # vectorizers
    cv='CountVectorizer',
    tfidf='TfidfVectorizer',
    # classification schemes
    binary='Binary',
    ternary='Ternary',
    # models
    DT='DecisionTree',
    RF='RandomForest',
    HGB='GradientBoosting',
    VC='VotingClassifier',
)

In [11]:
# One dict per evaluated pipeline: identifying labels plus computed metrics.
all_results = []

# Outer key `clf` is the classification scheme; it selects both the label
# column ('<clf>_label') and the evaluator mode.
for clf, pipes in pipelines_instructions.items():

    label_col = f'{clf}_label'

    # Inner key is a (vectorizer, model) pair; the value is the list of steps
    # used to build the scikit-learn Pipeline.
    for (vect, model), pipe_steps in pipes.items():

        # Human-readable identifiers for this row of the results table.
        pipe_points = {
            'Classification': extended_names[clf],
            'Vectorizer': extended_names[vect],
            'Model': extended_names[model],
        }

        # Fit on the training split, predict on the test split.
        pipe = Pipeline(pipe_steps)
        pipe.fit(df_train['clean_text'], df_train[label_col])
        y_pred = pipe.predict(df_test['clean_text'])

        # Score predictions against the true test labels and the test returns.
        evaluator = CustomEval(clf=clf, metrics=selected_metrics)
        computed_metrics = evaluator.eval(df_test[label_col].values, y_pred, test_rets)

        results_store = dict(**pipe_points, **computed_metrics)
        all_results.append(results_store)


In [12]:
# Collect the per-pipeline result dicts into one table, ordered by
# classification scheme and vectorizer, with values rounded to 4 decimals.
df_res = (
    pd.DataFrame(all_results)
    .sort_values(by=['Classification', 'Vectorizer'])
    .round(4)
)

In [13]:
# Display the sorted, rounded results table (one row per evaluated pipeline).
df_res

Unnamed: 0,Classification,Vectorizer,Model,Roc_Auc,Accuracy,F1,MCC,MDD,PnL_sum
0,Binary,CountVectorizer,DecisionTree,0.8554,0.8724,0.8973,0.7539,-0.0271,3.1677
1,Binary,CountVectorizer,RandomForest,0.8689,0.8687,0.8818,0.7347,-0.0144,3.2052
2,Binary,CountVectorizer,GradientBoosting,0.9031,0.9062,0.9178,0.8088,-0.0271,3.2209
6,Binary,CountVectorizer,VotingClassifier,0.8758,0.8837,0.901,0.7638,-0.0271,3.188
3,Binary,TfidfVectorizer,DecisionTree,0.8633,0.8668,0.883,0.7285,-0.0229,3.1681
4,Binary,TfidfVectorizer,RandomForest,0.8581,0.8593,0.8744,0.7146,-0.0144,3.1932
5,Binary,TfidfVectorizer,GradientBoosting,0.8882,0.8949,0.91,0.7865,-0.015,3.2948
7,Binary,TfidfVectorizer,VotingClassifier,0.864,0.8687,0.8856,0.7321,-0.015,3.1997
8,Ternary,CountVectorizer,DecisionTree,,0.8255,,0.7315,-0.0271,3.1262
9,Ternary,CountVectorizer,RandomForest,,0.8124,,0.7116,-0.0127,3.1089
