# Stock purchase recommendations with Machine Learning - using TPOT

In [1]:
import pandas as pd
import numpy as np
import talib as ta
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook # progress bar
import fastparquet
import pickle

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from tpot import TPOTClassifier



In [2]:
# Show up to 1500 columns when displaying a DataFrame, so wide feature frames are not truncated.
pd.options.display.max_columns = 1500

## 1) Load Training and Test Data

In [3]:
# Load the training and test data produced by the feature engineering step.
# The feature matrices are stored as parquet files, the targets as pickles.
X_train = fastparquet.ParquetFile('../data/processed/X_train.parq').to_pandas()
X_test = fastparquet.ParquetFile('../data/processed/X_test.parq').to_pandas()

# Open the pickles inside context managers so the file handles are closed
# deterministically instead of being left open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote itself.
with open('../data/processed/y_train.pkl', 'rb') as y_train_file:
    y_train = pickle.load(y_train_file)
with open('../data/processed/y_test.pkl', 'rb') as y_test_file:
    y_test = pickle.load(y_test_file)

# Display the shapes so a row-count mismatch between features and targets is visible.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7024, 693), (1756, 693), (7024,), (1756,))

## 2) Build Model

In [16]:
# Configure the TPOT search: 5 generations with a population of 20 pipelines,
# scored by F1 under 5-fold cross-validation. random_state fixes the seed so the
# search is repeatable; verbosity=2 prints per-generation progress; n_jobs=-1
# lets TPOT use all available cores.
pipeline_optimizer = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=5,
    scoring='f1',
    random_state=42,
    verbosity=2,
    n_jobs=-1,
)

In [17]:
# Run the TPOT pipeline search on the training data. fit() is the last expression
# of the cell, so the fitted optimizer's repr is shown as the cell output.
pipeline_optimizer.fit(X_train, y_train)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.3393850345594306
Generation 2 - Current best internal CV score: 0.3393850345594306
Generation 3 - Current best internal CV score: 0.3393850345594306
Generation 4 - Current best internal CV score: 0.4513608989034045
Generation 5 - Current best internal CV score: 0.4513608989034045

Best pipeline: GaussianNB(GaussianNB(input_matrix))


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=5,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=-1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=20,
        random_state=42, scoring='f1', subsample=1.0,
        template='RandomTree', use_dask=False, verbosity=2,
        warm_start=False)

In [18]:
# Score the best pipeline on the held-out test set (the optimizer was built with scoring='f1').
test_score = pipeline_optimizer.score(X_test, y_test)
print(test_score)

0.4075767472240366


In [25]:
# Write the best pipeline found by TPOT out as a standalone Python script.
exported_pipeline_path = '../src/models/TPOT/tpot_exported_pipeline.py'
pipeline_optimizer.export(exported_pipeline_path)

In [20]:
# Predict a label for every row of the test set with the fitted optimizer.
y_pred = pipeline_optimizer.predict(X_test)

# Save for backtesting in a separate notebook. The context manager guarantees the
# pickle is flushed and the file handle closed before the cell finishes, so the
# other notebook never reads a partially written file.
with open('../data/model_predictions/y_pred_TPOT.pkl', 'wb') as predictions_file:
    pickle.dump(y_pred, predictions_file)

In [21]:
# Per-class precision, recall and F1 of the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

       False       0.65      0.47      0.54      1153
        True       0.34      0.52      0.41       603

   micro avg       0.48      0.48      0.48      1756
   macro avg       0.49      0.49      0.47      1756
weighted avg       0.54      0.48      0.50      1756



In [22]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.48348519362186787

In [23]:
# Rows are the true classes and columns the predicted classes, both ordered [False, True].
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[False, True])

array([[537, 616],
       [291, 312]], dtype=int64)