In [5]:
%load_ext autoreload
%autoreload 2

In [None]:
# !pip install memoized_property
# !pip install mlflow

# IMPORTS

In [6]:
import pandas as pd
import numpy as np
import statistics as stats
import math
from memoized_property import memoized_property
import mlflow
from mlflow.tracking import MlflowClient
from itertools import product
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier

In [8]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Data import + Preprocessing

In [9]:
data = pd.read_csv('/content/drive/MyDrive/bitstampUSD.csv')

In [10]:
data = data[2798176:4727776].copy()

In [11]:
def preprocessing_data(data, features_size, h):
        
    data_pp = data.copy()
    data_pp['diff_Open'] = data_pp['Open'].diff(h)
    data_pp['diff_Open'] = data_pp['diff_Open'].dropna()
    data_pp[f"t+{h}"] = data_pp['diff_Open'].shift(-h)
    
    for i in range(0, features_size):
        data_pp[f't-{i}'] = data_pp['Open'].shift(i)
    data_shifted = data_pp.dropna()
    
    return data_shifted


# Feature building

In [12]:
def features_target(data_shifted, h):
    
    X = data_shifted.drop(columns=['Open', 'diff_Open', f"t+{h}"])
    y = data_shifted[f"t+{h}"].copy()
    y[y > 0] = 1
    y[y <= 0] = 0
    
    data_size = data_shifted.shape[0]
    
    return X, y, data_size

# Date formatting + selecting

In [13]:
def select_date(data, date_start, date_end):
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s', origin='unix')
    data = data[['Open', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')

    if date_start != None:
        if date_end != None:
            data = data[date_start:date_end].copy()
    else:
        data = data.copy()
        
    return data

In [14]:
def input_data(X, y, sample_size, data_size, train_size, test_size, h=1, w=0):    
 

    sample_X = X.iloc[data_size-(test_size * w + sample_size) : data_size - (test_size * w)]
    sample_y = y.iloc[data_size-(test_size * w + sample_size) : data_size - (test_size * w)]
    
    X_train = sample_X.iloc[0:train_size]
    y_train = sample_y.iloc[0:train_size]
    X_test = sample_X.iloc[(train_size+h-1):(sample_size)]
    y_test = sample_y.iloc[(train_size+h-1):(sample_size)]
    
    return X_train, X_test, y_train, y_test

In [16]:
def predict_score(model_init, X_train, X_test, y_train, y_test):
    model = model_init
    model = model.fit(X_train, y_train)
    results = model.predict(X_test)
    score = model.score(X_test, y_test) 
    return score, model.coef_


In [17]:
def cross_val(data, model_init=None,sample_size=1000, train_fraction=0.7, features_size=60, h=1, date_start=None, date_end=None):
    
    data = select_date(data, date_start, date_end)
    data_shifted = preprocessing_data(data, features_size, h)
    X, y, data_size = features_target(data_shifted, h)
    train_size = int(train_fraction*sample_size)
    test_size = sample_size - train_size
    
    
    r = math.floor((data_size-sample_size)/test_size)
    intervals = range(0, r)
    reversed_intervals = reversed(intervals)
    results = []
    parameters = []
    
    for i in reversed_intervals:
        X_train, X_test, y_train, y_test = input_data(X, y, sample_size, data_size, train_size, test_size, h, w=i)
        score, params = predict_score(model_init, X_train, X_test, y_train, y_test)
        results.append(score)
        parameters.append(params)
        # print(f"fold {i} done")

    
    return dict({'mean_score':np.around(np.mean(results), 2),
                 'std':np.around(np.std(results), 2),   
                 'score_min':np.around(np.amin(results), 2),
                 'score_max':np.around(np.amax(results), 2), 
                 'n_fold':r}), np.around(np.mean(parameters, axis=0), 4)

In [20]:
class MLFlowBase():

    def __init__(self, experiment_name, MLFLOW_URI):
        self.experiment_name = experiment_name
        self.MLFLOW_URI = MLFLOW_URI

    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(self.MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client \
                .create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client \
                .get_experiment_by_name(self.experiment_name).experiment_id

    def mlflow_create_run(self):
        self.mlflow_run = self.mlflow_client \
            .create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client \
            .log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client \
            .log_metric(self.mlflow_run.info.run_id, key, value)

In [22]:
class Trainer(MLFlowBase):

    def __init__(self):
        super().__init__(
            "[FR] [Paris] [nhuberde] bitcoin project",
            "https://mlflow.lewagon.co")
            
    # def train(self, trainer_params, hyper_params):
    def train(self, trainer_params, data):

        i = 0

        # step 1 : iterate on trainer params
        for param_combination in product(*trainer_params.values()):

            exp_params = dict(zip(trainer_params.keys(), param_combination))

            # print(exp_params)

            # step 2 : iterate on models
            # for model_name, model_hparams in hyper_params.items():

                # print(f"model name {model_name}")

                # step 3 : iterate on model hyperparams
                # for hparam_combi in product(*model_hparams.values()):

                #     hexp_params = dict(zip(model_hparams.keys(), hparam_combi))

                    # print(hexp_params)

                    # mais avec quoi je train ?
            i += 1
            print(f"\nexperiment #{i}:")
            print(exp_params)
            # print(f"model name {model_name}")
            # print(hexp_params)

            # TODO: train with trainer params + model + hyperparams

            # => appeler la crossval
            # data = get_data()
            results, parameters = cross_val(data, **exp_params)


            # create a mlflow training
            self.mlflow_create_run()  # create one training

            # log trainer params
            for key, value in exp_params.items():
                self.mlflow_log_param(key, value)
                
            # then log buddy_name on mlflow
            self.mlflow_log_param("buddy_name", {buddy_name})

            # log params
            # self.mlflow_log_param("model", model_name)

            # log model hyper params
            # for key, value in hexp_params.items():
            #     self.mlflow_log_param(key, value)

            # push metrics to mlflow
            self.mlflow_log_metric("mean_score", results['mean_score'])
            self.mlflow_log_metric("std", results['std'])
            self.mlflow_log_metric("score_min", results['score_min'])
            self.mlflow_log_metric("score_max", results['score_max'])
            for p in range(parameters.shape[1]):
                self.mlflow_log_metric(f"coef_feature {p}", parameters[0,p])
            


# if __name__ == "__main__":



In [23]:
buddy_name = 'Caroline'

In [28]:
trainer_params = dict(
    model_init=[RidgeClassifier()],
    sample_size=[1440, 10080, 14400, 21600, 43200, 86400, 129600],
    train_fraction=[0.7],
    features_size=[60],
    h=[10],
    date_start=["2020"],
    date_end=["2020"],
)

trainer = Trainer()
models = trainer.train(trainer_params, data)
models


experiment #1:
{'model_init': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001), 'sample_size': 1440, 'train_fraction': 0.7, 'features_size': 60, 'h': 10, 'date_start': '2020', 'date_end': '2020'}

experiment #2:
{'model_init': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001), 'sample_size': 10080, 'train_fraction': 0.7, 'features_size': 60, 'h': 10, 'date_start': '2020', 'date_end': '2020'}

experiment #3:
{'model_init': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001), 'sample_size': 14400, 'train_fraction': 0.7, 'features_size': 60, 'h': 10, 'date_start': '2020', 'date_end'

In [35]:
def predict_score_deep(X_train, X_test, y_train, y_test):
    
    model = Sequential()
    es = EarlyStopping(patience=2, restore_best_weights=True)
    
    model.add(GRU(units = 30, activation='tanh', return_sequences = True, input_shape = (X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    # model.add(LSTM(units = 40, return_sequences = True))
    # model.add(Dropout(0.2))
    model.add(GRU(units = 10,  activation='tanh', return_sequences = True))
    model.add(Dropout(0.2))
    # Adding the output layer
    model.add(Dense(units = 5, activation='relu'))
    model.add(Dense(units = 1, activation='sigmoid'))
    # Compiling the RNN
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # Fitting the RNN to the Training set
    model.fit(X_train, y_train, validation_split=0.2, epochs = 3, batch_size = 32, callbacks=[es])

    score = model.evaluate(X_test, y_test, verbose=0)

    return score[1] #attention score[0] loss à return also

In [33]:
def deep(data, sample_size=1000, train_fraction=0.7, features_size=60, h=1, date_start=None, date_end=None):
    
    data = select_date(data, date_start, date_end)
    data_shifted = preprocessing_data(data, features_size, h)
    X, y, data_size = features_target(data_shifted, h)
    train_size = int(train_fraction*sample_size)
    test_size = sample_size - train_size
    
    r = math.floor((data_size-train_size)/test_size)
    intervals = range(0, r)
    reversed_intervals = reversed(intervals)
    results = []
    
    for i in reversed_intervals:
        X_train, X_test, y_train, y_test = input_data(X, y, sample_size, data_size, train_size, test_size, h, w=i)
  
        X_train, y_train = np.array(X_train), np.array(y_train)
        X_test, y_test = np.array(X_test), np.array(y_test)
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
            
        score = predict_score_deep(X_train, X_test, y_train, y_test)
        results.append(score)
    
    return dict({'mean_score':round(stats.mean(results),2),
                 'std':round(stats.stdev(results),2),
                 'score_min':round(min(results),2),
                 'score_max':round(max(results),2),
                 'n_fold':r})

In [47]:
class Trainer2(MLFlowBase):

    def __init__(self):
        super().__init__(
            "[FR] [Paris] [nhuberde] bitcoin project",
            "https://mlflow.lewagon.co")
            
    # def train(self, trainer_params, hyper_params):
    def train(self, trainer_params, data):

        i = 0

        # step 1 : iterate on trainer params
        for param_combination in product(*trainer_params.values()):

            exp_params = dict(zip(trainer_params.keys(), param_combination))

            # print(exp_params)

            # step 2 : iterate on models
            # for model_name, model_hparams in hyper_params.items():

                # print(f"model name {model_name}")

                # step 3 : iterate on model hyperparams
                # for hparam_combi in product(*model_hparams.values()):

                #     hexp_params = dict(zip(model_hparams.keys(), hparam_combi))

                    # print(hexp_params)

                    # mais avec quoi je train ?
            i += 1
            print(f"\nexperiment #{i}:")
            print(exp_params)
            # print(f"model name {model_name}")
            # print(hexp_params)

            # TODO: train with trainer params + model + hyperparams

            # => appeler la crossval
            # data = get_data()
            results = deep(data, **exp_params)


            # create a mlflow training
            self.mlflow_create_run()  # create one training

            # log trainer params
            for key, value in exp_params.items():
                self.mlflow_log_param(key, value)
                
            # then log buddy_name on mlflow
            self.mlflow_log_param("buddy_name", {buddy_name})
            self.mlflow_log_param("model_type", "GRU")

            # log params
            # self.mlflow_log_param("model", model_name)

            # log model hyper params
            # for key, value in hexp_params.items():
            #     self.mlflow_log_param(key, value)

            # push metrics to mlflow
            self.mlflow_log_metric("mean_score", results['mean_score'])
            self.mlflow_log_metric("std", results['std'])
            self.mlflow_log_metric("score_min", results['score_min'])
            self.mlflow_log_metric("score_max", results['score_max'])
            self.mlflow_log_metric("n_fold", results['n_fold'])
            


# if __name__ == "__main__":



In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping
import math
import matplotlib.pyplot as plt
import keras
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

In [None]:
results = deep(data, sample_size=1440, train_fraction=0.7, features_size=60, h=1, date_start="2020-10-01", date_end="2020-10-07")

In [41]:
results

{'mean_score': 0.51,
 'n_fold': 20,
 'score_max': 0.61,
 'score_min': 0.47,
 'std': 0.03}

In [46]:
trainer_params2 = dict(
    sample_size=[1440],
    train_fraction=[0.7],
    features_size=[60],
    h=[1],
    date_start=["2020-10-01"],
    date_end=["2020-10-07"],
)

trainer2 = Trainer2()
trainer2.train(trainer_params2, data)


experiment #1:
{'sample_size': 1440, 'train_fraction': 0.7, 'features_size': 60, 'h': 1, 'date_start': '2020-10-01', 'date_end': '2020-10-07'}
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [51]:
# data2 = select_date(data, date_start=None, date_end=None)

In [53]:
# data_shifted = preprocessing_data(data2, features_size=120, h=10)