In [59]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import datetime
import time

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.base import clone

from sklearn.linear_model import LinearRegression

In [60]:
model_data_df = pd.read_csv(Path('../Data/model_data.csv'))

In [61]:
azn_df = model_data_df.loc[model_data_df['symbol'].isin(['AZN'])]
pfe_df = model_data_df.loc[model_data_df['symbol'].isin(['PFE'])]
jnj_df = model_data_df.loc[model_data_df['symbol'].isin(['JNJ'])]
mrna_df = model_data_df.loc[model_data_df['symbol'].isin(['MRNA'])]

In [62]:
from sklearn.metrics import r2_score,mean_absolute_error

def calc_scorecard(y_pred,y_true):
    
    def make_df(y_pred,y_true):
        y_pred.name = 'y_pred'
        y_true.name = 'y_true'

        df = pd.concat([y_pred,y_true],axis=1).dropna()

        df['sign_pred'] = df.y_pred.apply(np.sign)
        df['sign_true'] = df.y_true.apply(np.sign)
        df['is_correct'] = 0
        df.loc[df.sign_pred * df.sign_true > 0 ,'is_correct'] = 1 # only registers 1 when prediction was made AND it was correct
        df['is_incorrect'] = 0
        df.loc[df.sign_pred * df.sign_true < 0,'is_incorrect'] = 1 # only registers 1 when prediction was made AND it was wrong
        df['is_predicted'] = df.is_correct + df.is_incorrect
        df['result'] = df.sign_pred * df.y_true 
        return df
    
    df = make_df(y_pred,y_true)
    
    scorecard = pd.Series()
    scorecard.loc['RSQ'] = r2_score(df.y_true,df.y_pred)
    scorecard.loc['MAE'] = mean_absolute_error(df.y_true,df.y_pred)
    scorecard.loc['edge'] = df.result.mean()
    scorecard.loc['noise'] = df.y_pred.diff().abs().mean()
    scorecard.loc['edge_to_noise'] = scorecard.loc['edge'] / scorecard.loc['noise']
    scorecard.loc['edge_to_mae'] = scorecard.loc['edge'] / scorecard.loc['MAE']
    return scorecard  

In [63]:
def run_model(df, algo):
    df = df.drop(['date', 'symbol'], axis=1)

    X = df.drop(columns=['close'])

    y = df['close'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    model = clone(algo)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return calc_scorecard(pd.Series(y_pred), pd.Series(y))


In [64]:
run_model(azn_df, LinearRegression())



RSQ             -0.315023
MAE              0.575875
edge             0.064612
noise            0.247892
edge_to_noise    0.260646
edge_to_mae      0.112198
dtype: float64