In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from treeinterpreter import treeinterpreter as ti
from scipy.special import cbrt
import shap

  from numpy.core.umath_tests import inner1d


In [2]:
from openpyxl import load_workbook,Workbook

# Getting data

In [3]:
# Move into the sibling "pickles" directory.
# NOTE(review): the path is relative to the current working directory, so this
# cell is not idempotent — re-running it (or running cells out of order)
# resolves '../pickles' from a different starting point.
os.chdir('../pickles')

In [4]:
# Load the pickled preprocessing artifacts from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were produced by this project's own pipeline.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


numerical = _load_pickle('numerical.pickle')
categorical = _load_pickle('categorical.pickle')
conversion_dict = _load_pickle('conversion_dict.pickle')
imputation_cols = _load_pickle('imputation_cols.pickle')
# NOTE(review): this file name has no '.pickle' suffix, unlike the others — confirm it is intentional.
model_columns = _load_pickle('model_columns')
lr_model_columns = _load_pickle('lr_model_columns.pickle')
order = _load_pickle('order.pickle')

In [5]:
# Move into the sibling "Statistics" directory.
# NOTE(review): relative to the current working directory, so the result
# depends on which chdir cells have already been executed.
os.chdir('../Statistics/')

In [7]:
# Read the "data" sheet of the predictions workbook; the first row supplies the column names.
df_val = pd.read_excel(
    'future_predictions.xlsx',
    sheet_name='data',
    header=0,
)

In [8]:
# Display the column labels of the frame that was just loaded.
df_val.columns

Index(['Symbol', 'Date', 'Series', 'Prev_Close_order5', 'High_1d_order5',
       'Low_1d_order5', 'Last_1d_order5', 'VWAP_1d_order5', 'Volume_1d_order9',
       'Turnover_1d_order11',
       ...
       'return_1d_500_cube', 'return_2d_500_sqr', 'return_2d_500_cube',
       'return_3d_500_sqr', 'return_3d_500_cube', 'Voltality_500_log',
       'RF_Prediction', 'GBM_Prediction', 'LR_Prediction', 'Ensembled_Prob'],
      dtype='object', length=223)

In [9]:
# Move into the sibling "Input_Data" directory.
# NOTE(review): relative to the current working directory, so the result
# depends on which chdir cells have already been executed.
os.chdir('../Input_Data/')

In [10]:
# Both price files are header-less CSVs; read them with identical options.
open_prices, close_prices = [
    pd.read_csv(csv_name, header=None)
    for csv_name in ('open_price.csv', 'close_price.csv')
]

In [11]:
# Give the header-less price frames their column names: two key columns plus the price.
open_prices.columns, close_prices.columns = (
    ['Symbol', 'Date', 'Open_Price'],
    ['Symbol', 'Date', 'Close_Price'],
)

In [12]:
# Index both price frames by (Symbol, Date), in place, so they can be joined on that key.
for price_frame in (open_prices, close_prices):
    price_frame.set_index(['Symbol', 'Date'], inplace=True)

In [13]:
# Key the prediction frame by (Symbol, Date) to match the price frames.
df_val = df_val.set_index(['Symbol', 'Date'])

In [14]:
# Left join on the index: every prediction row is kept and gains an Open_Price column.
df_val = df_val.join(other=open_prices, how='left')

In [15]:
# Left join on the index: every prediction row is kept and gains a Close_Price column.
df_val = df_val.join(other=close_prices, how='left')

# Reason Codes for logistic regression model

In [16]:
# Move into the sibling "Models" directory.
# NOTE(review): relative to the current working directory, so the result
# depends on which chdir cells have already been executed.
os.chdir('../Models/')

In [17]:
# Load the three pickled estimators, closing each file as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this project's own pipeline.
with open('LR.pickle', 'rb') as model_file:
    LR = pickle.load(model_file)
with open('RF.pickle', 'rb') as model_file:
    RF = pickle.load(model_file)
with open('GBM.pickle', 'rb') as model_file:
    GBM = pickle.load(model_file)

In [18]:
# Turn the (Symbol, Date) index levels back into ordinary columns.
df_val = df_val.reset_index()

In [19]:
# Display the column labels again to confirm Open_Price and Close_Price were added.
df_val.columns

Index(['Symbol', 'Date', 'Series', 'Prev_Close_order5', 'High_1d_order5',
       'Low_1d_order5', 'Last_1d_order5', 'VWAP_1d_order5', 'Volume_1d_order9',
       'Turnover_1d_order11',
       ...
       'return_2d_500_cube', 'return_3d_500_sqr', 'return_3d_500_cube',
       'Voltality_500_log', 'RF_Prediction', 'GBM_Prediction', 'LR_Prediction',
       'Ensembled_Prob', 'Open_Price', 'Close_Price'],
      dtype='object', length=225)

In [22]:
def reason_code_lr(model,data,target='lift_ind',columns=None):
    """Build a per-row reason-code table for a fitted linear classifier.

    The contribution of feature j to a row is ``model.coef_[0][j] * x_j``.
    For every row the contributions are sorted in ascending order and written
    to the ``mrc`` column as comma-separated ``feature:value`` pairs.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``, ``coef_`` and
        ``intercept_`` (presumably an sklearn LogisticRegression — confirm).
    data : pandas.DataFrame
        Must contain ``Symbol``, ``Date``, ``Open_Price``,
        ``Prev_Close_order5``, ``Close_Price``, the ``target`` column and
        every feature column.
    target : str, default 'lift_ind'
        Column of ``data`` holding the observed outcome. It is copied into the
        output column named ``lift_ind``.
    columns : sequence of str, optional
        Feature columns fed to the model, in the model's feature order.
        Defaults to the module-level ``lr_model_columns``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns ``Symbol``, ``Date``,
        ``Open Price``, ``Prev Close``, ``Close Price``, ``lift_ind``,
        ``Probability``, ``log_odds``, ``bias`` and ``mrc``.
    """
    if columns is None:
        columns = lr_model_columns
    columns = list(columns)
    features = data[columns]

    df_reasons = pd.DataFrame(index=data.index)
    df_reasons['Symbol'] = data['Symbol']
    df_reasons['Date'] = data['Date']
    df_reasons['Open Price'] = data['Open_Price']
    # NOTE(review): the 10**5 factor presumably undoes the "order5" scaling of the input column — confirm.
    df_reasons['Prev Close'] = data['Prev_Close_order5']*(10**5)
    df_reasons['Close Price'] = data['Close_Price']
    df_reasons['lift_ind'] = data[target]

    probabilities = np.asarray(model.predict_proba(features)[:,1], dtype=float)
    df_reasons['Probability'] = probabilities
    # Vectorised logit; a probability of exactly 0 or 1 yields -inf/+inf instead of raising.
    with np.errstate(divide='ignore'):
        df_reasons['log_odds'] = np.log(probabilities/(1-probabilities))
    df_reasons['bias'] = model.intercept_[0]

    # Broadcasting multiplies every row by the coefficient vector in one step.
    contributions = np.asarray(features) * np.asarray(model.coef_[0])
    # Assign positionally (a plain list) so the strings stay attached to the right
    # rows even when `data` does not carry a default 0..n-1 index.
    df_reasons['mrc'] = [
        ','.join(name+':'+str(value)
                 for (name,value) in sorted(zip(columns,row),key=lambda pair: pair[1]))
        for row in contributions.tolist()
    ]

    return df_reasons

In [23]:
# Reason codes of the logistic-regression model for every row of df_val.
df_reasons = reason_code_lr(model=LR, data=df_val)

In [24]:
# Move back into the "Statistics" directory, where future_predictions.xlsx is written below.
# NOTE(review): relative to the current working directory, so the result
# depends on which chdir cells have already been executed.
os.chdir('../Statistics/')

In [26]:
# Write the reason codes into future_predictions.xlsx as sheet "lr_reason_codes"
# while keeping the sheets already present in the workbook.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# appear to depend on older pandas behaviour — confirm against the installed version.
book = load_workbook('future_predictions.xlsx')
writer = pd.ExcelWriter('future_predictions.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {sheet.title: sheet for sheet in book.worksheets}

df_reasons.to_excel(writer, sheet_name="lr_reason_codes", index=False)

writer.save()

# Random Forest Reason Codes

In [29]:
def reason_code_rf(model,data,target='lift_ind',columns=None):
    """Build a per-row reason-code table for a tree-ensemble classifier.

    ``treeinterpreter`` (``ti.predict``) supplies a bias term and per-feature
    contributions; the slice at position 1 of the last axis is used for both.
    For every row the contributions are sorted in ascending order and written
    to the ``mrc`` column as comma-separated ``feature:value`` pairs.

    Parameters
    ----------
    model : object
        Fitted estimator accepted by ``ti.predict`` and exposing
        ``predict_proba`` (presumably an sklearn RandomForestClassifier — confirm).
    data : pandas.DataFrame
        Must contain ``Symbol``, ``Date``, ``Open_Price``,
        ``Prev_Close_order5``, ``Close_Price``, the ``target`` column and
        every feature column.
    target : str, default 'lift_ind'
        Column of ``data`` holding the observed outcome. It is copied into the
        output column named ``lift_ind``.
    columns : sequence of str, optional
        Feature columns fed to the model, in the model's feature order.
        Defaults to the module-level ``lr_model_columns``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns ``Symbol``, ``Date``,
        ``Open Price``, ``Prev Close``, ``Close Price``, ``lift_ind``,
        ``Probability``, ``bias`` and ``mrc``.
    """
    if columns is None:
        columns = lr_model_columns
    columns = list(columns)
    features = data[columns]

    # The first return value (the prediction) is not needed here.
    _, bias, contributions = ti.predict(model,np.array(features))

    df_reasons = pd.DataFrame(index=data.index)
    df_reasons['Symbol'] = data['Symbol']
    df_reasons['Date'] = data['Date']
    df_reasons['Open Price'] = data['Open_Price']
    # NOTE(review): the 10**5 factor presumably undoes the "order5" scaling of the input column — confirm.
    df_reasons['Prev Close'] = data['Prev_Close_order5']*(10**5)
    df_reasons['Close Price'] = data['Close_Price']
    df_reasons['lift_ind'] = data[target]
    df_reasons['Probability'] = model.predict_proba(features)[:,1]
    df_reasons['bias'] = np.asarray(bias)[:,1]

    positive_class = np.asarray(contributions)[:,:,1]
    # Assign positionally (a plain list) so the strings stay attached to the right
    # rows even when `data` does not carry a default 0..n-1 index.
    df_reasons['mrc'] = [
        ','.join(name+':'+str(value)
                 for (name,value) in sorted(zip(columns,row),key=lambda pair: pair[1]))
        for row in positive_class.tolist()
    ]

    return df_reasons

In [30]:
# Reason codes of the random-forest model for every row of df_val.
df_reasons = reason_code_rf(model=RF, data=df_val)

In [32]:
# Write the reason codes into future_predictions.xlsx as sheet "rf_reason_codes"
# while keeping the sheets already present in the workbook.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# appear to depend on older pandas behaviour — confirm against the installed version.
book = load_workbook('future_predictions.xlsx')
writer = pd.ExcelWriter('future_predictions.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {sheet.title: sheet for sheet in book.worksheets}

df_reasons.to_excel(writer, sheet_name="rf_reason_codes", index=False)

writer.save()

# Reason Codes for GBM model

In [35]:
def reason_code_gbm(model,data,target='lift_ind',columns=None):
    """Build a per-row reason-code table for a gradient-boosting classifier using SHAP.

    ``shap.TreeExplainer`` supplies the base value (``expected_value``) and the
    per-feature SHAP values. For every row the SHAP values are sorted in
    ascending order and written to the ``mrc`` column as comma-separated
    ``feature:value`` pairs.

    The same feature list is used for the probability, for the SHAP values and
    for the labels, so a value can never be paired with the wrong feature name.
    NOTE(review): the previous version explained ``data[model_columns]`` while
    predicting on and labelling with ``lr_model_columns``; confirm that
    ``lr_model_columns`` is the list this model was trained on, or pass
    ``columns`` explicitly.

    Parameters
    ----------
    model : object
        Fitted estimator accepted by ``shap.TreeExplainer`` and exposing
        ``predict_proba``.
    data : pandas.DataFrame
        Must contain ``Symbol``, ``Date``, ``Open_Price``,
        ``Prev_Close_order5``, ``Close_Price``, the ``target`` column and
        every feature column.
    target : str, default 'lift_ind'
        Column of ``data`` holding the observed outcome. It is copied into the
        output column named ``lift_ind``.
    columns : sequence of str, optional
        Feature columns fed to the model, in the model's feature order.
        Defaults to the module-level ``lr_model_columns``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns ``Symbol``, ``Date``,
        ``Open Price``, ``Prev Close``, ``Close Price``, ``lift_ind``,
        ``Probability``, ``bias`` and ``mrc``.
    """
    if columns is None:
        columns = lr_model_columns
    columns = list(columns)
    features = data[columns]

    df_reasons = pd.DataFrame(index=data.index)
    df_reasons['Symbol'] = data['Symbol']
    df_reasons['Date'] = data['Date']
    df_reasons['Open Price'] = data['Open_Price']
    # NOTE(review): the 10**5 factor presumably undoes the "order5" scaling of the input column — confirm.
    df_reasons['Prev Close'] = data['Prev_Close_order5']*(10**5)
    df_reasons['Close Price'] = data['Close_Price']
    df_reasons['lift_ind'] = data[target]
    df_reasons['Probability'] = model.predict_proba(features)[:,1]

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(features)
    # Some shap versions/models return one array per class; keep the last one in that case.
    if isinstance(shap_values, list):
        shap_values = shap_values[-1]
    # expected_value may be a scalar or a short array depending on the shap version;
    # reduce it to a single number so it broadcasts over all rows.
    df_reasons['bias'] = np.atleast_1d(explainer.expected_value)[-1]

    # Assign positionally (a plain list) so the strings stay attached to the right
    # rows even when `data` does not carry a default 0..n-1 index.
    df_reasons['mrc'] = [
        ','.join(name+':'+str(value)
                 for (name,value) in sorted(zip(columns,row),key=lambda pair: pair[1]))
        for row in np.asarray(shap_values).tolist()
    ]

    return df_reasons

In [36]:
# Reason codes of the gradient-boosting model for every row of df_val.
df_reasons = reason_code_gbm(model=GBM, data=df_val)

In [37]:
# Display the GBM reason-code table.
# NOTE(review): this renders the whole frame; consider df_reasons.head() to keep the saved output small.
df_reasons

Unnamed: 0,Symbol,Date,Open Price,Prev Close,Close Price,lift_ind,Probability,bias,mrc
0,ADANIPORTS,2019-01-01,386.30,387.70,389.00,1,0.491170,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
1,ADANIPORTS,2019-01-02,388.00,389.00,381.30,0,0.478009,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
2,ADANIPORTS,2019-01-03,382.40,381.30,378.10,0,0.453284,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
3,ADANIPORTS,2019-01-04,378.65,378.10,382.15,1,0.550976,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
4,ADANIPORTS,2019-01-07,385.00,382.15,381.65,0,0.419709,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
5,ADANIPORTS,2019-01-08,383.00,381.65,378.75,0,0.503819,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
6,ADANIPORTS,2019-01-09,382.00,378.75,377.30,0,0.390855,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
7,ADANIPORTS,2019-01-10,377.95,377.30,380.25,1,0.391960,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
8,ADANIPORTS,2019-01-11,380.30,380.25,377.15,0,0.343554,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."
9,ADANIPORTS,2019-01-14,377.00,377.15,374.45,0,0.451165,-0.118757,"return_3d_500_sqr:-0.07325468036341941,return_..."


In [38]:
# Write the reason codes into future_predictions.xlsx as sheet "gbm_reason_codes"
# while keeping the sheets already present in the workbook.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# appear to depend on older pandas behaviour — confirm against the installed version.
book = load_workbook('future_predictions.xlsx')
writer = pd.ExcelWriter('future_predictions.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {sheet.title: sheet for sheet in book.worksheets}

df_reasons.to_excel(writer, sheet_name="gbm_reason_codes", index=False)

writer.save()

# The End