In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
from sklearn.metrics import r2_score,mean_squared_error,precision_score

In [3]:
from scipy.special import cbrt

In [20]:
# The relative pickle file names opened in the next cell are resolved against
# this directory. The path is relative to the current working directory.
os.chdir('../pickles')

In [21]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes, instead of being left for the garbage collector.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artefacts consumed by the cells below; file names are relative to the
# current working directory.
numerical = _load_pickle('numerical.pickle')
categorical = _load_pickle('categorical.pickle')
conversion_dict = _load_pickle('conversion_dict.pickle')
imputation_cols = _load_pickle('imputation_cols.pickle')
# NOTE(review): unlike the other files this name has no '.pickle' suffix -- confirm.
model_columns = _load_pickle('model_columns')
lr_model_columns = _load_pickle('lr_model_columns.pickle')
order = _load_pickle('order.pickle')
pca = _load_pickle('pca.pickle')
scale = _load_pickle('scale.pickle')

In [6]:
# 'validation.csv' (read in the next cell) is resolved against this directory.
os.chdir('../Input_Data')

In [7]:
# Load the validation set; header=0 takes the column names from the first row.
df = pd.read_csv('validation.csv',header=0)

In [8]:
# Working copy without the three identifier columns; `df` itself is left untouched.
df_val = df.drop(columns=['Symbol','Series','Date'])

In [9]:
# Map every column name that contains '%' to the same name with '%' replaced by 'pct_'.
rename_dict = {}
for col in df_val.columns:
    if '%' in col:
        rename_dict[col] = col.replace('%','pct_')

In [10]:
# Apply the column-name mapping built in the previous cell, in place.
df_val.rename(columns=rename_dict, inplace=True)

In [11]:
# Divide every column listed in `order` by 10**order[col] and record a new
# column name that carries the scale as a suffix: "<col>(in 10^<n>)".
rename_dict = {}
for col, exponent in order.items():
    df_val[col] = df_val[col] / 10**exponent
    magnitude = "10^{}".format(exponent)
    rename_dict[col] = col + "(in {})".format(magnitude)

In [12]:
# Apply the column-name mapping built in the previous cell, in place.
df_val.rename(columns=rename_dict, inplace=True)

In [13]:
# Replace every categorical column by '<col>_dum_*' indicator columns built
# from the stored conversion table, then drop the original column.
for col in categorical:
    conversions = conversion_dict[col]
    total_categories = len(conversions)
    if total_categories == 1:
        # Single stored category: one 0/1 flag column named after it.
        category = conversions[0]
        flag_col = col + '_dum_' + str(category)
        df_val[flag_col] = 0
        df_val.loc[df_val[col] == category, flag_col] = 1
    else:
        # Number of dummy columns = number of binary digits of the category count.
        dummies = len(format(total_categories, 'b'))
        # Each conversion entry is indexed as (category, code).
        # NOTE(review): the code looks like a binary number written with decimal
        # digits -- confirm against the notebook that built conversion_dict.
        labels = [entry[0] for entry in conversions]
        remaining_codes = [entry[1] for entry in conversions]
        for digit in range(dummies):
            dummy_col = col + '_dum_' + str(digit)
            df_val[dummy_col] = 0
            for position, label in enumerate(labels):
                # Peel off the least-significant decimal digit of the code for this column.
                df_val.loc[df_val[col] == label, dummy_col] = remaining_codes[position] % 10
                remaining_codes[position] = remaining_codes[position] // 10
    df_val.drop(col, axis=1, inplace=True)
    print(col+' done')

Sector done


In [14]:
# 'report.xlsx' (read in the next cell) is resolved against this directory.
os.chdir('../Statistics/')

In [15]:
# Sheet 'edd_v01' of the report; its 'Var' and 'conversions' columns drive the
# transformation loop further down.
edd_df = pd.read_excel('report.xlsx',sheet_name='edd_v01',header=0)

In [16]:
def _log(x):
    """Natural logarithm."""
    return np.log(x)


def _sqr(x):
    """Square."""
    return x**2


def _sqrt(x):
    """Square root."""
    return np.sqrt(x)


def _exp(x):
    """Exponential."""
    return np.exp(x)


def _cube(x):
    """Cube."""
    return x**3


def _cuberoot(x):
    """Cube root."""
    return cbrt(x)


# Transformation name -> element-wise function.
transform_dict = {
    'log': _log,
    'sqr': _sqr,
    'sqrt': _sqrt,
    'exp': _exp,
    'cube': _cube,
    'cuberoot': _cuberoot,
}

In [17]:
# Add the derived columns listed in the EDD sheet: a variable whose
# 'conversions' cell is a comma-separated list such as "log,sqrt" gets one
# new column per entry, named '<col>_<transformation>'.
for col in list(edd_df['Var']):
    applied = edd_df.loc[edd_df['Var']==col,'conversions'].values[0]
    # Cells that are not text (e.g. NaN) carry no transformation list. Before,
    # these were skipped only because .split() raised inside a bare except.
    if not isinstance(applied, str) or applied in ('', 'categorical'):
        continue
    try:
        for t in applied.split(','):
            df_val[col+'_'+t] = df_val[col].apply(transform_dict[t])
    except Exception as exc:
        # Still best-effort (the remaining transformations of this variable are
        # skipped, as before), but the failure is now reported instead of hidden.
        print('skipped transformations for {!r}: {!r}'.format(col, exc))

In [18]:
# The '<col>_impute.pickle' files opened in the next cell are resolved against this directory.
os.chdir('../Imputation_models')

In [19]:
# Fill missing values of every numerical column except the target 'Close'
# with the predictions of that column's stored imputation model.
for col in [x for x in numerical if x!='Close']:
    missing = df_val[col].isnull()
    if missing.any():
        # Unpickle only when there is something to impute, and close the file
        # handle deterministically.
        # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
        with open(col+'_impute.pickle','rb') as model_file:
            model = pickle.load(model_file)
        # The model is fed the `imputation_cols` values of the affected rows.
        predictors = np.array(df_val.loc[missing,imputation_cols])
        df_val.loc[missing,col] = np.array(model.predict(predictors))
        del model, predictors
    # Printed for every column, whether or not any value had to be filled.
    print(col+' imputed')

Prev_Close(in 10^5) imputed
High_1d(in 10^5) imputed
Low_1d(in 10^5) imputed
Last_1d(in 10^5) imputed
VWAP_1d(in 10^5) imputed
Volume_1d(in 10^9) imputed
Turnover_1d(in 10^11) imputed
Trades_1d(in 10^7) imputed
Deliverable_Volume_1d(in 10^9) imputed
pct_Deliverble_1d imputed
Open_nifty_1d(in 10^5) imputed
High_nifty_1d(in 10^5) imputed
Low_nifty_1d(in 10^5) imputed
Close_nifty_1d(in 10^5) imputed
Volume_nifty_1d(in 10^9) imputed
Turnover_nifty_1d(in 10^11) imputed
Open_auto_1d(in 10^5) imputed
High_auto_1d(in 10^5) imputed
Low_auto_1d(in 10^5) imputed
Close_auto_1d(in 10^5) imputed
Open_bank_1d(in 10^5) imputed
High_bank_1d(in 10^5) imputed
Low_bank_1d(in 10^5) imputed
Close_bank_1d(in 10^5) imputed
Open_fmcg_1d(in 10^5) imputed
High_fmcg_1d(in 10^5) imputed
Low_fmcg_1d(in 10^5) imputed
Close_fmcg_1d(in 10^5) imputed
Open_it_1d(in 10^5) imputed
High_it_1d(in 10^5) imputed
Low_it_1d(in 10^5) imputed
Close_it_1d(in 10^5) imputed
Open_media_1d(in 10^4) imputed
High_media_1d(in 10^4) imput

In [22]:
# Feature matrices for the three model families, each paired with its own
# copy of the 'Close' target:
#   x_val -> model_columns
#   x     -> lr_model_columns
#   x_pca -> model_columns, passed through scale.transform and then pca.transform
full_features = df_val[model_columns]

x_val = np.array(full_features)
y_val = np.array(df_val['Close'])

x = np.array(df_val[lr_model_columns])
y = np.array(df_val['Close'])

x_pca = pca.transform(scale.transform(full_features))
y_pca = np.array(df_val['Close'])

In [23]:
# The model pickles opened in the next cell are resolved against this directory.
os.chdir('../Models')

In [25]:
# Fitted models keyed by name; each is stored in '<name>.pickle'.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being leaked.
# SECURITY: unpickling can execute arbitrary code -- only load trusted files.
models = {}
for model_name in ['LR', 'RF', 'GBM', 'RF_full', 'GBM_full', 'RF_pca', 'GBM_pca']:
    with open(model_name + '.pickle', 'rb') as model_file:
        models[model_name] = pickle.load(model_file)

  from numpy.core.umath_tests import inner1d


In [26]:
# Score the validation set with every model and store the result in
# '<name>_Prediction'. One table-driven loop replaces seven copy-pasted cells;
# the order is kept so the new columns appear in the same sequence as before.
prediction_inputs = [
    ('RF', x),
    ('GBM', x),
    ('LR', x),
    ('RF_full', x_val),
    ('GBM_full', x_val),
    ('RF_pca', x_pca),
    ('GBM_pca', x_pca),
]
for model_name, features in prediction_inputs:
    df_val[model_name + '_Prediction'] = models[model_name].predict(features)

In [33]:
# R^2 of every model's prediction against the observed 'Close', in the order
# LR, RF, GBM, RF_full, GBM_full, RF_pca, GBM_pca.
scores = [
    r2_score(df_val['Close'], df_val[model_name + '_Prediction'])
    for model_name in ['LR', 'RF', 'GBM', 'RF_full', 'GBM_full', 'RF_pca', 'GBM_pca']
]

In [34]:
# Score table: one row per model, in the same order as `scores`.
df_scores = pd.DataFrame({
    'models': ['LR','RF','GBM','RF_full','GBM_full','RF_pca','GBM_pca'],
    'r2_score': scores,
})

In [35]:
df_scores

Unnamed: 0,models,r2_score
0,LR,0.999359
1,RF,0.986922
2,GBM,0.965342
3,RF_full,0.986851
4,GBM_full,0.988646
5,RF_pca,0.891839
6,GBM_pca,0.974851


In [36]:
# 'open_price.csv' (read in the next cell) is resolved against this directory.
os.chdir('../Input_Data/')

In [37]:
# header=None: every row of the file is read as data; the column names are
# assigned in the next cell.
open_prices = pd.read_csv('open_price.csv',header=None)

In [38]:
# Name the three columns of the header-less open-price file.
open_prices.columns = ['Symbol','Date','Open']

In [39]:
# Bring back the identifier columns that were dropped when df_val was created
# from df; they are the merge keys used in the next cell.
df_val[['Symbol','Date']] = df[['Symbol','Date']]

In [40]:
# Attach the opening price to each validation row by matching Symbol and Date.
# No `how` is passed, so pandas' default join type applies.
df_final = df_val.merge(open_prices, on=['Symbol','Date'])

In [41]:
def loss_precision(model):
    """Evaluate the buy signal of one model on the global ``df_final``.

    A row is flagged as a buy when ``<model>_Prediction`` is above the previous
    close (the 'Prev_Close(in 10^5)' column multiplied by 10**5). A row counts
    as an actual rise when 'Close' is above 'Open'.

    Parameters
    ----------
    model : str
        Prefix of the prediction column, e.g. ``'RF'`` for ``'RF_Prediction'``.

    Returns
    -------
    tuple
        ``(precision, lps, pps)`` where ``precision`` is the precision of the
        buy flag against the actual rise, ``lps`` is the total of
        ``-(Close - Open)`` over flagged rows that did not rise, and ``pps`` is
        the total of ``Close - Open`` over flagged rows that did rise.
    """
    predicted_gain = df_final[model+'_Prediction'] - df_final['Prev_Close(in 10^5)']*(10**5)
    realised_gain = df_final['Close'] - df_final['Open']

    # 1/0 indicators; any comparison with NaN is False and therefore becomes 0.
    buy_flag = (predicted_gain > 0).astype(int)
    rose = (realised_gain > 0).astype(int)
    precision = precision_score(y_true=rose, y_pred=buy_flag)

    lps, pps = 0, 0
    for flagged, went_up, gain in zip(buy_flag.tolist(), rose.tolist(), realised_gain.tolist()):
        if flagged != 1:
            continue
        if went_up == 1:
            pps += gain
        else:
            lps -= gain
    return precision, lps, pps

In [42]:
# loss_precision returns (precision, lps, pps). Evaluate each model once and
# unpack the tuple into three columns, instead of re-running the whole
# evaluation separately for every column.
model_metrics = [loss_precision(model_name) for model_name in df_scores['models']]
df_scores['precision'] = [metrics[0] for metrics in model_metrics]
df_scores['lps'] = [metrics[1] for metrics in model_metrics]
df_scores['pps'] = [metrics[2] for metrics in model_metrics]

In [45]:
df_scores

Unnamed: 0,models,r2_score,precision,lps,pps
0,LR,0.999359,0.479348,22895.55,17958.1
1,RF,0.986922,0.468651,9421.25,8135.9
2,GBM,0.965342,0.480027,23100.15,18163.45
3,RF_full,0.986851,0.473616,9459.4,8716.3
4,GBM_full,0.988646,0.469414,22667.8,18385.9
5,RF_pca,0.891839,0.462604,3460.65,2730.7
6,GBM_pca,0.974851,0.44788,19965.85,13503.45


In [46]:
# Weighted average of the seven model predictions.
# NOTE(review): the weights appear to be each model's precision from the score
# table rounded to two decimals -- confirm before re-using on new data.
ensemble_weights = [
    ('LR', .48),
    ('RF', .47),
    ('GBM', .48),
    ('RF_full', .47),
    ('GBM_full', .47),
    ('RF_pca', .46),
    ('GBM_pca', .45),
]
weighted_sum = None
for model_name, weight in ensemble_weights:
    contribution = df_final[model_name + '_Prediction'] * weight
    # Accumulate left to right so the floating-point result is unchanged.
    weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
df_final['ensemble_prediction'] = weighted_sum / (.48*2+.47*3+.46+.45)

In [47]:
# Same buy-signal evaluation as loss_precision, applied to the ensemble column.
# A row is a predicted buy when the ensemble is above the previous close, and
# an actual rise when Close is above Open.
expected_profits = df_final['ensemble_prediction'] - df_final['Prev_Close(in 10^5)']*(10**5)
actual_profits = df_final['Close'] - df_final['Open']

# 1/0 indicators; any comparison with NaN is False and therefore becomes 0.
expected_lift = (expected_profits > 0).astype(int)
actual_lift = (actual_profits > 0).astype(int)
precision = precision_score(y_true=actual_lift, y_pred=expected_lift)

# lps: losses taken on predicted buys that did not rise (sign flipped);
# pps: gains on predicted buys that did rise.
lps, pps = 0, 0
for predicted_up, went_up, profit in zip(expected_lift.tolist(), actual_lift.tolist(), actual_profits.tolist()):
    if predicted_up != 1:
        continue
    if went_up == 1:
        pps += profit
    else:
        lps -= profit

In [48]:
precision,lps,pps

(0.475103734439834, 10822.949999999999, 9154.399999999992)

In [49]:
# Net result per model: pps minus lps.
df_scores['P/L'] = df_scores['pps'].sub(df_scores['lps'])

In [50]:
df_scores

Unnamed: 0,models,r2_score,precision,lps,pps,P/L
0,LR,0.999359,0.479348,22895.55,17958.1,-4937.45
1,RF,0.986922,0.468651,9421.25,8135.9,-1285.35
2,GBM,0.965342,0.480027,23100.15,18163.45,-4936.7
3,RF_full,0.986851,0.473616,9459.4,8716.3,-743.1
4,GBM_full,0.988646,0.469414,22667.8,18385.9,-4281.9
5,RF_pca,0.891839,0.462604,3460.65,2730.7,-729.95
6,GBM_pca,0.974851,0.44788,19965.85,13503.45,-6462.4


In [51]:
# Ensemble row in the column order of df_scores: models, r2_score, precision,
# lps, pps, P/L. precision/lps/pps are the values from the ensemble evaluation cell.
append_list = ['Ensembled',r2_score(df_final['Close'],df_final['ensemble_prediction']),precision,lps,pps,pps-lps]

In [53]:
# Single-row frame with the same columns as df_scores so it can be added to that table.
df_append = pd.DataFrame([append_list],columns=df_scores.columns)

In [55]:
# Add the ensemble row to the score table. pd.concat is used because
# DataFrame.append was removed in pandas 2.0; the result is the same.
# Re-running this cell adds the row again.
df_scores = pd.concat([df_scores, df_append], ignore_index=True)

In [56]:
df_scores

Unnamed: 0,models,r2_score,precision,lps,pps,P/L
0,LR,0.999359,0.479348,22895.55,17958.1,-4937.45
1,RF,0.986922,0.468651,9421.25,8135.9,-1285.35
2,GBM,0.965342,0.480027,23100.15,18163.45,-4936.7
3,RF_full,0.986851,0.473616,9459.4,8716.3,-743.1
4,GBM_full,0.988646,0.469414,22667.8,18385.9,-4281.9
5,RF_pca,0.891839,0.462604,3460.65,2730.7,-729.95
6,GBM_pca,0.974851,0.44788,19965.85,13503.45,-6462.4
7,Ensembled,0.993337,0.475104,10822.95,9154.4,-1668.55


In [57]:
from openpyxl import load_workbook,Workbook

In [58]:
# 'report.xlsx' (updated by the following cells) is resolved against this directory.
os.chdir('../Statistics/')

In [59]:
# Feature importances of the 'RF' model, paired with lr_model_columns.
importance_rows = list(zip(lr_model_columns, list(models['RF'].feature_importances_)))
rf_importance = pd.DataFrame(importance_rows, columns=['Var','Importance'])

# Hand the already-loaded workbook and its sheets to the writer before adding the sheet.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

rf_importance.to_excel(writer, sheet_name="rf_feature_importance", index=False)

writer.save()

In [60]:
# Feature importances of the 'GBM' model, paired with lr_model_columns.
importance_rows = list(zip(lr_model_columns, list(models['GBM'].feature_importances_)))
gbm_importance = pd.DataFrame(importance_rows, columns=['Var','Importance'])

# Hand the already-loaded workbook and its sheets to the writer before adding the sheet.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

gbm_importance.to_excel(writer, sheet_name="gbm_feature_importance", index=False)

writer.save()

In [61]:
# Coefficients of the 'LR' model: the intercept comes first (labelled 1 in
# 'Var'), followed by one row per column in lr_model_columns.
coef_labels = [1] + list(lr_model_columns)
coef_values = [models['LR'].intercept_] + list(models['LR'].coef_)
linear_importance = pd.DataFrame(list(zip(coef_labels, coef_values)), columns=['Var','coeff'])

# Hand the already-loaded workbook and its sheets to the writer before adding the sheet.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

linear_importance.to_excel(writer, sheet_name="linear_model_coef", index=False)

writer.save()

In [62]:
# Feature importances of the 'RF_full' model, paired with model_columns.
# This rebinds `rf_importance`, which previously held the 'RF' table.
importance_rows = list(zip(model_columns, list(models['RF_full'].feature_importances_)))
rf_importance = pd.DataFrame(importance_rows, columns=['Var','Importance'])

# Hand the already-loaded workbook and its sheets to the writer before adding the sheet.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

rf_importance.to_excel(writer, sheet_name="rf_full_feature_importance", index=False)

writer.save()

In [63]:
# Feature importances of the 'GBM_full' model, paired with model_columns.
# This rebinds `gbm_importance`, which previously held the 'GBM' table.
importance_rows = list(zip(model_columns, list(models['GBM_full'].feature_importances_)))
gbm_importance = pd.DataFrame(importance_rows, columns=['Var','Importance'])

# Hand the already-loaded workbook and its sheets to the writer before adding the sheet.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

gbm_importance.to_excel(writer, sheet_name="gbm_full_feature_importance", index=False)

writer.save()

In [64]:
# Write the final score table (including the ensemble row) to the report.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
# only works on older pandas releases -- confirm the pandas version in use.
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

df_scores.to_excel(writer, sheet_name="model_scores", index=False)

writer.save()

In [65]:
# 'validation_predicted.csv' (written below) is resolved against this directory.
os.chdir('../output_data/')

In [66]:
df_final.columns

Index(['Prev_Close(in 10^5)', 'High_1d(in 10^5)', 'Low_1d(in 10^5)',
       'Last_1d(in 10^5)', 'Close', 'VWAP_1d(in 10^5)', 'Volume_1d(in 10^9)',
       'Turnover_1d(in 10^11)', 'Trades_1d(in 10^7)',
       'Deliverable_Volume_1d(in 10^9)',
       ...
       'GBM_Prediction', 'LR_Prediction', 'RF_full_Prediction',
       'GBM_full_Prediction', 'RF_pca_Prediction', 'GBM_pca_Prediction',
       'Symbol', 'Date', 'Open', 'ensemble_prediction'],
      dtype='object', length=223)

In [67]:
# Save the merged validation frame (features, per-model predictions, Symbol,
# Date, Open and the ensemble prediction) without the index column.
df_final.to_csv('validation_predicted.csv',index=False)

In [68]:
# 'Q1_predictions.xlsx' (written in the next cell) is resolved against this directory.
os.chdir('../Statistics/')

In [69]:
# Same frame as the CSV export, written to sheet 'data' of a separate workbook.
df_final.to_excel('Q1_predictions.xlsx',sheet_name='data',index=False)