In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
from sklearn.metrics import r2_score,mean_squared_error,precision_score,recall_score,f1_score,accuracy_score

In [3]:
from scipy.special import cbrt

In [30]:
os.chdir('../pickles')

In [31]:
numerical = pickle.load(open('numerical.pickle','rb'))
categorical = pickle.load(open('categorical.pickle','rb'))
conversion_dict = pickle.load(open('conversion_dict.pickle','rb'))
imputation_cols = pickle.load(open('imputation_cols.pickle','rb'))
model_columns = pickle.load(open('model_columns.pickle','rb'))
lr_model_columns = pickle.load(open('lr_model_columns.pickle','rb'))
order = pickle.load(open('order.pickle','rb'))
pca = pickle.load(open('pca.pickle','rb'))
scale = pickle.load(open('scale.pickle','rb'))

In [32]:
os.chdir('../Input_Data')

In [33]:
df = pd.read_csv('validation.csv',header=0)

In [34]:
rename_dict = {col : col.replace('%','pct_') for col in df.columns if '%' in col}

In [35]:
df_val = df

In [36]:
df_val.rename(rename_dict,axis=1,inplace=True)

In [37]:
df_val['lift'] = df_val['actual_return'].apply(lambda x: 1 if x >0 else 0) 

In [38]:
for col in categorical:
    for category in conversion_dict[col]:
        df_val[col+'_dum_'+str(category)] = 0
        df_val.loc[df_val[col]==category,col+'_dum_'+str(category)]=1
    print(col+' done')

Sector done


In [39]:
rename_dict = {col : col.replace(' ','_').replace('-','_').replace('&','_') for col in df.columns if '-' in col or ' ' in col}

In [40]:
df_val.rename(rename_dict,axis=1,inplace=True)

In [41]:
for col in model_columns:
    df_val[col] = df_val[col].apply(lambda x : x if np.abs(x) <= 1 else np.sign(x)*1)

In [43]:
x,y = np.array(df_val[model_columns]),np.array(df_val['actual_return'])

In [44]:
os.chdir('../Models/')

In [45]:
model = pickle.load(open('LR_wr.pickle','rb'))

In [46]:
y_pred = model.predict(x)

In [47]:
r2_score(y,y_pred)

-0.04315661946650362

In [48]:
df_val['lift_predicted'] = [max(0,np.sign(x)) for x in y_pred]

In [49]:
accuracy_score(y_true=df_val['lift'],y_pred=df_val['lift_predicted'])

0.4996774193548387

In [50]:
precision_score(y_true=df_val['lift'],y_pred=df_val['lift_predicted'])

0.43034825870646765

In [51]:
recall_score(y_true=df_val['lift'],y_pred=df_val['lift_predicted'])

0.2404447533009034

In [52]:
df_val['month'] = df_val['Date'].apply(lambda x : str(x)[5:7])
df_val['year'] = df_val['Date'].apply(lambda x: str(x)[0:4])

In [53]:
df_val['correct_prediction'] = df_val[['lift','lift_predicted']].apply(lambda x: 1 if int(x[0]) == int(x[1]) else 0,axis=1)

In [54]:
df_val['precise_prediction'] = df_val[['lift','lift_predicted']].apply(lambda x: 1 if int(x[0]) == 1 and int(x[1]) == 1 else 0,axis=1)

In [55]:
df_lr_grp = df_val.groupby(['month','year','Symbol','Sector']).agg({'lift':['count','sum'],'lift_predicted':['sum'],
                                                          'correct_prediction':['sum'],'precise_prediction':['sum']})

In [56]:
df_lr_grp.reset_index(inplace=True)

In [57]:
df_lr_grp.columns = ['month','year','Symbol','Sector','trade_days','price_lift','pred_price_lift','correct_predcitions','precise_predictions']

In [58]:
df_lr_grp

Unnamed: 0,month,year,Symbol,Sector,trade_days,price_lift,pred_price_lift,correct_predcitions,precise_predictions
0,01,2019,ADANIPORTS,Services - Shipping,23,8,7.0,12,2
1,01,2019,ASIANPAINT,Consumer Goods,23,13,7.0,9,3
2,01,2019,AXISBANK,Banking,23,14,7.0,12,5
3,01,2019,BAJAJ-AUTO,Automobile,23,10,6.0,13,3
4,01,2019,BAJAJFINSV,Financial Services,23,8,8.0,13,3
5,01,2019,BAJFINANCE,Financial Services,23,8,8.0,13,3
6,01,2019,BHARTIARTL,Telecommunication,23,12,8.0,7,2
7,01,2019,BPCL,Energy - Oil & Gas,23,13,4.0,12,3
8,01,2019,BRITANNIA,Consumer Goods,23,10,8.0,15,5
9,01,2019,CIPLA,Pharmaceuticals,23,8,6.0,9,0


In [59]:
os.chdir('../Statistics/')

In [60]:
from openpyxl import Workbook,load_workbook

In [61]:
wb = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx',engine='openpyxl')
writer.book = wb
df_lr_grp.to_excel(writer,sheet_name='lr_price_stats_val_wr',index=False)
writer.save()
writer.close()

In [13]:
# for col in categorical:
#     if len(conversion_dict[col])==1:
#         category = conversion_dict[col][0]
#         df_val[col+'_dum_'+str(category)] = 0
#         df_val.loc[df_val[col]==category,col+'_dum_'+str(category)]=1
#     else:
#         total_categories = len(conversion_dict[col])
#         dummies = len(str(int(bin(total_categories)[2:],10)))
#         bin_conv=[]
#         for i in range(total_categories):
#             bin_conv.append(conversion_dict[col][i][1])
#         for j in range(dummies):
#             df_val[col+'_dum_'+str(j)]=0
#             for i,cat in enumerate([conv[0] for conv in conversion_dict[col]]):
#                 df_val.loc[df_val[col]==cat,col+'_dum_'+str(j)]=bin_conv[i]%10
#                 bin_conv[i]=bin_conv[i]//10
#     df_val.drop(col,axis=1,inplace=True)
#     print(col+' done')

Sector done


In [24]:
os.chdir('../Statistics/')

In [15]:
edd_df = pd.read_excel('report.xlsx',sheet_name='edd_v01',header=0)

In [16]:
transform_dict = {'log':lambda x: np.log(x),'sqr':lambda x: x**2,'sqrt':lambda x: np.sqrt(x),'exp':lambda x:np.exp(x),
                 'cube':lambda x: x**3,'cuberoot': lambda x: cbrt(x)}

In [17]:
for col in list(edd_df['Var']):
    applied = edd_df.loc[edd_df['Var']==col,'conversions'].values[0]
    if applied != '' and applied != 'categorical':
        try:
            for t in applied.split(','):
                df_val[col+'_'+t] = df_val[col].apply(transform_dict[t])
        except:
            pass

In [18]:
os.chdir('../Imputation_models')

In [19]:
for col in [x for x in numerical if x!='actual_return']:
    model = pickle.load(open(col+'_impute.pickle','rb'))
    if df_val[col].isnull().any():
        indices = df_val.loc[df_val[col].isnull()].index.tolist()
        df_val.loc[indices,col]=np.array(model.predict(np.array(df_val.loc[indices,imputation_cols])))
        del indices
    del model
    print(col+' imputed')

Prev_Close(in 10^5) imputed
High_1d(in 10^5) imputed
Low_1d(in 10^5) imputed
Last_1d(in 10^5) imputed
VWAP_1d(in 10^5) imputed
Volume_1d(in 10^9) imputed
Turnover_1d(in 10^11) imputed
Trades_1d(in 10^7) imputed
Deliverable_Volume_1d(in 10^9) imputed
pct_Deliverble_1d imputed
Open_nifty_1d(in 10^5) imputed
High_nifty_1d(in 10^5) imputed
Low_nifty_1d(in 10^5) imputed
Close_nifty_1d(in 10^5) imputed
Volume_nifty_1d(in 10^9) imputed
Turnover_nifty_1d(in 10^11) imputed
Open_auto_1d(in 10^5) imputed
High_auto_1d(in 10^5) imputed
Low_auto_1d(in 10^5) imputed
Close_auto_1d(in 10^5) imputed
Open_bank_1d(in 10^5) imputed
High_bank_1d(in 10^5) imputed
Low_bank_1d(in 10^5) imputed
Close_bank_1d(in 10^5) imputed
Open_fmcg_1d(in 10^5) imputed
High_fmcg_1d(in 10^5) imputed
Low_fmcg_1d(in 10^5) imputed
Close_fmcg_1d(in 10^5) imputed
Open_it_1d(in 10^5) imputed
High_it_1d(in 10^5) imputed
Low_it_1d(in 10^5) imputed
Close_it_1d(in 10^5) imputed
Open_media_1d(in 10^4) imputed
High_media_1d(in 10^4) imput

In [20]:
x_val,y_val = np.array(df_val[model_columns]),np.array(df_val['actual_return'])
x,y = np.array(df_val[lr_model_columns]),np.array(df_val['actual_return'])
x_pca,y_pca = pca.transform(scale.transform(df_val[model_columns])),np.array(df_val['actual_return'])

In [21]:
os.chdir('../Models')

In [22]:
models = {'LR':pickle.load(open('LR.pickle','rb')),
                      'RF':pickle.load(open('RF.pickle','rb')),
                    'GBM':pickle.load(open('GBM.pickle','rb')),
          'RF_full':pickle.load(open('RF_full.pickle','rb')),
          'GBM_full':pickle.load(open('GBM_full.pickle','rb')),
          'RF_pca':pickle.load(open('RF_pca.pickle','rb')),
          'GBM_pca':pickle.load(open('GBM_pca.pickle','rb'))
         }

  from numpy.core.umath_tests import inner1d


In [23]:
df_val['RF_Prediction'] = models['RF'].predict(x)

In [24]:
df_val['GBM_Prediction'] = models['GBM'].predict(x)

In [25]:
df_val['LR_Prediction'] = models['LR'].predict(x)

In [26]:
df_val['RF_full_Prediction'] = models['RF_full'].predict(x_val)

In [27]:
df_val['GBM_full_Prediction'] = models['GBM_full'].predict(x_val)

In [28]:
df_val['RF_pca_Prediction'] = models['RF_pca'].predict(x_pca)

In [29]:
df_val['GBM_pca_Prediction'] = models['GBM_pca'].predict(x_pca)

In [38]:
scores = [r2_score(df_val['actual_return'],df_val['LR_Prediction']),r2_score(df_val['actual_return'],df_val['RF_Prediction']),r2_score(df_val['actual_return'],df_val['GBM_Prediction']),
         r2_score(df_val['actual_return'],df_val['RF_full_Prediction']),r2_score(df_val['actual_return'],df_val['GBM_full_Prediction']),
          r2_score(df_val['actual_return'],df_val['RF_pca_Prediction']),r2_score(df_val['actual_return'],df_val['GBM_pca_Prediction'])
         ]

In [39]:
df_scores = pd.DataFrame(list(zip(['LR','RF','GBM','RF_full','GBM_full','RF_pca','GBM_pca'],scores)),columns=['models','r2_score'])

In [40]:
df_scores

Unnamed: 0,models,r2_score
0,LR,-0.016013
1,RF,-0.001164
2,GBM,-0.007195
3,RF_full,-0.000192
4,GBM_full,-0.002842
5,RF_pca,-0.001964
6,GBM_pca,-0.010827


In [41]:
os.chdir('../Input_Data/')

In [42]:
open_prices = pd.read_csv('open_price.csv',header=None)

In [43]:
open_prices.columns = ['Symbol','Date','Open','Close']

In [44]:
df_val[['Symbol','Date']] = df[['Symbol','Date']]

In [45]:
df_final = pd.merge(df_val,open_prices,on=['Symbol','Date'])

In [53]:
def loss_precision(model):
    expected_profits = df_final[model+'_Prediction']
    actual_profits = df_final['actual_return']
    expected_lift = expected_profits.apply(lambda x: 1 if x >0 else 0)
    actual_lift = actual_profits.apply(lambda x: 1 if x>0 else 0)
    precision = precision_score(y_true=actual_lift,y_pred=expected_lift)
    recall = recall_score(y_true=actual_lift,y_pred=expected_lift)
    f1 = f1_score(y_true=actual_lift,y_pred=expected_lift)
    return [precision,recall,f1]

In [54]:
df_scores['precision'] = df_scores['models'].apply(lambda x: loss_precision(x)[0])
df_scores['recall']=df_scores['models'].apply(lambda x: loss_precision(x)[1])
df_scores['f1_score']=df_scores['models'].apply(lambda x: loss_precision(x)[2])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [55]:
df_scores

Unnamed: 0,models,r2_score,precision,recall,f1_score
0,LR,-0.016013,0.454418,0.225156,0.301115
1,RF,-0.001164,0.609756,0.069493,0.124766
2,GBM,-0.007195,0.489519,0.275886,0.352889
3,RF_full,-0.000192,0.0,0.0,0.0
4,GBM_full,-0.002842,0.470908,0.421821,0.445015
5,RF_pca,-0.001964,0.533333,0.011119,0.021784
6,GBM_pca,-0.010827,0.403691,0.243224,0.303556


In [56]:
df_final['ensemble_prediction'] = (df_final['LR_Prediction']*.45+df_final['RF_Prediction']*.61+df_final['GBM_Prediction']*.49
                                  +df_final['RF_full_Prediction']*0+df_final['GBM_full_Prediction']*.47+
                                  df_final['RF_pca_Prediction']*.53+df_final['GBM_pca_Prediction']*.40)/(.45+.61+.49+.47+.53+.4)

In [57]:
expected_profits = df_final['ensemble_prediction']
actual_profits = df_final['actual_return']
expected_lift = expected_profits.apply(lambda x: 1 if x >0 else 0)
actual_lift = actual_profits.apply(lambda x: 1 if x>0 else 0)
precision = precision_score(y_true=actual_lift,y_pred=expected_lift)
recall = recall_score(y_true=actual_lift,y_pred=expected_lift)
f1 = f1_score(y_true=actual_lift,y_pred=expected_lift)

In [58]:
precision

0.501984126984127

In [59]:
df_scores

Unnamed: 0,models,r2_score,precision,recall,f1_score
0,LR,-0.016013,0.454418,0.225156,0.301115
1,RF,-0.001164,0.609756,0.069493,0.124766
2,GBM,-0.007195,0.489519,0.275886,0.352889
3,RF_full,-0.000192,0.0,0.0,0.0
4,GBM_full,-0.002842,0.470908,0.421821,0.445015
5,RF_pca,-0.001964,0.533333,0.011119,0.021784
6,GBM_pca,-0.010827,0.403691,0.243224,0.303556


In [60]:
append_list = ['Ensembled',r2_score(df_final['Close'],df_final['ensemble_prediction']),precision,recall,f1]

In [61]:
df_append = pd.DataFrame([append_list],columns=df_scores.columns)

In [62]:
df_scores = df_scores.append(df_append,ignore_index=True)

In [63]:
df_scores

Unnamed: 0,models,r2_score,precision,recall,f1_score
0,LR,-0.016013,0.454418,0.225156,0.301115
1,RF,-0.001164,0.609756,0.069493,0.124766
2,GBM,-0.007195,0.489519,0.275886,0.352889
3,RF_full,-0.000192,0.0,0.0,0.0
4,GBM_full,-0.002842,0.470908,0.421821,0.445015
5,RF_pca,-0.001964,0.533333,0.011119,0.021784
6,GBM_pca,-0.010827,0.403691,0.243224,0.303556
7,Ensembled,-0.270823,0.501984,0.175817,0.260422


In [64]:
from openpyxl import load_workbook,Workbook

In [65]:
os.chdir('../Statistics/')

In [66]:
rf_importance = pd.DataFrame(list(zip(lr_model_columns,list(models['RF'].feature_importances_))),columns=['Var','Importance'])
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

rf_importance.to_excel(writer, "rf_feature_importance", index=False)

writer.save()

In [67]:
gbm_importance = pd.DataFrame(list(zip(lr_model_columns,list(models['GBM'].feature_importances_))),columns=['Var','Importance'])
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

gbm_importance.to_excel(writer, "gbm_feature_importance", index=False)

writer.save()

In [68]:
linear_importance = pd.DataFrame(list(zip([1]+list(lr_model_columns),[models['LR'].intercept_]+list(models['LR'].coef_))),columns=['Var','coeff'])
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

linear_importance.to_excel(writer, "linear_model_coef", index=False)

writer.save()

In [86]:
rf_importance = pd.DataFrame(list(zip(model_columns,list(models['RF_full'].feature_importances_))),columns=['Var','Importance'])
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

rf_importance.to_excel(writer, "rf_full_feature_importance", index=False)

writer.save()

In [87]:
gbm_importance = pd.DataFrame(list(zip(model_columns,list(models['GBM_full'].feature_importances_))),columns=['Var','Importance'])
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

gbm_importance.to_excel(writer, "gbm_full_feature_importance", index=False)

writer.save()

In [69]:
book = load_workbook('report.xlsx')
writer = pd.ExcelWriter('report.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

df_scores.to_excel(writer, "model_scores", index=False)

writer.save()

In [70]:
os.chdir('../output_data/')

In [71]:
df_final.columns

Index(['Prev_Close(in 10^5)', 'High_1d(in 10^5)', 'Low_1d(in 10^5)',
       'Last_1d(in 10^5)', 'VWAP_1d(in 10^5)', 'Volume_1d(in 10^9)',
       'Turnover_1d(in 10^11)', 'Trades_1d(in 10^7)',
       'Deliverable_Volume_1d(in 10^9)', 'pct_Deliverble_1d',
       ...
       'LR_Prediction', 'RF_full_Prediction', 'GBM_full_Prediction',
       'RF_pca_Prediction', 'GBM_pca_Prediction', 'Symbol', 'Date', 'Open',
       'Close', 'ensemble_prediction'],
      dtype='object', length=230)

In [91]:
df_final.to_csv('validation_predicted.csv',index=False)

In [92]:
os.chdir('../Statistics/')

In [93]:
df_final.to_excel('Q1_predictions.xlsx',sheet_name='data',index=False)