In [4]:
#Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
import xgboost as xgb
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
files = ['airtel.csv', 'tatamotors.csv', 'ITC.csv', 'Cipla.csv', 'Raymond.csv', 'bajajelectricals.csv', 'L&T.csv', 'Reliance.csv', 'SBI.csv', 'TataSteel.csv', 'TCS.csv']
company_names = {i:i[:-4].upper() for i in files}
columns = ['R2', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']
print(company_names)

{'airtel.csv': 'AIRTEL', 'tatamotors.csv': 'TATAMOTORS', 'ITC.csv': 'ITC', 'Cipla.csv': 'CIPLA', 'Raymond.csv': 'RAYMOND', 'bajajelectricals.csv': 'BAJAJELECTRICALS', 'L&T.csv': 'L&T', 'Reliance.csv': 'RELIANCE', 'SBI.csv': 'SBI', 'TataSteel.csv': 'TATASTEEL', 'TCS.csv': 'TCS'}


In [6]:
rf = {key:[] for key in files}
knn = {key:[] for key in files}
dt = {key:[] for key in files}
lr = {key:[] for key in files}
xg = {key:[] for key in files}

In [8]:
#Initializing models for all algorithms
model_rf = RandomForestRegressor(n_estimators = 100, random_state = 0)
model_dt = DecisionTreeRegressor()
model_lr = LinearRegression()
model_xg = xgb.XGBRegressor(objective="reg:linear", n_estimators=75, subsample=0.75, max_depth=7)

#k-NN is not pre-initialized as it uses gridsearch to find best parameters to initialize model


for i in files:
    
    #--------------------Preprocessing Data-------------#
    #Reading dataset in pandas
    df = pd.read_csv('Dataset/' + i)
    #Droping unwanted columns
    df.drop(["Date", "Adj Close"], axis=1, inplace=True)
    
    #Copying closing price to another column so we can predict today's closing price using yesterday's data
    df['Close_Tom'] =  df['Close']
    
    #Shifting close_tom column down by one row so we can use yesterdays data to predict todays closing price
    df['Close_Tom'] =  df['Close_Tom'].shift(-1)
    
    #Dropping columns with null values
    df.dropna(axis=0, how='any', inplace=True)
    df.isnull().sum()
    
    #columns for x and y
    x  = df[['Open','High','Low','Volume', 'Close']]
    y = df['Close_Tom']
    
    #Normalising data
    scaler = MinMaxScaler()
    df[['Open','High','Low','Volume', 'Close']] = scaler.fit_transform(df[['Open','High','Low','Volume', 'Close']])
    df.head()
    
    #Splitting data
    x_train , x_test , y_train , y_test = train_test_split(x ,y)
    
    #---------------------Random Forest--------------------# 
    model_rf.fit(x_train, y_train)   
    
    #predicting stock price using our model
    preds = model_rf.predict(x_test)
    
    #Appending data to dictionary
    #R2
    rf[i].append(round(model_rf.score(x_test, y_test), 4))
    #Mean Absolute Error
    rf[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    rf[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    rf[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #---------------------k-NN--------------------#
    
    #using gridsearch to find the best parameter
    params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13,14]}
    knn_init = neighbors.KNeighborsRegressor()
    model_knn = GridSearchCV(knn_init, params, cv=7)
    model_knn.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_knn.predict(x_test)
    
    #Appending data to dictionary
    #R2
    knn[i].append(round(model_knn.score(x_test, y_test), 4))
    #Mean Absolute Error
    knn[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    knn[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    knn[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #--------------Decision Tree---------------#
    model_dt.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_dt.predict(x_test)
    
    #Appending data to dictionary
    #R2
    dt[i].append(round(model_dt.score(x_test, y_test), 4))
    #Mean Absolute Error
    dt[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    dt[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    dt[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #-----------------Linear Regression-----------------#
    model_lr.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_lr.predict(x_test)

    #Appending data to dictionary
    #R2
    lr[i].append(round(model_lr.score(x_test, y_test), 4))
    
    #Mean Absolute Error
    lr[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    
    #Mean Squared Error
    lr[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    
    #Root Mean Squared Error
    lr[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #-----------------XG Boost-----------------#
    model_xg.fit(x_train, y_train)
    
    preds = model_xg.predict(x_test)
    
    #Appending data to dictionary
    #R2
    xg[i].append(round(model_lr.score(x_test, y_test), 4))
    
    #Mean Absolute Error
    xg[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    
    #Mean Squared Error
    xg[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    
    #Root Mean Squared Error
    xg[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))  
    
    #Printing Info
    print(i + ' Done')
    
print("Training Completed")

airtel.csv Done
tatamotors.csv Done
ITC.csv Done
Cipla.csv Done
Raymond.csv Done
bajajelectricals.csv Done
L&T.csv Done
Reliance.csv Done
SBI.csv Done
TataSteel.csv Done
TCS.csv Done
Training Completed


In [9]:
df_rf = pd.DataFrame.from_dict(rf, orient='index', columns=columns)
df_rf.rename(index=company_names, inplace=True)
df_rf.to_csv('Results/RF_Results.csv') 

df_knn = pd.DataFrame.from_dict(knn, orient='index', columns=columns)
df_knn.rename(index=company_names, inplace=True)
df_knn.to_csv('Results/k-NN_Results.csv') 

df_dt = pd.DataFrame.from_dict(dt, orient='index', columns=columns)
df_dt.rename(index=company_names, inplace=True)
df_dt.to_csv('Results/DT_Results.csv') 

df_lr = pd.DataFrame.from_dict(lr, orient='index', columns=columns)
df_lr.rename(index=company_names, inplace=True)
df_lr.to_csv('Results/LR_Results.csv') 

df_xg = pd.DataFrame.from_dict(xg, orient='index', columns=columns)
df_xg.rename(index=company_names, inplace=True)
df_xg.to_csv('Results/XGBoost_Results.csv') 

In [10]:
print("Random Forest")
df_rf

Random Forest


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9966,5.1451,60.621,7.7859
TATAMOTORS,0.9987,3.3929,29.1263,5.3969
ITC,0.9994,1.4544,6.3435,2.5186
CIPLA,0.9991,3.9734,43.6569,6.6073
RAYMOND,0.998,6.1623,97.3926,9.8688
BAJAJELECTRICALS,0.9967,4.6826,104.0183,10.1989
L&T,0.999,8.1553,204.2471,14.2915
RELIANCE,0.9992,6.4171,149.1147,12.2113
SBI,0.9985,2.5186,17.4967,4.1829
TATASTEEL,0.9974,6.415,102.0751,10.1032


In [11]:
print("k-NN")
df_knn

k-NN


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.0489,97.2898,16784.8541,129.5564
TATAMOTORS,0.0667,110.8507,20148.939,141.947
ITC,0.1915,72.898,7950.3111,89.1645
CIPLA,0.7668,69.6796,11907.2745,109.1205
RAYMOND,0.526,91.233,22851.4277,151.1669
BAJAJELECTRICALS,0.7643,38.0544,7345.5121,85.706
L&T,0.7778,141.7502,46653.3486,215.9939
RELIANCE,0.6137,144.4835,75519.1961,274.8076
SBI,0.1862,80.0074,9436.3347,97.1408
TATASTEEL,0.21,135.1318,31380.5752,177.1456


In [12]:
print("Decision Tree")
df_dt

Decision Tree


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9941,6.8034,104.5277,10.2239
TATAMOTORS,0.9974,4.7186,57.0109,7.5506
ITC,0.9989,1.9448,10.4819,3.2376
CIPLA,0.9985,5.3193,75.4023,8.6835
RAYMOND,0.9963,8.3359,176.542,13.2869
BAJAJELECTRICALS,0.9939,6.067,189.9811,13.7834
L&T,0.9985,10.5736,319.4971,17.8745
RELIANCE,0.9986,8.6111,269.5857,16.4191
SBI,0.9969,3.5013,35.9505,5.9959
TATASTEEL,0.9949,8.8618,201.21,14.1849


In [13]:
print("Linear Regression")
df_lr

Linear Regression


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9971,4.7773,50.9726,7.1395
TATAMOTORS,0.9989,3.0512,23.6634,4.8645
ITC,0.9995,1.2707,5.081,2.2541
CIPLA,0.9993,3.5186,35.3586,5.9463
RAYMOND,0.9985,5.3309,74.4472,8.6283
BAJAJELECTRICALS,0.9976,4.0709,76.1243,8.7249
L&T,0.9992,7.2233,166.1431,12.8897
RELIANCE,0.9994,5.715,125.9006,11.2205
SBI,0.9988,2.2591,13.3723,3.6568
TATASTEEL,0.9979,5.6779,81.8777,9.0486


In [15]:
print("XG Boost")
df_xg

XG Boost


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9971,5.5405,69.5052,8.337
TATAMOTORS,0.9989,3.6018,32.5217,5.7028
ITC,0.9995,1.5768,7.348,2.7107
CIPLA,0.9993,4.3998,55.3092,7.437
RAYMOND,0.9985,6.6718,119.1363,10.915
BAJAJELECTRICALS,0.9976,5.1206,141.0671,11.8772
L&T,0.9992,8.9114,234.0781,15.2996
RELIANCE,0.9994,7.0839,191.8803,13.8521
SBI,0.9988,2.6691,18.4401,4.2942
TATASTEEL,0.9979,6.661,110.1132,10.4935
