In [1]:
#Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
import xgboost as xgb
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
files = ['airtel.csv', 'tatamotors.csv', 'ITC.csv', 'Cipla.csv', 'Raymond.csv', 'bajajelectricals.csv', 'L&T.csv', 'Reliance.csv', 'SBI.csv', 'TataSteel.csv', 'TCS.csv']
company_names = {i:i[:-4].upper() for i in files}
columns = ['R2', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']
print(company_names)

{'airtel.csv': 'AIRTEL', 'tatamotors.csv': 'TATAMOTORS', 'ITC.csv': 'ITC', 'Cipla.csv': 'CIPLA', 'Raymond.csv': 'RAYMOND', 'bajajelectricals.csv': 'BAJAJELECTRICALS', 'L&T.csv': 'L&T', 'Reliance.csv': 'RELIANCE', 'SBI.csv': 'SBI', 'TataSteel.csv': 'TATASTEEL', 'TCS.csv': 'TCS'}


In [3]:
rf = {key:[] for key in files}
knn = {key:[] for key in files}
dt = {key:[] for key in files}
lr = {key:[] for key in files}
xg = {key:[] for key in files}

In [4]:
#Initializing models for all algorithms
model_rf = RandomForestRegressor(n_estimators = 100, random_state = 0)
model_dt = DecisionTreeRegressor()
model_lr = LinearRegression()
model_xg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=75, subsample=0.75, max_depth=7)

#k-NN is not pre-initialized as it uses gridsearch to find best parameters to initialize model


for i in files:
    
    #--------------------Preprocessing Data-------------#
    #Reading dataset in pandas
    df = pd.read_csv('Dataset/' + i)
    #Droping unwanted columns
    df.drop(["Date", "Adj Close"], axis=1, inplace=True)
    
    #Copying closing price to another column so we can predict today's closing price using yesterday's data
    df['Close_Tom'] =  df['Close']
    
    #Shifting close_tom column down by one row so we can use yesterdays data to predict todays closing price
    df['Close_Tom'] =  df['Close_Tom'].shift(-1)
    
    #Dropping columns with null values
    df.dropna(axis=0, how='any', inplace=True)
    df.isnull().sum()
    
    #columns for x and y
    x  = df[['Open','High','Low','Volume', 'Close']]
    y = df['Close_Tom']
    
    #Normalising data
    scaler = MinMaxScaler()
    df[['Open','High','Low','Volume', 'Close']] = scaler.fit_transform(df[['Open','High','Low','Volume', 'Close']])
    df.head()
    
    #Splitting data
    x_train , x_test , y_train , y_test = train_test_split(x ,y)
    
    #---------------------Random Forest--------------------# 
    model_rf.fit(x_train, y_train)   
    
    #predicting stock price using our model
    preds = model_rf.predict(x_test)
    
    #Appending data to dictionary
    #R2
    rf[i].append(round(model_rf.score(x_test, y_test), 4))
    #Mean Absolute Error
    rf[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    rf[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    rf[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #---------------------k-NN--------------------#
    
    #using gridsearch to find the best parameter
    params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13,14]}
    knn_init = neighbors.KNeighborsRegressor()
    model_knn = GridSearchCV(knn_init, params, cv=7)
    model_knn.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_knn.predict(x_test)
    
    #Appending data to dictionary
    #R2
    knn[i].append(round(model_knn.score(x_test, y_test), 4))
    #Mean Absolute Error
    knn[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    knn[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    knn[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #--------------Decision Tree---------------#
    model_dt.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_dt.predict(x_test)
    
    #Appending data to dictionary
    #R2
    dt[i].append(round(model_dt.score(x_test, y_test), 4))
    #Mean Absolute Error
    dt[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    #Mean Squared Error
    dt[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    #Root Mean Squared Error
    dt[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #-----------------Linear Regression-----------------#
    model_lr.fit(x_train,y_train)
    
    #predicting stock price using our model
    preds = model_lr.predict(x_test)

    #Appending data to dictionary
    #R2
    lr[i].append(round(model_lr.score(x_test, y_test), 4))
    
    #Mean Absolute Error
    lr[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    
    #Mean Squared Error
    lr[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    
    #Root Mean Squared Error
    lr[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))
    
    #-----------------XG Boost-----------------#
    model_xg.fit(x_train, y_train)
    
    preds = model_xg.predict(x_test)
    
    #Appending data to dictionary
    #R2
    xg[i].append(round(model_lr.score(x_test, y_test), 4))
    
    #Mean Absolute Error
    xg[i].append(round(metrics.mean_absolute_error(y_test,preds), 4))
    
    #Mean Squared Error
    xg[i].append(round(metrics.mean_squared_error(y_test,preds), 4))
    
    #Root Mean Squared Error
    xg[i].append(round((metrics.mean_squared_error(y_test,preds))**0.5, 4))  
    
    #Printing Info
    print(i + ' Done')
    
print("Training Completed")

airtel.csv Done
tatamotors.csv Done
ITC.csv Done
Cipla.csv Done
Raymond.csv Done
bajajelectricals.csv Done
L&T.csv Done
Reliance.csv Done
SBI.csv Done
TataSteel.csv Done
TCS.csv Done
Training Completed


In [5]:
df_rf = pd.DataFrame.from_dict(rf, orient='index', columns=columns)
df_rf.rename(index=company_names, inplace=True)
df_rf.to_csv('Results/RF_Results.csv') 

df_knn = pd.DataFrame.from_dict(knn, orient='index', columns=columns)
df_knn.rename(index=company_names, inplace=True)
df_knn.to_csv('Results/k-NN_Results.csv') 

df_dt = pd.DataFrame.from_dict(dt, orient='index', columns=columns)
df_dt.rename(index=company_names, inplace=True)
df_dt.to_csv('Results/DT_Results.csv') 

df_lr = pd.DataFrame.from_dict(lr, orient='index', columns=columns)
df_lr.rename(index=company_names, inplace=True)
df_lr.to_csv('Results/LR_Results.csv') 

df_xg = pd.DataFrame.from_dict(xg, orient='index', columns=columns)
df_xg.rename(index=company_names, inplace=True)
df_xg.to_csv('Results/XGBoost_Results.csv') 

In [6]:
print("Random Forest")
df_rf

Random Forest


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.997,4.8327,52.6609,7.2568
TATAMOTORS,0.9984,3.5801,33.3641,5.7762
ITC,0.9994,1.4386,6.1673,2.4834
CIPLA,0.9991,3.8819,43.4971,6.5952
RAYMOND,0.9981,6.0156,105.3692,10.265
BAJAJELECTRICALS,0.9981,4.1968,59.6145,7.721
L&T,0.999,8.4456,218.8039,14.792
RELIANCE,0.9991,6.6674,183.5994,13.5499
SBI,0.9983,2.5363,19.1817,4.3797
TATASTEEL,0.9973,6.5936,113.1144,10.6355


In [7]:
print("k-NN")
df_knn

k-NN


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.0443,97.2845,16576.1526,128.7484
TATAMOTORS,0.0889,107.6558,19554.8378,139.8386
ITC,0.2085,70.8296,7706.0638,87.7842
CIPLA,0.7805,67.6901,10798.0929,103.9139
RAYMOND,0.5379,95.9616,25271.0063,158.9686
BAJAJELECTRICALS,0.7934,41.5152,6594.0926,81.204
L&T,0.7627,148.1826,50445.9041,224.6017
RELIANCE,0.6108,147.8617,77332.066,278.0864
SBI,0.18,77.6856,9124.2086,95.5207
TATASTEEL,0.1396,145.7376,35465.9957,188.3242


In [8]:
print("Decision Tree")
df_dt

Decision Tree


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9948,6.5696,90.7288,9.5252
TATAMOTORS,0.9975,4.7022,54.6079,7.3897
ITC,0.9989,1.9418,11.1216,3.3349
CIPLA,0.9985,5.1746,74.5084,8.6318
RAYMOND,0.9967,8.281,180.5858,13.4382
BAJAJELECTRICALS,0.9969,5.618,98.2869,9.914
L&T,0.9983,11.1335,364.5445,19.093
RELIANCE,0.9984,9.0169,325.7669,18.049
SBI,0.9971,3.4216,32.7446,5.7223
TATASTEEL,0.9953,8.7762,192.0982,13.8599


In [9]:
print("Linear Regression")
df_lr

Linear Regression


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9973,4.4547,46.1475,6.7932
TATAMOTORS,0.9987,3.2076,28.1025,5.3012
ITC,0.9995,1.2823,4.9192,2.2179
CIPLA,0.9993,3.4043,35.1288,5.927
RAYMOND,0.9983,5.5134,92.7213,9.6292
BAJAJELECTRICALS,0.9986,3.7235,45.8934,6.7745
L&T,0.9991,7.5418,182.7193,13.5174
RELIANCE,0.9993,5.921,148.2085,12.1741
SBI,0.9984,2.3478,17.2823,4.1572
TATASTEEL,0.9978,5.9369,91.1969,9.5497


In [10]:
print("XG Boost")
df_xg

XG Boost


Unnamed: 0,R2,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
AIRTEL,0.9973,5.0876,60.7684,7.7954
TATAMOTORS,0.9987,3.7453,35.873,5.9894
ITC,0.9995,1.5499,7.2706,2.6964
CIPLA,0.9993,4.2947,52.9512,7.2768
RAYMOND,0.9983,6.5485,130.8946,11.4409
BAJAJELECTRICALS,0.9986,4.5604,84.9047,9.2144
L&T,0.9991,8.9857,241.6449,15.5449
RELIANCE,0.9993,7.2221,221.3417,14.8776
SBI,0.9984,2.6758,20.1374,4.4875
TATASTEEL,0.9978,6.9886,126.2457,11.2359
