# ML Model

# 1) importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np 
import pandas as pd
import requests
import pickle

In [3]:
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [4]:
# for modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
import random
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# Regression evaluation
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 

# Classification
from sklearn.metrics import accuracy_score

#Model helper
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score

## 2)- Loading data-files

In [5]:
df = pd.read_csv('testset2.csv')

In [6]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9,...,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25
0,0,0.572565,-0.014704,-0.352985,-0.012697,-0.138549,0.620313,-0.602656,-0.071517,-0.131941,...,-0.199931,-0.360027,0.154202,0.369976,0.320812,1.252255,1.154858,0.609724,-0.126952,0.075709
1,1,-1.233754,0.643106,-0.268247,-0.588117,-0.231391,1.816856,0.369814,0.784052,0.870746,...,0.943058,-0.475841,-0.085267,-0.60149,-0.601191,0.635048,-0.433662,-0.456993,0.429443,0.423808


In [7]:
df = df.drop(['Unnamed: 0'], axis=1)

In [8]:
df.shape

(80000, 25)

In [9]:
# Selecting all columns except last one that is 'm25'.

X = df.iloc[:,:-1].values          
y = df['m25'].values

In [10]:
X

array([[ 0.57256488, -0.01470396, -0.35298469, ...,  1.1548579 ,
         0.60972403, -0.12695157],
       [-1.23375423,  0.64310596, -0.26824732, ..., -0.4336621 ,
        -0.4569929 ,  0.42944315],
       [-0.77836448, -0.45371563, -2.10020124, ...,  0.18666614,
         0.21163226, -0.56662082],
       ...,
       [ 0.94546444, -0.39535052,  0.3145296 , ..., -0.98010944,
         0.17685301, -0.17008695],
       [ 0.1151475 , -1.04815064,  0.65888561, ...,  1.15475919,
         1.25093225,  1.49547161],
       [ 0.31239622,  0.75315802,  0.94000431, ..., -0.16357309,
         1.71373922,  0.2796558 ]])

In [11]:
X.shape

(80000, 24)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2019)

# Modeling

In [13]:
R2_Scores = []
models = ['Linear Regression' , 'Lasso Regression' ,'Ridge Regression', 'AdaBoost Regression' ,'GradientBoosting Regression',
          'RandomForest Regression' , 'KNeighbours Regression']

### i- Linear Regression

In [14]:
clf_lr = LinearRegression()
clf_lr.fit(X_train , y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [15]:
y_pred = clf_lr.predict(X_test)

In [16]:
y_pred[:5]

array([ 0.10972659,  0.13205332,  0.03730852, -0.11999982, -0.0522195 ])

In [17]:
y_test[:5]

array([ 0.07428443, -0.11529221, -0.16358595,  0.04638495, -0.88075262])

In [18]:
accuracies = cross_val_score(estimator = clf_lr, X = X_train, y = y_train, cv = 5,verbose = 1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [19]:
print('####### Linear Regression #######')
print('Score : %.4f' % clf_lr.score(X_test, y_test))
print(accuracies)

####### Linear Regression #######
Score : 0.0113
[0.01111921 0.01183187 0.01019182 0.0117513  0.0094422 ]


In [20]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

In [21]:
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)

R2_Scores.append(r2)

MSE    : 0.82 
MAE    : 0.72 
RMSE   : 0.91 
R2     : 0.0113 


### ii-Lasso Regression

In [22]:
clf_la = Lasso(normalize=True)
clf_la.fit(X_train , y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
y_pred = clf_la.predict(X_test)

In [24]:
accuracies = cross_val_score(estimator = clf_la, X = X_train, y = y_train, cv = 5,verbose = 1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [25]:
print('###### Lasso Regression ######')
print('Score : %.4f' % clf_la.score(X_test, y_test))
print(accuracies)

###### Lasso Regression ######
Score : -0.0000
[-9.35032691e-06 -1.27091843e-07 -1.13841486e-05 -1.25588065e-04
 -1.94523527e-05]


In [26]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

In [27]:
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)

R2_Scores.append(r2)

MSE    : 0.83 
MAE    : 0.73 
RMSE   : 0.91 
R2     : -0.0000 


### iii)-Ridge Regression

In [28]:
clf_rr = Ridge(normalize=True)
clf_rr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rr.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [29]:
print('###### Ridge Regression ######')
print('Score : %.4f' % clf_rr.score(X_test, y_test))
print(accuracies)

###### Ridge Regression ######
Score : 0.0094
[0.00926652 0.00962429 0.00859177 0.00945139 0.00840532]


In [30]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)

R2_Scores.append(r2)


MSE    : 0.82 
MAE    : 0.72 
RMSE   : 0.91 
R2     : 0.0094 


### iv)-AdaBosst Regression

In [31]:
clf_ar = AdaBoostRegressor(n_estimators=1000)
clf_ar.fit(X_train , y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=1000, random_state=None)

In [32]:
y_pred = clf_ar.predict(X_test)

In [33]:
print('###### AdaBoost Regression ######')
print('Score : %.4f' % clf_ar.score(X_test, y_test))
print(accuracies)

###### AdaBoost Regression ######
Score : 0.0077
[0.00926652 0.00962429 0.00859177 0.00945139 0.00840532]


In [34]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)

R2_Scores.append(r2)


MSE    : 0.82 
MAE    : 0.72 
RMSE   : 0.91 
R2     : 0.0077 


### v)-GradientBoosting Regression

In [35]:
clf_gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls',verbose = 1)
clf_gbr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_gbr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_gbr.predict(X_test)

      Iter       Train Loss   Remaining Time 
         1           0.8235            5.06s
         2           0.8232            4.67s
         3           0.8228            4.34s
         4           0.8225            4.14s
         5           0.8222            4.02s
         6           0.8220            4.08s
         7           0.8217            4.16s
         8           0.8215            4.04s
         9           0.8212            3.93s
        10           0.8210            3.85s
        20           0.8193            3.28s
        30           0.8182            2.82s
        40           0.8174            2.39s
        50           0.8167            1.98s
        60           0.8161            1.58s
        70           0.8156            1.18s
        80           0.8152            0.78s
        90           0.8149            0.39s
       100           0.8146            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.8245            2.98s
        

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


         3           0.8238            2.85s
         4           0.8235            3.37s
         5           0.8232            3.31s
         6           0.8229            3.19s
         7           0.8227            3.10s
         8           0.8224            3.03s
         9           0.8222            2.96s
        10           0.8220            2.91s
        20           0.8203            2.51s
        30           0.8192            2.30s
        40           0.8183            1.94s
        50           0.8176            1.65s
        60           0.8170            1.33s
        70           0.8165            0.98s
        80           0.8161            0.64s
        90           0.8157            0.32s
       100           0.8153            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.8296            4.07s
         2           0.8293            4.23s
         3           0.8289            3.96s
         4           0.8286            3.78s
         

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.6s finished


In [36]:
print('###### Gradient Boosting Regression #######')
print('Score : %.4f' % clf_gbr.score(X_test, y_test))
print(accuracies)

###### Gradient Boosting Regression #######
Score : 0.0089
[0.00952889 0.00890773 0.00847191 0.00928892 0.0085154 ]


In [37]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)

R2_Scores.append(r2)


MSE    : 0.82 
MAE    : 0.72 
RMSE   : 0.91 
R2     : 0.0089 


### vi)-RandomForest Regression

In [38]:
clf_rf = RandomForestRegressor()
clf_rf.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min finished


In [39]:
print('###### Random Forest ######')
print('Score : %.4f' % clf_rf.score(X_test, y_test))
print(accuracies)

###### Random Forest ######
Score : -0.1021
[-0.0982551  -0.12052511 -0.10863236 -0.09638342 -0.09954106]


In [40]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)


MSE    : 0.91 
MAE    : 0.76 
RMSE   : 0.96 
R2     : -0.1021 


### Tuning Parameters

In [41]:
no_of_test=[100]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='r2')
clf_rf.fit(X_train,y_train)
print('Score : %.4f' % clf_rf.score(X_test, y_test))
pred=clf_rf.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2     : %0.4f ' % r2)
R2_Scores.append(r2)

Score : -0.0013
R2     : -0.0013 


### vii)-KNeighbours Regression

In [42]:
clf_knn = KNeighborsRegressor()
clf_knn.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_knn, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_knn.predict(X_test)
print('')
print('###### KNeighbours Regression ######')
print('Score : %.4f' % clf_knn.score(X_test, y_test))
print(accuracies)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.8min finished



###### KNeighbours Regression ######
Score : -0.1832
[-0.18228581 -0.20664405 -0.19382787 -0.1917326  -0.18681382]


In [43]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)
R2_Scores.append(r2)


MSE    : 0.98 
MAE    : 0.79 
RMSE   : 0.99 
R2     : -0.1832 


### Tuning Parameters

In [44]:
n_neighbors=[]
for i in range (0,50,5):
    if(i!=0):
        n_neighbors.append(i)
params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]}
clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='r2')
clf_knn.fit(X_train,y_train)
print('Score : %.4f' % clf_knn.score(X_test, y_test))
pred=clf_knn.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2     : %0.4f ' % r2)

Score : -0.0101
R2     : -0.0101 


# Comparison of R2-Score 

In [45]:
compare = pd.DataFrame({'MODEL' : models , 'R2-Scores' : R2_Scores})
compare.sort_values(by='R2-Scores' ,ascending=False)

Unnamed: 0,MODEL,R2-Scores
0,Linear Regression,0.011325
2,Ridge Regression,0.009372
4,GradientBoosting Regression,0.008906
3,AdaBoost Regression,0.007701
1,Lasso Regression,-7e-06
5,RandomForest Regression,-0.00132
6,KNeighbours Regression,-0.183204


**The best R2 is from above is 0.0113 . So, we think that "Linear Regression" is the best most optimized model for our prediction problem.**