## Linear Regression

Here, we follow the same procedure as in the *DecisionTree.ipynb* notebook, changing only the regression model: this notebook fits a Ridge (L2-regularised linear) regression.


In [2]:
from preparingData import preparing_data # read data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from math import sqrt # RMSE
from sklearn.metrics import mean_squared_error # error metric
from sklearn.model_selection import cross_val_score, cross_val_predict
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold # import KFold
import numpy as np
# model
from sklearn import linear_model


In [3]:
# X is used as the feature matrix and y as the regression target below.
X, y = preparing_data()

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the test rows leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole data set expressed on the training-derived scale; kept so that any
# cell referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

  


In [4]:
def rmse(model, y, x):
    """Return the root-mean-squared error of ``model`` on ``x``.

    Parameters
    ----------
    model : object exposing ``predict(x)``
        Fitted estimator used to produce the predictions.
    y : array-like
        Reference target values the predictions are compared against.
    x : array-like
        Inputs handed to ``model.predict``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y, model.predict(x))``.
    """
    predicted = model.predict(x)
    squared_error = mean_squared_error(y, predicted)
    return sqrt(squared_error)

**Here, we create the regression model:**

In [5]:
# Ridge regression with regularisation strength alpha=0.5, trained on the
# training split; `fit` returns the estimator itself, so it can be chained.
reg = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)

# Predictions for the held-out rows, inspected and scored in the cells below.
predictions = reg.predict(X_test)

In [6]:
# Show a 50-row window of the true targets for an eyeball comparison with the
# model predictions over the same positions.
sample_window = slice(152094, 152144)
y_test[sample_window].tolist()

[27900,
 7494,
 34850,
 22974,
 30349,
 17497,
 19500,
 26967,
 30987,
 23257,
 8488,
 34996,
 21495,
 37987,
 9896,
 20811,
 6788,
 9998,
 30487,
 13750,
 31999,
 23799,
 18900,
 9747,
 16988,
 17986,
 26991,
 16966,
 15988,
 19989,
 20995,
 21488,
 11942,
 31804,
 6900,
 26730,
 12049,
 11599,
 17728,
 10250,
 14950,
 8295,
 21492,
 19988,
 12995,
 19000,
 47375,
 12999,
 9989,
 6900]

In [7]:
# Show the predictions for the same 50-row window (152094..152143) that was
# used to display the true targets.
sample_window = slice(152094, 152144)
predictions[sample_window].tolist()

[24923.415561548845,
 14378.554358366224,
 27112.678983583042,
 24121.276253789423,
 24575.656315566837,
 22351.195897350844,
 26089.589379521247,
 22699.289936077894,
 23539.666803308195,
 26239.395570951718,
 13773.06081666844,
 25690.02115959159,
 25025.188165267133,
 24366.872431534528,
 14840.315929177767,
 21760.624687702737,
 10528.503863292974,
 14805.930842721373,
 23298.445187324636,
 22672.333493498314,
 14454.883854033598,
 20654.91547462717,
 20450.336447108482,
 12595.477343278988,
 26700.383272341154,
 21764.387674509522,
 20508.962078355937,
 12009.7097736719,
 20711.362908899486,
 24103.19940555569,
 22508.231091361402,
 25002.06562370716,
 21621.94027948718,
 24610.08290821715,
 12895.196758490478,
 22544.95336231046,
 16592.281216659456,
 10955.580377825447,
 23900.52395218078,
 23033.70022257099,
 11442.828855974196,
 11244.522286210993,
 16946.51947940056,
 22671.270415788164,
 12591.33622399535,
 24438.528523291963,
 28469.22642298675,
 16629.3330540208,
 21454.72

In [8]:
# RMSE on the rows the model was fitted on (baseline for spotting overfitting).
rmse(model=reg, y=y_train, x=X_train)

7952.906389540915

In [9]:
# RMSE on the held-out rows the model did not see during fitting.
rmse(model=reg, y=y_test, x=X_test)

7954.592151225256

**Cross-validation:**

In [10]:
# 10-fold cross-validation splitter. shuffle is left at its default (False,
# see the printed repr), so rows are assigned to folds in their stored order.
# NOTE(review): if the rows are ordered in some meaningful way, unshuffled
# folds may not be representative -- consider KFold(shuffle=True, random_state=...).
kf = KFold(n_splits=10)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [12]:
# Per-fold RMSE on the fitting partition and on the left-out partition.
list_rmse_train = []
list_rmse_test = []

# NOTE(review): the folds are taken from the unscaled X, whereas the hold-out
# experiment earlier in the notebook used scaled features -- confirm that this
# difference is intentional.
for train_index, test_index in kf.split(X):
    # Fold-local names: the hold-out split (X_train/X_test/y_train/y_test) and
    # the hold-out model `reg` must not be overwritten by the last fold,
    # otherwise re-running the earlier scoring cells gives different numbers.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model per fold, with the same hyper-parameter as the hold-out model.
    fold_reg = linear_model.Ridge(alpha=.5)
    fold_reg.fit(X_fold_train, y_fold_train)

    list_rmse_train.append(rmse(fold_reg, y_fold_train, X_fold_train))
    list_rmse_test.append(rmse(fold_reg, y_fold_test, X_fold_test))

    

**Standard deviation of error in training set after cross-validation**

In [13]:
# Spread of the per-fold training RMSE across the CV folds.
np.asarray(list_rmse_train).std()

68.18189367605405

**Standard deviation of error in test set after cross-validation**

In [14]:
# Spread of the per-fold RMSE on the left-out partitions across the CV folds.
np.asarray(list_rmse_test).std()

625.1284844207817

**Mean of the error in the training set after cross-validation**

In [15]:
# Average per-fold training RMSE across the CV folds.
np.asarray(list_rmse_train).mean()

7939.565395575459

**Mean of the error in the test set after cross-validation**

In [16]:
# Average per-fold RMSE on the left-out partitions across the CV folds.
np.asarray(list_rmse_test).mean()

8198.77606055087