In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Importation des données

In [2]:
database = pd.read_csv("../Data/data_house.csv")

In [3]:
# Total number of NaN entries in a column must be less than 30% of total entries
database = database.loc[:, database.isna().sum() <= 0.3*database.shape[0]]

In [4]:
selected = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',  
    'GarageArea',  
    'TotalBsmtSF',
    '1stFlrSF',  
    'FullBath',      
    'TotRmsAbvGrd',  
    'YearBuilt', 
    'YearRemodAdd'
]

In [6]:
df = database.loc[:, selected]

In [7]:
y = df.SalePrice
X = df[['OverallQual',
    'GrLivArea',
    'GarageCars',  
    'GarageArea',  
    'TotalBsmtSF',
    '1stFlrSF',  
    'FullBath',      
    'TotRmsAbvGrd',  
    'YearBuilt', 
    'YearRemodAdd']]

In [39]:
# dataset
dataset = df
# prepare a range of alpha values to test
alphas = np.array([1, 10, 100, 800, 900, 1000, 1050, 1100, 1125, 1150, 1175, 1200])
# create and fit a ridge regression model, testing each alpha
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas), cv=5)
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
print(grid.best_estimator_)
print(grid.best_params_)

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': array([   1,   10,  100,  800,  900, 1000, 1050, 1100, 1125, 1150, 1175,
       1200])})
0.7611505833134443
1150
Lasso(alpha=1150)
{'alpha': 1150}


In [45]:
def get_best_alpha(data, target, model, alphas):
    ''' Get best alpha for data=X, taget=y, model can be Lasso() or Rdge(), alphas=[val,val]'''
    parameters = {'alpha' : alphas}
    grid = GridSearchCV(model, parameters, cv=5)
    grid.fit(data, target)
    print(grid.best_estimator_)

In [46]:
get_best_alpha(X, y, Lasso(), [1, 10, 100, 800, 900, 1000, 1050, 1100, 1125, 1150, 1175, 1200])

Lasso(alpha=1150)


In [37]:
results = pd.DataFrame(grid.cv_results_)

In [38]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004341,0.001191,0.002363,0.000509,1,{'alpha': 1},0.82302,0.789643,0.77856,0.776249,0.63066,0.759627,0.066614,12
1,0.003698,0.000942,0.002168,0.000389,10,{'alpha': 10},0.822993,0.789665,0.778652,0.776311,0.630527,0.75963,0.06667,11
2,0.002793,0.0001,0.002055,0.000491,100,{'alpha': 100},0.822689,0.789863,0.779538,0.776903,0.629472,0.759693,0.067121,10
3,0.003083,0.000513,0.002034,0.000235,800,{'alpha': 800},0.822004,0.790006,0.784268,0.779437,0.62892,0.760927,0.06766,9
4,0.003198,0.000576,0.002402,0.000633,900,{'alpha': 900},0.822142,0.789971,0.784663,0.779706,0.628675,0.761031,0.067819,8
5,0.00335,0.000726,0.002309,0.000586,1000,{'alpha': 1000},0.82225,0.789905,0.784988,0.780019,0.628403,0.761113,0.06798,7
6,0.002923,0.000302,0.002083,0.000493,1050,{'alpha': 1050},0.822293,0.789842,0.785126,0.78016,0.628256,0.761135,0.068057,6
7,0.002821,0.00017,0.002172,0.000461,1100,{'alpha': 1100},0.822329,0.789773,0.785241,0.780291,0.628102,0.761147,0.068133,4
8,0.002919,0.000303,0.001961,9.9e-05,1125,{'alpha': 1125},0.822344,0.789736,0.785295,0.780352,0.628023,0.76115,0.068171,2
9,0.002776,0.0003,0.002124,0.000419,1150,{'alpha': 1150},0.822357,0.789697,0.785347,0.780411,0.627941,0.761151,0.068209,1


In [43]:
# dataset
dataset = df
# prepare a range of alpha values to test
alphas = np.array([1, 10,20,45, 50, 65, 70, 75, 80, 85, 90, 100, 110, 120, 1000])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas), cv=5)
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': array([   1,   10,   20,   45,   50,   65,   70,   75,   80,   85,   90,
        100,  110,  120, 1000])})
0.7607170708614998
80


In [47]:
get_best_alpha(X, y, Ridge(), [1, 10,20,45, 50, 65, 70, 75, 80, 85, 90, 100, 110, 120, 1000] )

Ridge(alpha=80)


In [44]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8)

In [71]:
def get_metrics(data, target, model):
    ''' 
    Get R2 and RMSE for a regression
    data = X
    target = y
    model can be Lasso(alpha=int) or Ridge(alpha=int)
    '''
    xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8)
    regr = model
    regr.fit(xtrain, ytrain)
    ytrainpred = regr.predict(xtrain)
    ytestpred = regr.predict(xtest)
    print(regr)
    print ("Training set : R2 = {} RMSE = {}\nTesting set : R2 = {} RMSE = {}" 
      .format(round(r2_score(ytrain, ytrainpred),3), round(mean_squared_error(ytrain, ytrainpred, squared=False),3), round(r2_score(ytest, ytestpred),3), round(mean_squared_error(ytest, ytestpred, squared=False),3)))

In [72]:
get_metrics(X, y, Ridge(alpha=80))

Ridge(alpha=80)
Training set : R2 = 0.764 RMSE = 39419.763
Testing set : R2 = 0.815 RMSE = 31027.982


## RandomizedSearchCV

In [74]:
from sklearn.model_selection import RandomizedSearchCV

In [77]:
from scipy.stats import uniform as sp_rand

In [113]:
# dataset
dataset = df
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, cv=5, random_state=0)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)
print(rsearch.best_estimator_)

RandomizedSearchCV(cv=5, estimator=Ridge(), n_iter=100,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b9376feb0>},
                   random_state=0)
0.7596634398978886
0.9883738380592262
Ridge(alpha=0.9883738380592262)


In [146]:
def get_best_alpha_by_randomizedSCV(data, target, regr):
    model_params = {'alpha': [i for i in range(2000)]}
    search = RandomizedSearchCV(regr, model_params, n_iter=200, cv=5, random_state=0)
    search.fit(data, target)
    print(search.best_estimator_)

In [147]:
get_best_alpha_by_randomizedSCV(X, y, Ridge())

Ridge(alpha=80)


In [138]:
get_best_alpha_by_randomizedSCV(X, y, Lasso())

Lasso(alpha=1148)


In [114]:
get_metrics(X, y, Lasso(alpha=1148))

Lasso(alpha=1148)
Training set : R2 = 0.768 RMSE = 36695.44
Testing set : R2 = 0.773 RMSE = 43481.063


In [115]:
get_metrics(X, y, Lasso(alpha=1150))

Lasso(alpha=1150)
Training set : R2 = 0.763 RMSE = 39066.095
Testing set : R2 = 0.81 RMSE = 32963.13
