# Made Using Google Colab

In [4]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import cross_val_score

from feature_select import Train


def rmsle(pred, true):
  """Return the root mean squared logarithmic error of two value arrays.

  Thin wrapper that takes the square root of scikit-learn's
  ``mean_squared_log_error``.

  Args:
    pred: Predicted target values.
    true: Ground-truth target values.

  Returns:
    The square root of the mean squared log error.

  Note:
    ``make_scorer`` invokes its score function as ``(y_true, y_pred)``, so
    when this function is used through a scorer the two arguments arrive in
    the opposite order to their names. The unweighted error is symmetric in
    its two inputs, so the returned value is unaffected.
  """
  msle = mean_squared_log_error(pred, true)
  return msle ** 0.5


# Wrap rmsle as a scikit-learn scorer. greater_is_better=False makes
# scikit-learn negate the value, so every CV score reported through `scorer`
# is a negative RMSLE (closer to zero is better).
scorer = make_scorer(rmsle, greater_is_better=False)

# Train is the project helper imported from feature_select.
train = Train(pd.read_csv('train_cleaned.csv'))

# Split the data: SalePrice is the target, every other column is a feature.
X = train.cleaned.drop(columns='SalePrice')
y = train.cleaned['SalePrice']

# NOTE(review): the return order below (train, val, test; X before y) is taken
# on trust from Train.train_val_test_split -- confirm against feature_select.
X_train, y_train, X_val, y_val, X_test, y_test = train.train_val_test_split(
    X=X, y=y, random_state=42
)

# Sanity check: print the shape of each split.
print(X_train.shape, y_train.shape, X_val.shape,
      y_val.shape, X_test.shape, y_test.shape)

(751, 25) (751,) (251, 25) (251,) (251, 25) (251,)


In [6]:
# Baseline: plain linear regression scored with 5-fold cross validation
# on the training split.
lr = LinearRegression()
score = cross_val_score(lr, X_train, y_train, scoring=scorer, cv=5)

# Each fold score is a negated RMSLE (the scorer was built with
# greater_is_better=False), so the mean printed below is negative too.
mean_score = np.mean(score)
print(f'Mean Score (rmsle) for Train: {mean_score}')
print(score)

Mean Score (rmsle) for Train: -0.1568687571587947
[-0.16153411 -0.17553237 -0.15370515 -0.14908811 -0.14448405]


In [45]:
# Ridge regression: grid-search the penalty strength (alpha)
# using 10-fold cross validation on the training split.

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


# NOTE(review): `normalize` was deprecated and later removed in newer
# scikit-learn releases -- confirm the installed version still accepts it.
ridge = Ridge(normalize=True)

# Candidate penalty strengths, from effectively unpenalised up to 100.
# The Lasso search further down reuses this same grid.
params_lambda = {
    'alpha': [
        1e-15, 1e-10, 1e-08, 0.001, 0.01, 0.1,
        1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100,
    ],
}

grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=params_lambda,
    scoring=scorer,
    cv=10,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=True, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 0.1, 1, 5,
                                   10, 20, 30, 35, 40, 45, 50, 55, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=0)

In [46]:
# best_score_ is reported through `scorer`, which was built with
# greater_is_better=False, so the RMSLE printed here is negated.
print('Best Alpha: ', grid_search.best_params_)
print('Best RMSLE: ', grid_search.best_score_)

Best Alpha:  {'alpha': 0.1}
Best RMSLE:  -0.1524478797316533


In [47]:
# Evaluate the Ridge search on the held-out test and validation splits.
y_pred_t = grid_search.predict(X_test)
y_pred_v = grid_search.predict(X_val)
# rmsle is called directly here (not through `scorer`), so these values are
# not negated. First line printed: test split; second line: validation split.
print(rmsle(y_pred_t, y_test))
print(rmsle(y_pred_v, y_val))

0.1606310098347631
0.16362209330053964


In [48]:
# Predict SalePrice for the rows in test.csv and assemble a submission frame
# with one row per Id.
test = Train(pd.read_csv('test.csv'))
cleaned_df = test.clean_df()

IDs = cleaned_df['Id']
# Drop the identifier by name rather than by position: slicing columns[1:]
# only works if 'Id' happens to be the first column, and would otherwise
# silently feed 'Id' to the model as a feature. Column order of the remaining
# features is preserved.
feat = cleaned_df.drop(columns='Id')

# When the notebook is run top to bottom, `grid_search` is the Ridge search
# at this point.
y_pred = grid_search.predict(feat)
submission_gs_ridge = pd.DataFrame({'Id': IDs, 'SalePrice': y_pred})
submission_gs_ridge.head()

Unnamed: 0,Id,SalePrice
0,1461,117624.51195
1,1462,166843.115987
2,1463,173419.47309
3,1464,182080.834126
4,1465,177360.882334


In [49]:
# Uncomment the next line to write the Ridge submission to a CSV file.
# submission_gs_ridge.to_csv('grid_search_ridge0.csv', index=False)

In [38]:
# Lasso regression: grid-search the penalty strength (alpha)
# using 10-fold cross validation on the training split.

from sklearn.linear_model import Lasso


# Tolerance is raised from 0.0001 to 0.1 to suppress some warnings.
# NOTE(review): a looser tolerance lets the solver stop earlier, which may
# affect the fitted coefficients -- consider raising max_iter instead; confirm.
lasso = Lasso(tol=0.1)

# params_lambda is the same alpha grid used for the Ridge search.
# This rebinds `grid_search`, so from here on it refers to the Lasso search.
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=params_lambda,
    scoring=scorer,
    cv=10,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.1, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 0.1, 1, 5,
                                   10, 20, 30, 35, 40, 45, 50, 55, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=0)

In [39]:
# Best alpha and score of the Lasso search. best_score_ is reported through
# `scorer`, which was built with greater_is_better=False, so the RMSLE printed
# here is negated.
print('Best Alpha: ', grid_search.best_params_)
print('Best RMSLE: ', grid_search.best_score_)

Best Alpha:  {'alpha': 100}
Best RMSLE:  -0.1574323790697726


In [40]:
# Evaluate the Lasso search on the held-out test and validation splits
# (in document order, `grid_search` was rebound to the Lasso search above).
y_pred_t = grid_search.predict(X_test)
y_pred_v = grid_search.predict(X_val)
# rmsle is called directly here (not through `scorer`), so these values are
# not negated. First line printed: test split; second line: validation split.
print(rmsle(y_pred_t, y_test))
print(rmsle(y_pred_v, y_val))

0.1660785449996377
0.16474253926508842


In [None]:
# The Lasso results are no better than the Ridge results, so I conclude that
# the features I am using are important enough to keep.