In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [115]:
# read in data
train_data = pd.read_csv('../datasets/train_clean_final.csv')
test_data = pd.read_csv('../datasets/test_clean.csv')

In [118]:
# remove unnamed and extra columns
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)


## Final Predictions
In this notebook, we will use the models and hyperparameters we determined in our other notebooks to train on all the train data, predict on the test data, and create our submission csvs for Kaggle

In [119]:
# define training and testing data and IDs
X_train = train_data.drop(columns = ['saleprice', 'id'])
X_test = test_data.drop(columns = ['id'])
y_train = train_data['saleprice']
IDs = test_data['id']

### Linear Regression

In [120]:
# Instantiate, fit and score our model on train data
lr = LinearRegression()
lr.fit(X_train, y_train)
# Train score
print(f'train r-squared score: {lr.score(X_train, y_train)}')

train r-squared score: 0.8551992468329397


In [121]:
# Print RMSE for training data
y_preds = lr.predict(X_train)
mse = mean_squared_error(y_train, y_preds, squared=False)
print(f'RMSE is {mse}')

RMSE is 30079.050958826432


In [112]:
# predict saleprice for test data
y_pred_final = lr.predict(X_test)

In [113]:
# create dataframe for submission
lr_df = pd.DataFrame()
lr_df['Id'] = IDs
lr_df['SalePrice'] = y_pred_final

In [114]:
# write to csv for submission
lr_df.to_csv('../Submissions/lr_preds_3.csv')

### Ridge Regression

#### Standard Scaling

In [89]:
# Scale our data.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

#### Cross Validate and Predict

In [90]:
# define list of alphas
alphas = np.logspace(1, 3, 1000)

In [91]:
# cross validate ridge regression over a range of alphas

ridge_cv = RidgeCV(alphas=alphas, scoring='r2', cv=8)
ridge_cv.fit(Z_train, y_train)

RidgeCV(alphas=array([  10.        ,   10.04620421,   10.09262191,   10.13925408,
         10.1861017 ,   10.23316578,   10.28044732,   10.32794732,
         10.37566679,   10.42360674,   10.47176819,   10.52015218,
         10.56875971,   10.61759183,   10.66664958,   10.715934  ,
         10.76544613,   10.81518703,   10.86515775,   10.91535935,
         10.96579291,   11.0164595 ,   11.06736018,   11.11849605,
         11.16986818,   11.22147768,   11.27332564,   11.32541315,
         11.3...
        882.96999555,  887.04968897,  891.14823228,  895.2657126 ,
        899.40221741,  903.55783461,  907.73265252,  911.92675985,
        916.14024571,  920.37319966,  924.62571164,  928.89787202,
        933.18977157,  937.50150151,  941.83315346,  946.18481947,
        950.55659201,  954.94856398,  959.36082871,  963.79347996,
        968.24661193,  972.72031925,  977.21469697,  981.72984062,
        986.26584613,  990.8228099 ,  995.40082876, 1000.        ]),
        cv=8, scoring='r2')

In [92]:
# optimal ridge alpha for entire training data
ridge_cv.alpha_

45.56786265841064

In [93]:
# fit ridge model with optimal alpha
ridge = Ridge(alpha = 45.56786265841064)
ridge.fit(Z_train, y_train)

Ridge(alpha=45.56786265841064)

In [94]:
# print R2 scores for train data
print(ridge.score(Z_train, y_train))

0.8546021953948765


In [95]:
# Final Predictions
y_pred_final = ridge.predict(Z_test)

In [96]:
# create dataframe for submission
ridge_df = pd.DataFrame()
ridge_df['Id'] = IDs
ridge_df['SalePrice'] = y_pred_final

In [97]:
# write to csv for submission
ridge_df.to_csv('../Submissions/ridge_preds_2.csv')

### Lasso Regression

In [66]:
# cross validate ridge regression over a range of alphas
alphas = np.linspace(0, 500, 5000)
lasso_cv = LassoCV(alphas=alphas, n_alphas=100, cv=5, random_state=21, max_iter=10000)
lasso_cv.fit(Z_train, y_train)

  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,


LassoCV(alphas=array([0.00000000e+00, 1.00020004e-01, 2.00040008e-01, ...,
       4.99799960e+02, 4.99899980e+02, 5.00000000e+02]),
        cv=5, max_iter=10000, random_state=21)

In [67]:
# print the optimal value of alpha
lasso_cv.alpha_

189.9379875975195

In [68]:
# fit lasso with optimal alpha
lasso = Lasso(alpha=189.9379875975195)
lasso.fit(Z_train, y_train)

Lasso(alpha=189.9379875975195)

In [69]:
# print R2 scores
print(lasso.score(Z_train, y_train))

0.8548806384621117


In [53]:
# Final Predictions
y_pred_final = lasso.predict(Z_test)

In [54]:
# create dataframe for submission
lasso_df = pd.DataFrame()
lasso_df['Id'] = IDs
lasso_df['SalePrice'] = y_pred_final

In [55]:
# write to csv for submission
lasso_df.to_csv('../Submissions/lasso_preds_1.csv')