In [1]:
# Polynomial Regression
# Polynomial regression cannot be applied directly to categorical features; they must first be encoded as numbers.
# However, building polynomial features on top of get_dummies output would greatly increase the dimensionality of the dataset.
# So we must do feature selection first, using the main dataset.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the training data (file name suggests missing values were already handled upstream — TODO confirm).
df = pd.read_csv('train_no_missing.csv')

In [4]:
# Row and column counts of the loaded frame.
df.shape

(1460, 77)

In [5]:
# Rank every numeric column by its correlation with the target.
# The frame also holds non-numeric columns, so they are dropped explicitly:
# pandas >= 2.0 no longer discards them silently and DataFrame.corr() would
# raise on string data. select_dtypes works on older pandas versions as well.
df.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.475241
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.334901
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64

In [6]:
# Let's use only those features whose correlation with SalePrice is above 0.3
# Notice that all of them are numeric features

In [7]:
# Predictors whose correlation with SalePrice exceeds 0.3 in the ranking above,
# kept in descending order of correlation.
selected_columns = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'Fireplaces',
    'BsmtFinSF1', 'LotFrontage', 'WoodDeckSF', '2ndFlrSF',
    'OpenPorchSF',
]
feature_df = df[selected_columns]

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Degree-3 polynomial expansion; include_bias=False leaves out the constant (all-ones) column.
poly_converter = PolynomialFeatures(degree=3, include_bias=False)

In [10]:
# Expand the selected columns into polynomial and interaction terms up to the configured degree.
polynomial_features = poly_converter.fit_transform(feature_df)

In [11]:
# Check how many columns the 17 selected features expand into.
polynomial_features.shape

(1460, 1139)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    polynomial_features,
    df['SalePrice'],
    test_size=0.3,
    random_state=101,
)

In [14]:
# Confirm the size of the training split.
X_train.shape

(1022, 1139)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Scaler used to standardise every polynomial feature before the regularised fits.
scaler = StandardScaler()

In [17]:
# Learn the scaling statistics from the training split only, keeping the test split out of the fit.
scaler.fit(X_train)

StandardScaler()

In [18]:
# Apply the training-set scaling to both splits.
# NOTE: this rebinds X_train / X_test to their scaled versions, so re-running
# this cell on its own would scale the data a second time.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [19]:
from sklearn.linear_model import Ridge,RidgeCV

In [20]:
# Candidate regularisation strengths for RidgeCV's built-in cross-validation.
# NOTE(review): the fitted alpha_ shown further down is 10.0, the largest
# candidate, so a wider grid may be worth exploring.
candidate_alphas = (0.1, 1.0, 10.0)
ridge_cv_model = RidgeCV(alphas=candidate_alphas)

In [21]:
# Fit ridge regression on the scaled training split; alpha is picked from the candidate list.
ridge_cv_model.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [22]:
# Predict SalePrice for the held-out split.
ridge_predictions = ridge_cv_model.predict(X_test)

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [24]:
# Mean absolute error of the ridge model on the held-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=ridge_predictions)
mae

20256.25819397551

In [25]:
# Root mean squared error of the ridge model on the held-out split.
ridge_mse = mean_squared_error(y_test, ridge_predictions)
rmse = np.sqrt(ridge_mse)
rmse

33236.63877305249

In [26]:
# Regularisation strength selected during the RidgeCV fit.
ridge_cv_model.alpha_

10.0

In [27]:
# Since only a small subset of the original features was used, the performance hasn't improved much compared to LR

In [28]:
# Load the test data and keep exactly the columns, in the same order, that
# feature_df was built from, so the already-fitted PolynomialFeatures and
# StandardScaler receive matching input. Reusing feature_df.columns avoids a
# second hand-maintained copy of the 17-name list that could drift out of sync.
df_test = pd.read_csv('test_no_missing.csv')[feature_df.columns]

In [29]:
# Reuse the already-fitted converter (transform only, no refit) on the test columns.
test_features = poly_converter.transform(df_test)

In [30]:
# Scale with the statistics learned from the training split.
scaled_test = scaler.transform(test_features)

In [31]:
# Predict with the fitted ridge model.
final_predictions = ridge_cv_model.predict(scaled_test)

In [32]:
# Quick look at the predicted values.
final_predictions

array([124911.20878348, 142266.11457522, 185052.17834914, ...,
       192437.20441689, 118019.58151784, 238507.70883921])

In [33]:
# Pair each Id from the raw test file with its prediction, side by side.
submission_ids = pd.read_csv('test.csv')['Id']
prediction_frame = pd.DataFrame(final_predictions)
final_ridge = pd.concat([submission_ids, prediction_frame], axis=1)

In [34]:
# Name the two columns; the prediction column otherwise carries the default label 0.
final_ridge.columns = ['Id', 'SalePrice']

In [35]:
# Write only the Id and SalePrice columns; index=False stops pandas from
# emitting the row index as an extra unnamed first column in the file.
final_ridge.to_csv('RidgeSubmission.csv', index=False)

In [36]:
# We can also try ElasticNet

In [37]:
from sklearn.linear_model import ElasticNetCV

In [47]:
# Elastic net with cross-validated mixing ratio (l1_ratio) and penalty strength.
l1_ratio_grid = [.1, .5, .7, .9, .95, .99, 1]
elastic_model = ElasticNetCV(
    l1_ratio=l1_ratio_grid,
    eps=0.001,
    n_alphas=1000,
    max_iter=1000000,
)

In [48]:
# Fit on the same scaled training split that the ridge model used.
elastic_model.fit(X_train, y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000000,
             n_alphas=1000)

In [49]:
# Penalty strength selected during the ElasticNetCV fit.
elastic_model.alpha_

5866.868346200209

In [50]:
# Predict SalePrice for the held-out split with the elastic-net model.
elastic_predictions = elastic_model.predict(X_test)

In [51]:
# Mean absolute error of the elastic-net model on the held-out split.
# Note: this rebinds `mae`, replacing the ridge value computed earlier.
mae = mean_absolute_error(y_true=y_test, y_pred=elastic_predictions)
mae

21176.43204953221

In [52]:
# Root mean squared error of the elastic-net model on the held-out split.
# Note: this rebinds `rmse`, replacing the ridge value computed earlier.
elastic_mse = mean_squared_error(y_test, elastic_predictions)
rmse = np.sqrt(elastic_mse)
rmse

45150.653871846065

In [46]:
# ElasticNet performed worse than Ridge (higher MAE and RMSE on the held-out split)