In [1]:
""" Script for Kaggle Housing Price prediction competition. Copied to Notebook from Spyder.
Created on Sat Jan 29 12:34:42 2022"""

import numpy as np
import pickle, os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from scipy.stats import boxcox
from scipy.special import inv_boxcox

# Silence pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)

# Show up to 35 columns when a DataFrame is printed.
pd.set_option('display.max_columns', 35)

#os.chdir("H:/Dropbox/Kaggle/house_prices")
#path 

In [2]:
# Load the Kaggle train and test files and stack them into a single frame so
# that every cleaning step is applied to both in exactly the same way.
# The 'sample' column records which file each row came from.
hspr = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
hspr['sample'] = 'train'

test_s = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Add the target column as NaN for the test rows so both frames have the same
# columns; one explicit assignment per column keeps SalePrice a float column.
test_s['SalePrice'] = np.nan
test_s['sample'] = 'test'

# ignore_index=True rebuilds a clean 0..n-1 index in the same step.
hspr = pd.concat([hspr, test_s], ignore_index=True)
print(hspr.head())
print(hspr.shape)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
0         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
1         Lvl    AllPub       FR2       Gtl      Veenker      Feedr   
2         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
3         Lvl    AllPub    Corner       Gtl      Crawfor       Norm   
4         Lvl    AllPub       FR2       Gtl      NoRidge       Norm   

  Condition2 BldgType HouseStyle  ...  PavedDrive  WoodDeckSF  OpenPorchSF  \
0       Norm     1Fam     2Story  ...           Y 

In [3]:
#%% data cleaning ###

# Restrict the combined train+test frame to the features used by the models,
# plus the target ('SalePrice'), the row identifier ('Id') and the 'sample' tag.
cols_tokeep = ['Id', 'SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond',
               'BsmtFinSF1', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath', 
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr',
               'ExterQual', 'sample']
# .copy() makes hspr an independent frame, so the edits below never act on a
# view of the wider frame it was sliced from.
hspr = hspr[cols_tokeep].copy()
hspr.info()
hspr0 = hspr.copy()  # snapshot that still contains 'Id' (dropped from hspr on the next line)
hspr = hspr.drop(columns=['Id'])


# Encode the ordinal quality grades as integers 1 (Po) .. 5 (Ex).
# Series.map leaves missing values as NaN and yields numeric columns directly,
# without relying on DataFrame.replace's implicit object-dtype downcasting.
ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual']
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for col in ord_cols:
    hspr[col] = hspr[col].map(quality_scale)
#print(hspr.BsmtCond.value_counts())

# Fill the remaining missing values with the column minimum (computed over the
# combined train+test rows). Assigning the filled column back replaces the
# former chained assignment `hspr[col][mask] = value`, which is not guaranteed
# to write through to hspr and does nothing under pandas Copy-on-Write.
cols_fill_min = ['BsmtFinSF1', 'TotalBsmtSF', 'KitchenQual', 'GarageArea', 'GarageCars']
for col in cols_fill_min:
    hspr[col] = hspr[col].fillna(hspr[col].min())
hspr.info()

# it makes sense to replace YearBuilt with Age
# NOTE(review): 2010 is the reference year for the age; presumably the latest
# sale year in the data -- confirm.
hspr['Age'] = 2010 - hspr['YearBuilt']
hspr = hspr.drop(columns=['YearBuilt'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2919 non-null   int64  
 1   SalePrice     1460 non-null   float64
 2   LotArea       2919 non-null   int64  
 3   OverallQual   2919 non-null   int64  
 4   OverallCond   2919 non-null   int64  
 5   YearBuilt     2919 non-null   int64  
 6   ExterCond     2919 non-null   object 
 7   BsmtFinSF1    2918 non-null   float64
 8   TotalBsmtSF   2918 non-null   float64
 9   HeatingQC     2919 non-null   object 
 10  1stFlrSF      2919 non-null   int64  
 11  2ndFlrSF      2919 non-null   int64  
 12  GrLivArea     2919 non-null   int64  
 13  FullBath      2919 non-null   int64  
 14  HalfBath      2919 non-null   int64  
 15  KitchenQual   2918 non-null   object 
 16  GarageArea    2918 non-null   float64
 17  GarageCars    2918 non-null   float64
 18  TotRmsAbvGrd  2919 non-null 

In [4]:
#%% check for skew and outliers ###

# Skewness is inspected only for the columns stored as int64.
skew_limit = 1
is_int64 = (hspr.dtypes == np.int64)
num_cols = hspr.columns[is_int64]
skew_vals = hspr[num_cols].skew()

# Rank the columns by skewness, then keep those whose absolute skew exceeds
# the limit.
skew_table = skew_vals.sort_values(ascending=False).to_frame(name='Skew')
skew_cols = skew_table[skew_table['Skew'].abs() > skew_limit]

print(skew_cols)

# LotArea is log-transformed (log1p); the last line shows its skew afterwards.
hspr['LotArea'] = np.log1p(hspr['LotArea'])
hspr['LotArea'].skew()

                Skew
LotArea    12.829025
1stFlrSF    1.470360
ExterCond   1.316590
GrLivArea   1.270010


-0.505010100221913

In [5]:
#%% Model fitting ###

# Training rows only; the target is modelled on the log scale.
train_mask = hspr['sample'] == 'train'
X = hspr[train_mask].drop(columns=['sample', 'SalePrice'])
feature_names = X.columns  # saved because scaling turns X into a plain ndarray
y = np.log(hspr.loc[train_mask, 'SalePrice'])

# NOTE: the scaler is fitted on the full training set before cross-validation,
# so every CV fold shares the same scaling statistics.
s = StandardScaler()
X = s.fit_transform(X)

### first, fit ols ###

lm = LinearRegression()
predictions = cross_val_predict(lm, X, y, cv = 10)
score_lm = r2_score(y, predictions)
print(score_lm)
# recorded run: R^2 ~ 0.848

### second, try lasso ###

lasso = Lasso(max_iter=100000)
params = {
    'alpha': np.geomspace(0.0001, 1, 15)
}

grid_lasso = GridSearchCV(lasso, params, cv=10)

grid_lasso.fit(X, y)
print(grid_lasso.best_score_, grid_lasso.best_params_)
# recorded run: R^2 ~ 0.846

# Inspect the coefficients of the *tuned* model. Refitting the base estimator
# here would silently use the default alpha=1.0 instead of the selected alpha;
# best_estimator_ is already refitted on all of X, y by GridSearchCV.
lasso = grid_lasso.best_estimator_
print(pd.Series(lasso.coef_, index=feature_names))

### third, try ridge with polyfeatures ###

estimator = Pipeline([("polynomial_features", PolynomialFeatures()),
        ("ridge_regression", Ridge())])

params = {
    'polynomial_features__degree': [1, 2, 3],
    'ridge_regression__alpha': np.linspace(50, 500, 10)
}

# This search stays bound to the name `grid` after the cell finishes.
grid = GridSearchCV(estimator, params, cv=10)

grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
#grid.grid_scores_
# recorded run: R^2 ~ 0.882

# exp of the mean absolute error in log space, measured on the training data
# itself (in-sample, so optimistic).
np.exp((np.absolute(grid.predict(X)-y)).mean())

0.8481928795025486
0.8460036486983817 {'alpha': 0.0026826957952797246}
0.8818119355396915 {'polynomial_features__degree': 2, 'ridge_regression__alpha': 300.0}


1.0867007282027819

In [6]:
#%% predict ###

# Rows that came from the Kaggle test file.
is_test = hspr['sample'] == 'test'

# Drop the tag and target columns, then scale with the already-fitted scaler `s`.
X_test = hspr.loc[is_test].drop(columns=['sample', 'SalePrice'])
X_test = s.transform(X_test)

# Exponentiate the model output before writing it out as SalePrice.
yhat = np.exp(grid.predict(X_test))

# Ids come from the snapshot hspr0, which still holds the 'Id' column.
id_ = hspr0.loc[hspr0['sample'] == 'test', 'Id']

results = pd.DataFrame({'Id': id_, 'SalePrice': yhat}, columns=['Id', 'SalePrice'])
results.to_csv('HousePrices_subm6_3.csv', index=False)