In [15]:
import numpy as np
import pickle, os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from scipy.stats import boxcox
from scipy.special import inv_boxcox

# Let wide frames show up to 35 columns before pandas elides the middle ones.
pd.set_option("display.max_columns", 35)

In [16]:
# Location of the training CSV, relative to the notebook.
path = '../input/house-prices-advanced-regression-techniques/train.csv'

# Read the file, tag every row with sample == 'train' and make sure the
# index is a plain 0..n-1 range.
hspr = (
    pd.read_csv(path)
    .assign(sample='train')
    .reset_index(drop=True)
)

# Quick look at the first rows and the (rows, columns) shape.
print(hspr.head())
print(hspr.shape)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
0         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
1         Lvl    AllPub       FR2       Gtl      Veenker      Feedr   
2         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
3         Lvl    AllPub    Corner       Gtl      Crawfor       Norm   
4         Lvl    AllPub       FR2       Gtl      NoRidge       Norm   

  Condition2 BldgType HouseStyle  ...  PavedDrive  WoodDeckSF  OpenPorchSF  \
0       Norm     1Fam     2Story  ...           Y 

In [17]:
# Keep an untouched copy of the frame as loaded, before any columns are dropped.
hspr0 = hspr.copy()

# Target ('SalePrice'), the candidate features, and the 'sample' marker column.
cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond',
               'BsmtFinSF1', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath',
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr',
               'ExterQual', 'LotFrontage', 'sample']

# Every kept column except the target must be populated; a missing SalePrice
# alone does not cause a row to be dropped.
required_cols = [col for col in cols_tokeep if col != 'SalePrice']

# Build the working frame as an explicit, independent copy instead of mutating
# a column selection in place: this avoids chained-assignment warnings when
# later cells assign columns on `hspr`.
hspr = hspr[cols_tokeep].dropna(subset=required_cols).copy()
hspr.info()
# Rows with a missing value in any required column were removed above, so the
# non-null counts reported by info() should all equal the remaining row count.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SalePrice     1201 non-null   int64  
 1   LotArea       1201 non-null   int64  
 2   OverallQual   1201 non-null   int64  
 3   OverallCond   1201 non-null   int64  
 4   YearBuilt     1201 non-null   int64  
 5   ExterCond     1201 non-null   object 
 6   BsmtFinSF1    1201 non-null   int64  
 7   TotalBsmtSF   1201 non-null   int64  
 8   HeatingQC     1201 non-null   object 
 9   1stFlrSF      1201 non-null   int64  
 10  2ndFlrSF      1201 non-null   int64  
 11  GrLivArea     1201 non-null   int64  
 12  FullBath      1201 non-null   int64  
 13  HalfBath      1201 non-null   int64  
 14  KitchenQual   1201 non-null   object 
 15  GarageArea    1201 non-null   int64  
 16  GarageCars    1201 non-null   int64  
 17  TotRmsAbvGrd  1201 non-null   int64  
 18  BedroomAbvGr  1201 non-null 

In [18]:
# Quality grades used by the ordinal columns, mapped to the integers 1..5.
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual']

# Encode with an explicit map + integer cast rather than DataFrame.replace:
# replace() relies on silently downcasting object columns to int, which newer
# pandas versions deprecate. Any grade outside `quality_scale` becomes NaN and
# makes the cast raise here, instead of surviving as a string until scaling.
hspr[ord_cols] = (
    hspr[ord_cols]
    .apply(lambda grades: grades.map(quality_scale))
    .astype('int64')
)

# Replace YearBuilt with Age, measured against a fixed reference year.
# NOTE(review): 2010 is presumably the latest sale year in the data - confirm.
REFERENCE_YEAR = 2010
hspr['Age'] = REFERENCE_YEAR - hspr['YearBuilt']
hspr = hspr.drop(columns=['YearBuilt'])

# Transform LotArea with log(1 + x).
hspr['LotArea'] = np.log1p(hspr['LotArea'])

In [19]:
# Preprocessing: split the training rows into features and target, then
# standardise the features.
is_train = hspr['sample'] == 'train'

# Target: SalePrice of the training rows.
y = hspr.loc[is_train, 'SalePrice'].copy()

# Features: every remaining column except the sample marker and the target.
X = hspr.loc[is_train].drop(columns=['sample', 'SalePrice'])

# Fit the scaler on the feature frame and replace X with the scaled array
# returned by fit_transform; `s` keeps the fitted scaler for later use.
s = StandardScaler()
X = s.fit_transform(X)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,ExterCond,BsmtFinSF1,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,ExterQual,LotFrontage,sample,Age
0,208500,9.04204,7,5,3,706,856,5,856,854,1710,2,1,4,548,2,8,3,4,65.0,train,7
1,181500,9.169623,6,8,3,978,1262,5,1262,0,1262,2,0,3,460,2,6,3,3,80.0,train,34
2,223500,9.328212,7,5,3,486,920,5,920,866,1786,2,1,4,608,2,6,3,4,68.0,train,9
3,140000,9.164401,7,5,3,216,756,4,961,756,1717,1,0,4,642,3,7,3,3,60.0,train,95
4,250000,9.565284,8,5,3,655,1145,5,1145,1053,2198,2,1,4,836,3,9,4,4,84.0,train,10
