In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import norm, skew
import sklearn
from dython import nominal, data_utils

# Change display limit to prevent truncating
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Data Loading

In [2]:
# Loading of train and test data
train_data_raw = pd.read_csv('../data/train.csv')

In [3]:
from sklearn.model_selection import train_test_split

y = train_data_raw["SalePrice"]
X = train_data_raw.drop("SalePrice", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.shape

(1022, 81)

In [4]:
train_data.tail(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
763,764,60,RL,82.0,9430,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,1999,1999,Gable,CompShg,VinylSd,VinylSd,BrkFace,673.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,1163,Unf,0,89,1252,GasA,Ex,Y,SBrkr,1268,1097,0,2365,1,0,2,1,3,1,Gd,8,Typ,1,Gd,Attchd,1999.0,RFn,3,856,TA,TA,Y,0,128,0,0,180,0,,,,0,7,2009,WD,Normal,337000
835,836,20,RL,60.0,9600,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,1Fam,1Story,4,7,1950,1995,Gable,CompShg,VinylSd,HdBoard,,0.0,TA,TA,CBlock,Gd,TA,No,BLQ,442,Unf,0,625,1067,GasA,TA,Y,SBrkr,1067,0,0,1067,0,0,2,0,2,1,Gd,4,Min2,0,,Attchd,1996.0,Unf,2,436,TA,TA,Y,290,0,0,0,0,0,,,,0,2,2010,WD,Normal,128000
1216,1217,90,RM,68.0,8930,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,RRAe,Norm,Duplex,1.5Fin,6,5,1978,1978,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,Slab,,,,,0,,0,0,0,GasA,TA,Y,SBrkr,1318,584,0,1902,0,0,2,0,4,2,TA,8,Typ,0,,Attchd,1978.0,Unf,2,539,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal,112000
559,560,120,RL,,3196,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2003,2004,Gable,CompShg,VinylSd,VinylSd,BrkFace,18.0,Gd,TA,PConc,Gd,TA,Gd,Unf,0,Unf,0,1374,1374,GasA,Ex,Y,SBrkr,1557,0,0,1557,0,0,2,0,2,1,Gd,7,Typ,1,TA,Attchd,2003.0,Fin,2,420,TA,TA,Y,143,20,0,0,0,0,,,,0,10,2006,WD,Normal,234000
684,685,60,RL,58.0,16770,Pave,,IR2,Lvl,AllPub,CulDSac,Gtl,NoRidge,Norm,Norm,1Fam,2Story,7,5,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,30.0,Gd,TA,PConc,Gd,TA,No,Unf,0,Unf,0,1195,1195,GasA,Gd,Y,SBrkr,1195,644,0,1839,0,0,2,1,4,1,TA,7,Typ,0,,Attchd,1998.0,Fin,2,486,TA,TA,Y,0,81,0,0,0,0,,,,0,6,2010,WD,Normal,221000


## Model training

### Data Preprocessing

In [5]:
# Adjust Feature Types, Some categorical features appeared as continuous
train_data = train_data.astype({
    "MSSubClass": object,
    "OverallQual": object,
    "OverallCond": object,
    "MoSold": object,
    "YrSold": object,
    "YearBuilt": object,
    "YearRemodAdd": object,
    "BsmtFullBath": object,
    "BsmtHalfBath": object,
    "FullBath": object,
    "HalfBath": object,
    "BedroomAbvGr": object,
    "KitchenAbvGr": object,
    "TotRmsAbvGrd": object,
    "Fireplaces": object,
    "GarageYrBlt": object,
    "GarageCars": object
})

In [6]:
# Create additional features for better predictions

from datetime import date

todays_date = date.today()
train_data['YearsSinceBuilt'] = todays_date.year - train_data['YearBuilt']
train_data['YearsSinceRemodAdd'] = todays_date.year - train_data['YearRemodAdd']
train_data['YearsSinceGarageYrBlt'] = todays_date.year - train_data['GarageYrBlt']

### Data Cleaning

In [7]:
selected_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'HouseStyle', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'YearsSinceBuilt', 'YearsSinceRemodAdd', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'Heating', 'CentralAir', 'GarageType', 'GarageFinish', 'BsmtQual', 'SalePrice']
train_data = train_data[selected_features]

In [8]:
%%capture --no-display
# Specify null values in BsmtQual as No (No basement)
train_data.loc[:, 'BsmtQual'] = train_data.loc[:, 'BsmtQual'].fillna('No')
train_data.loc[:, 'GarageType'] = train_data.loc[:, 'GarageType'].fillna('No')
train_data.loc[:, 'GarageFinish'] = train_data.loc[:, 'GarageFinish'].fillna('No')
train_data.loc[:, 'FireplaceQu'] = train_data.loc[:, 'FireplaceQu'].fillna('No')
train_data.loc[:, 'BsmtExposure'] = train_data.loc[:, 'BsmtExposure'].fillna('No')
train_data.loc[:, 'YearsSinceGarageYrBlt'] = train_data.loc[:, 'YearsSinceGarageYrBlt'].fillna(0)

train_data = train_data.dropna(subset='Electrical')
train_data.reset_index(inplace=True)

train_data.isna().values.sum()

0

### Data Selection

In [9]:
selected_categorical_features = ['HouseStyle', 'CentralAir', 'GarageType', 'GarageFinish', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'Heating', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'BsmtQual']
selected_continuous_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'YearsSinceBuilt', 'YearsSinceRemodAdd']

In [10]:
# Our target feature
# Transformate it using the log function to correct the distribution
y_train = np.log1p(train_data['SalePrice'])

In [11]:
# Our predictors
X_train_cat = train_data[selected_categorical_features]
X_train_con = train_data[selected_continuous_features]

### Data Transformation

In [12]:
# Transform the continuous features with the skewness of more than 0.5
skewness = X_train_con.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 0.5]
skewed_features = skewness.index
X_train_con[skewed_features] = np.log1p(X_train_con[skewed_features].astype(float))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_con[skewed_features] = np.log1p(X_train_con[skewed_features].astype(float))


### Data Encoding

#### Ordinal Encoding

In [13]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[
    ["No", "Po", "Fa", "TA", "Gd", "Ex"],
    ["N", "Y"],
    ["Po", "Fa", "TA", "Gd", "Ex"],
    ["Po", "Fa", "TA", "Gd", "Ex"],
    ["No", "Unf", "RFn", "Fin"],
    ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    ["No", "Po", "Fa", "TA", "Gd", "Ex"],
    ["No", "Mn", "Av", "Gd"]
])

ordinal_encoded_colums = ['BsmtQual', 'CentralAir', 'ExterQual', 'KitchenQual', 'GarageFinish', 'Functional', 'FireplaceQu', 'BsmtExposure']
X_train_cat_ordinal_encoded_array = oe.fit_transform(X_train_cat[ordinal_encoded_colums])
X_train_cat_ordinal_encoded = pd.DataFrame(X_train_cat_ordinal_encoded_array, columns=ordinal_encoded_colums)
X_train_cat_not_one_encoded = X_train_cat.drop(ordinal_encoded_colums, axis=1)
X_train_cat = pd.concat([X_train_cat_ordinal_encoded, X_train_cat_not_one_encoded], axis=1)

#### One-Hot Encoding

In [14]:
# Apply One-Hot Encoding to Neighborhood
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

one_hot_encoded_colums = ['HouseStyle', 'GarageType', 'Neighborhood', 'Heating', 'Electrical', 'BldgType']
X_train_cat_hot_encoded_array = ohe.fit_transform(X_train_cat[one_hot_encoded_colums]).toarray()
X_train_cat_hot_encoded = pd.DataFrame(X_train_cat_hot_encoded_array, columns=ohe.get_feature_names_out())
X_train_cat_not_one_encoded = X_train_cat.drop(one_hot_encoded_colums, axis=1)
X_train_cat = pd.concat([X_train_cat_hot_encoded, X_train_cat_not_one_encoded], axis=1)

### Model training

In [15]:
# Metric
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train = pd.concat([X_train_con, X_train_cat], axis = 1)

# Scaling
std_scaler = StandardScaler()
X_train.loc[:, selected_continuous_features] = std_scaler.fit_transform(X_train.loc[:, selected_continuous_features])

In [17]:
from sklearn.linear_model import LinearRegression

# Train
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

## Model evaluation

In [18]:
test_data.tail(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
271,272,20,RL,73.0,39104,Pave,,IR1,Low,AllPub,CulDSac,Sev,ClearCr,Norm,Norm,1Fam,1Story,7,7,1954,2005,Flat,Membran,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,Gd,LwQ,226,GLQ,1063,96,1385,GasA,Ex,Y,SBrkr,1363,0,0,1363,1,0,1,0,2,1,TA,5,Mod,2,TA,Attchd,1954.0,Unf,2,439,TA,TA,Y,81,0,0,0,0,0,,,,0,4,2008,WD,Normal,241500
445,446,20,RL,73.0,9855,Pave,,Reg,Lvl,AllPub,Corner,Gtl,Edwards,Norm,Norm,1Fam,1Story,6,5,1956,1956,Hip,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,1436,1436,GasA,Fa,Y,SBrkr,1689,0,0,1689,0,0,1,0,3,1,TA,7,Typ,1,Gd,Attchd,1956.0,Unf,2,480,TA,TA,Y,0,0,0,0,0,0,,MnPrv,,0,11,2009,COD,Normal,127500
654,655,20,RL,91.0,10437,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NoRidge,Norm,Norm,1Fam,1Story,8,6,1995,1995,Hip,CompShg,MetalSd,MetalSd,BrkFace,660.0,Gd,Gd,PConc,Gd,TA,Gd,GLQ,1696,Unf,0,413,2109,GasA,Ex,Y,SBrkr,2113,0,0,2113,1,0,2,1,2,1,Gd,7,Typ,1,TA,Attchd,1995.0,Fin,3,839,TA,TA,Y,236,46,0,0,0,0,,,,0,8,2008,WD,Normal,350000
1280,1281,20,RL,67.0,9808,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,1Story,7,5,2002,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,110.0,Gd,TA,PConc,Gd,TA,No,GLQ,788,Unf,0,785,1573,GasA,Ex,Y,SBrkr,1573,0,0,1573,1,0,2,0,3,1,Gd,6,Typ,0,,Attchd,2002.0,RFn,2,544,TA,TA,Y,0,72,0,0,0,0,,,,0,3,2009,WD,Normal,227000
898,899,20,RL,100.0,12919,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NridgHt,Norm,Norm,1Fam,1Story,9,5,2009,2010,Hip,CompShg,VinylSd,VinylSd,Stone,760.0,Ex,TA,PConc,Ex,TA,Gd,GLQ,2188,Unf,0,142,2330,GasA,Ex,Y,SBrkr,2364,0,0,2364,1,0,2,1,2,1,Ex,11,Typ,2,Gd,Attchd,2009.0,Fin,3,820,TA,TA,Y,0,67,0,0,0,0,,,,0,3,2010,New,Partial,611657


In [19]:
test_data.shape

(438, 81)

### Data Preprocessing

In [20]:
# Adjust Feature Types, Some categorical features appeared as continuous
test_data = test_data.astype({
    "MSSubClass": object,
    "OverallQual": object,
    "OverallCond": object,
    "MoSold": object,
    "YrSold": object,
    "YearBuilt": object,
    "YearRemodAdd": object,
    "BsmtFullBath": object,
    "BsmtHalfBath": object,
    "FullBath": object,
    "HalfBath": object,
    "BedroomAbvGr": object,
    "KitchenAbvGr": object,
    "TotRmsAbvGrd": object,
    "Fireplaces": object,
    "GarageYrBlt": object,
    "GarageCars": object
})

In [21]:
todays_date = date.today()
test_data['YearsSinceBuilt'] = todays_date.year - test_data['YearBuilt']
test_data['YearsSinceRemodAdd'] = todays_date.year - test_data['YearRemodAdd']
test_data['YearsSinceGarageYrBlt'] = todays_date.year - test_data['GarageYrBlt']

### Data Cleaning

In [22]:
selected_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'HouseStyle', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'YearsSinceBuilt', 'YearsSinceRemodAdd', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'Heating', 'CentralAir', 'GarageType', 'GarageFinish', 'BsmtQual', 'SalePrice']
test_data = test_data[selected_features]

In [23]:
%%capture --no-display
# Specify null values in BsmtQual as No (No basement)
test_data.loc[:, 'BsmtQual'] = test_data.loc[:, 'BsmtQual'].fillna('No')
test_data.loc[:, 'GarageType'] = test_data.loc[:, 'GarageType'].fillna('No')
test_data.loc[:, 'GarageFinish'] = test_data.loc[:, 'GarageFinish'].fillna('No')
test_data.loc[:, 'FireplaceQu'] = test_data.loc[:, 'FireplaceQu'].fillna('No')
test_data.loc[:, 'BsmtExposure'] = test_data.loc[:, 'BsmtExposure'].fillna('No')
test_data.loc[:, 'YearsSinceGarageYrBlt'] = test_data.loc[:, 'YearsSinceGarageYrBlt'].fillna(0)

test_data = test_data.dropna(subset='Electrical')
test_data.reset_index(inplace=True)

test_data.isna().values.sum()

0

### Data Selection

In [24]:
selected_categorical_features = ['HouseStyle', 'CentralAir', 'GarageType', 'GarageFinish', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'Heating', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'BsmtQual']
selected_continuous_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'YearsSinceBuilt', 'YearsSinceRemodAdd']

# Our target feature
# Transformate it using the log function to correct the distribution
y_test = np.log1p(test_data['SalePrice'])

# Our predictors
X_test_cat = test_data[selected_categorical_features]
X_test_con = test_data[selected_continuous_features]

### Data Transformation

In [25]:
skewness = X_test_con.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 0.5]
skewed_features = skewness.index
X_test_con[skewed_features] = np.log1p(X_test_con[skewed_features].astype(float))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_con[skewed_features] = np.log1p(X_test_con[skewed_features].astype(float))


### Data Encoding

#### Ordinal Encoding

In [26]:
X_test_cat_ordinal_encoded_array = oe.transform(X_test_cat[ordinal_encoded_colums])
X_test_cat_ordinal_encoded = pd.DataFrame(X_test_cat_ordinal_encoded_array, columns=ordinal_encoded_colums)
X_test_cat_not_one_encoded = X_test_cat.drop(ordinal_encoded_colums, axis=1)
X_test_cat = pd.concat([X_test_cat_ordinal_encoded, X_test_cat_not_one_encoded], axis=1)

#### One-Hot Encoding

In [27]:
X_test_cat_hot_encoded_array = ohe.transform(X_test_cat[one_hot_encoded_colums]).toarray()
X_test_cat_hot_encoded = pd.DataFrame(X_test_cat_hot_encoded_array, columns=ohe.get_feature_names_out())
X_test_cat_not_one_encoded = X_test_cat.drop(one_hot_encoded_colums, axis=1)
X_test_cat = pd.concat([X_test_cat_hot_encoded, X_test_cat_not_one_encoded], axis=1)

### Model predictions

In [28]:
X_test = pd.concat([X_test_con, X_test_cat], axis = 1)

# Scaling
X_test.loc[:, selected_continuous_features] = std_scaler.transform(X_test.loc[:, selected_continuous_features])

y_pred = model.predict(X_test)

### Model evaluation

In [29]:
compute_rmsle(y_test, y_pred)

0.01

The <b>RMSLE</b> score is 0.01 <br>
I am trying improve it but so far no success.