# Kaggle House competition
Trying to predict house sale prices.

In [1]:
# all imports I need
import pandas as pd
import numpy as np
#for cross_feature
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_absolute_error, mean_squared_error

#models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

#for pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#preprocessing
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer, PowerTransformer

#feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from scipy import stats

import seaborn as sns
import matplotlib as mplt
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def transform_df(input_df, y_full):
    """Assemble the feature frame: selected raw columns plus cross features.

    A copy of ``input_df`` is extended with ``Total_Bath`` (``FullBath`` +
    ``HalfBath``), ``get_cross_features`` is asked for the pairwise product
    columns and the names of the ones it selected, and the chosen columns
    are returned.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Predictor frame; must contain every column named in the lists below.
        It is not modified.
    y_full : pandas.Series
        Target series, forwarded to ``get_cross_features``.

    Returns
    -------
    pandas.DataFrame
        The "very good" categorical columns, the well-correlated numeric
        columns and the selected cross features, in that order.
    """
    frame = input_df.copy()
    frame['Total_Bath'] = frame['FullBath'] + frame['HalfBath']

    # Categorical candidates, grouped by whether they contain NaN.
    features_cat_from_nan_very_good = ['BsmtQual', 'GarageFinish', 'FireplaceQu', 'MasVnrType',
                                       'GarageType', 'BsmtFinType1', 'BsmtExposure']
    features_cat_no_nan_very_good = ['ExterQual', 'KitchenQual', 'Foundation', 'CentralAir', 'HeatingQC']
    # The two "maybe good" groups are currently left out of the returned frame.
    features_cat_from_nan_maybe_good = ['GarageQual', 'GarageCond', 'Electrical', 'BsmtCond', 'Alley', 'Fence', 'PoolQC']
    features_cat_no_nan_maybe_good = ['SaleCondition', 'MSZoning', 'PavedDrive', 'LotShape', 'SaleType',
                                      'HouseStyle', 'RoofStyle', 'BldgType', 'LandContour']
    features_num_good_corr = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF',
                              'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'Total_Bath']

    crossed_df, cross_features = get_cross_features(frame, y_full)

    selected_columns = (
        features_cat_from_nan_very_good
        + features_cat_no_nan_very_good
        + features_num_good_corr
        + cross_features
    )
    return frame.join(crossed_df)[selected_columns]

In [3]:
def get_cross_features(df_in, y_full, min_corr=0.5):
    """Build pairwise product features and pick those correlated with the target.

    For every unordered pair of numeric (or boolean) columns ``a`` and ``b``
    of ``df_in`` a column named ``a_x_b`` holding ``df_in[a] * df_in[b]`` is
    created. The correlation of each product with ``y_full`` is computed and
    the names whose correlation is strictly greater than ``min_corr`` are
    returned, sorted from the highest correlation down.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source frame. It is only read, never modified.
    y_full : pandas.Series
        Target. It must have a name (it is joined as a column) and is
        aligned with ``df_in`` on the index.
    min_corr : float, default 0.5
        Exclusive lower bound on the correlation with the target.

    Returns
    -------
    tuple of (pandas.DataFrame, list)
        All product columns (indexed like ``df_in``) and the list of
        selected product-column names.

    Notes
    -----
    The selection uses the target values of every row that is passed in.
    NOTE(review): if this is called before a train/validation split, the
    selection has seen the validation targets.
    """
    # Only columns that can be multiplied; category/datetime/string columns
    # are left out instead of failing inside the product.
    numeric_columns = df_in.select_dtypes(include=['number', 'bool']).columns

    # Build all products first and create the frame once; inserting the
    # columns one by one fragments the frame.
    crossed = pd.DataFrame(
        {
            f'{left}_x_{right}': df_in[left] * df_in[right]
            for left, right in combinations(numeric_columns, 2)
        },
        index=df_in.index,
    )

    target_name = y_full.name
    target_corr = crossed.join(y_full).corr()[target_name]

    # Remove the target by label, not by position: a product with a
    # correlation of exactly 1.0 may sort ahead of the target itself.
    cross_corr = (
        target_corr.drop(labels=target_name, errors='ignore')
        .dropna()
        .sort_values(ascending=False)
    )
    cross_features = cross_corr[cross_corr > min_corr].index
    return crossed, list(cross_features)

# Load data
The data can be downloaded from [here](https://www.kaggle.com/c/home-data-for-ml-course/data "Kaggle's House competition").

In [4]:
# Locations of the competition CSV files, relative to the notebook.
TRAIN_PATH = "../data/train.csv"
TEST_PATH = '../data/test.csv'

# The training frame is indexed by 'Id' and rows without a target are discarded;
# the test frame keeps 'Id' as a regular column.
Full_train = pd.read_csv(TRAIN_PATH, index_col='Id').dropna(axis=0, subset=['SalePrice'])
Full_test = pd.read_csv(TEST_PATH)

# Separate the target from the predictors.
y_full = Full_train['SalePrice']
X_full = Full_train.drop(columns=['SalePrice'])

In [13]:
# 'MoSold' and 'YrSold' are considered a possible source of data leakage, so they are removed.
columns_to_drop = ['MoSold', 'YrSold']
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so that re-running this cell does not raise a KeyError.
X_full = X_full.drop(columns=columns_to_drop, errors='ignore')

In [14]:
# Build the feature frame (selected raw columns plus cross features) from the predictors and the target.
transformed = transform_df(X_full, y_full)

In [15]:
# Show (rows, columns) of the predictor frame.
X_full.shape

(1460, 77)

In [16]:
# Display the frame returned by transform_df.
transformed

Unnamed: 0_level_0,BsmtQual,GarageFinish,FireplaceQu,MasVnrType,GarageType,BsmtFinType1,BsmtExposure,ExterQual,KitchenQual,Foundation,...,YearBuilt_x_GarageYrBlt,BsmtFinSF1_x_Total_Bath,MasVnrArea_x_TotalBsmtSF,OverallCond_x_1stFlrSF,GrLivArea_x_BedroomAbvGr,LotFrontage_x_TotRmsAbvGrd,BsmtFinSF1_x_FullBath,OverallCond_x_Total_Bath,MasVnrArea_x_Fireplaces,OverallQual_x_BsmtFinSF1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Gd,RFn,,BrkFace,Attchd,GLQ,No,Gd,Gd,PConc,...,4012009.0,2118,167776.0,4280,5130,520.0,1412,15,0.0,4942
2,Gd,RFn,TA,,Attchd,ALQ,Gd,TA,TA,CBlock,...,3904576.0,1956,0.0,10096,3786,480.0,1956,16,0.0,5868
3,Gd,RFn,TA,BrkFace,Attchd,GLQ,Mn,Gd,Gd,PConc,...,4004001.0,1458,149040.0,4600,5358,408.0,972,15,162.0,3402
4,TA,Unf,Gd,,Detchd,ALQ,No,TA,Gd,BrkTil,...,3826170.0,216,0.0,4805,5151,420.0,216,5,0.0,1512
5,Gd,RFn,TA,BrkFace,Attchd,GLQ,Av,Gd,Gd,PConc,...,4000000.0,1965,400750.0,5725,8792,756.0,1310,15,350.0,5240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,Gd,RFn,TA,,Attchd,Unf,No,TA,TA,PConc,...,3996001.0,0,0.0,4765,4941,434.0,0,15,0.0,0
1457,Gd,Unf,TA,Stone,Attchd,ALQ,No,TA,TA,CBlock,...,3912484.0,1580,183498.0,12438,6219,595.0,1580,12,238.0,4740
1458,TA,RFn,Gd,,Attchd,GLQ,No,Ex,Gd,Stone,...,3767481.0,550,0.0,10692,9360,594.0,550,18,0.0,1925
1459,TA,Unf,,,Attchd,GLQ,Mn,TA,Gd,CBlock,...,3802500.0,49,0.0,6468,2156,340.0,49,6,0.0,245


In [17]:
    # Module-level copies of the feature-name lists that are also defined inside
    # transform_df; later cells use them to select columns of `transformed`.
    # NOTE(review): the lists are duplicated, so keep both places in sync. The
    # leading indentation looks like a leftover from copying the lines out of the function.
    features_cat_from_nan_very_good = ['BsmtQual', 'GarageFinish', 'FireplaceQu', 'MasVnrType', 
                                       'GarageType', 'BsmtFinType1', 'BsmtExposure']
    features_cat_no_nan_very_good = ['ExterQual', 'KitchenQual', 'Foundation', 'CentralAir', 'HeatingQC']
    features_cat_from_nan_maybe_good = ['GarageQual', 'GarageCond', 'Electrical', 'BsmtCond', 'Alley', 'Fence', 'PoolQC']
    features_cat_no_nan_maybe_good = ['SaleCondition', 'MSZoning', 'PavedDrive', 'LotShape', 'SaleType', 
                                      'HouseStyle', 'RoofStyle', 'BldgType', 'LandContour']
    features_num_good_corr = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 
                              'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'Total_Bath']

In [18]:
# Ids of the rows with problematic basement data. Without these rows we can
# conclude that NaN in the basement columns means "no basement".
indices_bad_basement = [333,949]

In [19]:
# Remove the rows with problematic basement data. errors='ignore' keeps the cell
# re-runnable: on a second run the labels are already gone and drop() would
# otherwise raise a KeyError.
transformed = transformed.drop(index=indices_bad_basement, errors='ignore')

In [20]:
# In these categorical columns a missing value is replaced by the literal category 'None'.
nan_as_none_cols = features_cat_from_nan_very_good
transformed[nan_as_none_cols] = transformed[nan_as_none_cols].fillna(value='None')

In [21]:
# Count the missing values per column once and show only the columns that have any.
nan_counts = transformed.isna().sum()
nan_counts[nan_counts > 0]

OverallQual_x_GarageYrBlt      81
GrLivArea_x_GarageYrBlt        81
LotFrontage_x_GarageCars      259
LotFrontage_x_OverallQual     259
GarageYrBlt_x_GarageCars       81
LotFrontage_x_Total_Bath      259
GarageYrBlt_x_GarageArea       81
TotalBsmtSF_x_GarageYrBlt      81
1stFlrSF_x_GarageYrBlt         81
LotFrontage_x_FullBath        259
LotFrontage_x_GarageArea      259
MasVnrArea_x_GarageCars         8
FullBath_x_GarageYrBlt         81
GarageYrBlt_x_Total_Bath       81
MasVnrArea_x_GarageArea         8
TotRmsAbvGrd_x_GarageYrBlt     81
OverallQual_x_MasVnrArea        8
YearRemodAdd_x_GarageYrBlt     81
MasVnrArea_x_FullBath           8
MasVnrArea_x_GrLivArea          8
MasVnrArea_x_TotRmsAbvGrd       8
LotFrontage_x_GrLivArea       259
MasVnrArea_x_1stFlrSF           8
MasVnrArea_x_Total_Bath         8
YearBuilt_x_GarageYrBlt        81
MasVnrArea_x_TotalBsmtSF        8
LotFrontage_x_TotRmsAbvGrd    259
MasVnrArea_x_Fireplaces         8
dtype: int64

In [22]:
# Column names split by dtype: non-object columns versus object (string) columns.
numeric_col = X_full.select_dtypes(exclude='object').columns.tolist()
string_col = X_full.select_dtypes(include='object').columns.tolist()
# Object columns with fewer than 10 distinct values.
low_categorical_col = [col for col in string_col if X_full[col].nunique() < 10]

In [23]:
# Show (rows, columns) of the predictor frame again.
X_full.shape

(1460, 77)

# Preprocessing
It is time to build a model: the final steps!

In [151]:
# Keep only the non-object columns of the predictors.
ez = X_full.select_dtypes(exclude='object')
# Display the rows without any missing value; the result is not stored, so `ez` itself is unchanged.
ez.dropna()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,2003.0,2,548,0,61,0,0,0,0,0
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,1976.0,2,460,298,0,0,0,0,0,0
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,2001.0,2,608,0,42,0,0,0,0,0
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,1998.0,3,642,0,35,272,0,0,0,0
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,2000.0,3,836,192,84,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,1999.0,2,460,0,40,0,0,0,0,0
1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,1978.0,2,500,349,0,0,0,0,0,0
1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,1941.0,1,252,0,60,0,0,0,0,2500
1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,1950.0,1,240,366,0,112,0,0,0,0


In [155]:
# Model input: only the well-correlated numeric features of `transformed`.
# The two commented lines below are alternative inputs that were tried.
#X = transformed.dropna(axis=0).select_dtypes(exclude='object')
#X = ez.dropna(axis=0) # is it really better?
X = transformed[features_num_good_corr]

In [156]:
# Align the target with the rows that are still present in X before splitting.
y_aligned = y_full.reindex(X.index)
x_train, x_valid, y_train, y_valid = train_test_split(X, y_aligned, random_state=42)

# Building a model

In [157]:
# Single-step pipeline for now: the preprocessing step is still commented out.
pipeline_steps = [
    # ('preprocessing', prep),
    ('model', RandomForestRegressor(random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

In [158]:
# Pipeline.fit returns the fitted pipeline, so fitting and predicting are chained.
preds = model.fit(x_train, y_train).predict(x_valid)

## Evaluating the model

In [159]:
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the printed values do not change, but the conventional order
# avoids wrong results if an asymmetric metric is added later.
print(mean_absolute_error(y_valid, preds))
print(mean_squared_error(y_valid, preds))
# Pipeline.score delegates to the final estimator's score (R^2 for a regressor).
print(model.score(x_valid, y_valid))

20111.682761056752
1355351825.2061446
0.7557933662281248


# Create submission on Kaggle

In [None]:
#X_exam = Full_test[features].copy()

In [None]:
# submission on Kaggle
#predict_on_test = model.predict(X_exam)
#output = pd.DataFrame({'Id': Full_test.Id,
#                       'SalePrice': predict_on_test})
#output.to_csv('submission.csv', index=False)