In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Directory that holds the competition CSV files.
base_path = '/kaggle/input/house-prices-advanced-regression-techniques/'

# Read both splits and discard the 'Id' column from each of them.
train, test = (
    pd.read_csv(os.path.join(base_path, csv_name)).drop(columns=['Id'])
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Stack the train features on top of the test features so that every
# preprocessing step below is applied to both splits at once.
target = train['SalePrice']
train_index = len(train) - 1  # index label of the last training row after stacking
train_features = train.drop(columns=['SalePrice'])
all_data = pd.concat([train_features, test], ignore_index=True)
all_data.head()

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as numerical.
num_columns = all_data.select_dtypes(exclude=['object']).columns
cat_columns = all_data.select_dtypes(include=['object']).columns

`MSSubClass` looks like a categorical feature, so it should not be stored as an integer; convert it to a string.

In [None]:
# Store MSSubClass as strings so it is handled as a categorical feature.
all_data['MSSubClass'] = all_data['MSSubClass'].astype('str')

# The dtype-based column lists were built before this cast and would still list
# MSSubClass as numerical, so rebuild them from the updated frame.
cat_columns = all_data.select_dtypes(include='object').columns
num_columns = all_data.select_dtypes(exclude='object').columns

In [None]:
# Compare the distribution of the raw target with its natural logarithm;
# the log is taken to obtain a more normal-looking distribution.
SalePrice = train['SalePrice']
log_SalePrice = np.log(SalePrice)
for prices in (SalePrice, log_SalePrice):
    prices.hist()
    plt.show()

### Numerical features

#### Missing values

In [None]:
# Count the missing entries per numerical column and show only the columns that have any.
missing_value = all_data[num_columns].isnull().sum()
missing_value.loc[missing_value > 0]

In [None]:
# LotFrontage (linear feet of street connected to the property): fill each gap
# with the mean LotFrontage of the house's Neighborhood.
# `transform` returns a result aligned to all_data's own index. `apply` is
# avoided because, depending on the pandas version, it prepends the group key
# to the result's index, which breaks or misaligns the column assignment.
all_data['LotFrontage'] = (
    all_data.groupby('Neighborhood')['LotFrontage']
    .transform(lambda frontage: frontage.fillna(frontage.mean()))
)

# MasVnrArea (masonry veneer area in square feet): fill with the most frequent value.
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(all_data['MasVnrArea'].mode()[0])

# GarageYrBlt (year the garage was built): fill with the median year.
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(all_data['GarageYrBlt'].median())

# Basement and garage measurements: fill missing entries with 0
# (presumably a missing value means the house has no basement/garage -- TODO confirm).
for c in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
          'GarageCars', 'GarageArea']:
    all_data[c] = all_data[c].fillna(0)

#### Correlation of numerical features with SalePrice

In [None]:
# Absolute correlation of every numerical training column with SalePrice, as a bar chart.
# Only numeric columns are handed to `corr`: train also holds string-typed
# columns, and recent pandas versions raise on those instead of skipping them.
corr_ = train.select_dtypes(include='number').corr()
np.abs(corr_['SalePrice']).sort_values(ascending=False).plot.bar()

In [None]:
# Box plot of SalePrice for each OverallQual level
# (OverallQual rates the overall material and finish of the house).
sns.catplot(data=train, kind='box', x='OverallQual', y='SalePrice')

In [None]:
# Joint distribution of SalePrice and GrLivArea
# (above-grade living area in square feet).
sns.jointplot(data=train, x='GrLivArea', y='SalePrice')

In [None]:
# Two features derived from the year columns:
#   diff_build_remodel - years between construction and the remodel date
#   fresh              - years between the remodel date and the sale
year_remodelled = all_data['YearRemodAdd']
all_data['diff_build_remodel'] = year_remodelled.sub(all_data['YearBuilt'])
all_data['fresh'] = all_data['YrSold'].sub(year_remodelled)

In [None]:
# MasVnrArea: masonry veneer area in square feet.
# Most values of this feature are 0, so it is bucketed into a three-level band.
def _masvnr_band(area):
    """Return 0 for an area equal to 0, 1 for an area below 400, and 2 otherwise."""
    if area == 0.0:
        return 0
    if area < 400:
        return 1
    return 2


all_data['MasVnrArea_band'] = all_data['MasVnrArea'].map(_masvnr_band)

In [None]:
# BsmtFinSF1, BsmtFinSF2, Type 1&2 finished square feet
# BsmtUnfSF: Unfinished square feet of basement area
# TotalBsmtSF: Total square feet of basement area

# drop SF2, keep total FinSF

In [None]:
# total_FlrSF: combined square footage of the first and second floors.
first_floor_sf = all_data['1stFlrSF']
all_data['total_FlrSF'] = first_floor_sf.add(all_data['2ndFlrSF'])

In [None]:
# Porch areas (square feet): OpenPorchSF, EnclosedPorch, ScreenPorch, 3SsnPorch.
# Sum them into one total and derive 0/1 flags telling whether a porch exists.
porch_columns = ['OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch']
# skipna=False keeps the behaviour of a plain `+` chain: a missing part gives a missing total.
all_data['total_porch'] = all_data[porch_columns].sum(axis=1, skipna=False)
all_data['OpenPorchSF_have'] = (all_data['OpenPorchSF'] > 0.0).astype('int64')
all_data['have_porch'] = (all_data['total_porch'] > 0.0).astype('int64')

In [None]:
# total_bath: number of bathrooms, where each half bath counts as 0.5
# (uses HalfBath, BsmtHalfBath, FullBath and BsmtFullBath).
half_baths = all_data['HalfBath'] + all_data['BsmtHalfBath']
full_baths = all_data['FullBath'] + all_data['BsmtFullBath']
all_data['total_bath'] = full_baths + 0.5 * half_baths

In [None]:
# Swarm plot of SalePrice against total_bath for the training rows.
# The training rows are the first len(train) rows of all_data; slicing by
# len(train) instead of a hard-coded row count makes sure the last training
# row is not dropped (iloc's end bound is exclusive).
train_rows = all_data.iloc[:len(train)].join(train['SalePrice'])
sns.catplot(x='total_bath', y='SalePrice', kind='swarm', data=train_rows)

In [None]:
# Garage_age: years between the garage's construction and the sale,
# with negative ages raised to 0.
garage_age = all_data['YrSold'].sub(all_data['GarageYrBlt'])
all_data['Garage_age'] = garage_age.clip(lower=0)

In [None]:
# Bedroom, kitchen, total room abvgr
# drop bedroom, kitchen

In [None]:
# Fireplaces_exist: 1 when the house has at least one fireplace, otherwise 0.
all_data['Fireplaces_exist'] = (all_data['Fireplaces'] > 0).astype('int64')

In [None]:
# LowQualFinSF: replace the value by a 0/1 flag (1 when it is greater than 0).
# Note: this feature is a candidate for dropping.
has_low_quality_area = all_data['LowQualFinSF'] > 0.0
all_data['LowQualFinSF'] = has_low_quality_area.astype('int64')

In [None]:
# MiscVal: dollar value of the miscellaneous feature.
# Almost all values are 0, so the column is a candidate for dropping.
misc_values = all_data['MiscVal']
misc_values.hist()

In [None]:
# Rebuild the list of non-object columns so it includes the features created above, then display it.
num_columns = all_data.select_dtypes(exclude=['object']).columns
num_columns

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Rank the numerical features by random-forest importance on the training rows.
# .loc slices by label and includes the end label, hence `len(train) - 1`
# rather than a hard-coded row number.
X = all_data.loc[:len(train) - 1, num_columns]
y = train['SalePrice']

# The scaler is only fitted here: the transformed values were never used, and the
# forest below is trained on the unscaled features, so the wasted transform is skipped.
scaler = MinMaxScaler()
scaler.fit(X)

# A fixed seed makes the importance ranking reproducible across runs.
rf = RandomForestRegressor(random_state=0).fit(X, y)
feature_importance = pd.Series(rf.feature_importances_, index=X.columns)
feature_importance.sort_values().plot.bar()

Drop `have_porch`, `OpenPorchSF_have`, `BsmtFullBath`, `HalfBath`, `Fireplaces_exist`, `ScreenPorch`, `YrSold`, `Fireplaces`, `BedroomAbvGr`

### Skewness data transformation

In [None]:
from scipy.special import boxcox1p

# Columns whose absolute skewness exceeds SKEW_THRESHOLD are transformed with
# Box-Cox of (1 + x), using the fixed power parameter BOXCOX_LAMBDA.
SKEW_THRESHOLD = 0.75
BOXCOX_LAMBDA = 0.15

num_cols = all_data.select_dtypes(exclude='object').columns
skewness = all_data[num_cols].skew().abs().sort_values(ascending=False)
cols_to_transform = skewness[skewness > SKEW_THRESHOLD].index
for c in cols_to_transform:
    try:
        all_data[c] = boxcox1p(all_data[c], BOXCOX_LAMBDA)
    except Exception as err:
        # Best effort: report the column that could not be transformed and carry on.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate,
        # and the error is printed so the failure is not silent.
        print('columns: ', c, '-', repr(err))

In [None]:
# Show the ten largest absolute skewness values that remain after the transformation.
remaining_skew = all_data[num_cols].skew().abs()
remaining_skew.sort_values(ascending=False).head(10)

Drop `PoolArea`, `3SsnPorch`, `LowQualFinSF`, `MiscVal`

### Categorical features

In [None]:
# Display the names of the categorical columns collected earlier.
cat_columns

#### Missing values

In [None]:
# Count the missing entries per categorical column and show only the columns that have any.
cat_missing = all_data[cat_columns].isna().sum()
cat_missing[cat_missing > 0]

In [None]:
# MSZoning: fill each gap with the most frequent MSZoning among the houses that
# share the same MSSubClass.
# `transform` keeps the result aligned to all_data's own index (`apply` may prepend
# the group key to the index, which breaks or misaligns the column assignment).
overall_zoning_mode = all_data['MSZoning'].mode()[0]


def _fill_with_group_mode(zoning):
    """Fill missing values with the group's mode, or the overall mode when the group has no observed value."""
    group_mode = zoning.mode()
    fill_value = group_mode.iloc[0] if not group_mode.empty else overall_zoning_mode
    return zoning.fillna(fill_value)


all_data['MSZoning'] = all_data.groupby('MSSubClass')['MSZoning'].transform(_fill_with_group_mode)

# Columns in which a missing value is replaced by the literal category 'None'.
none_filled = [
    'Alley', 'PoolQC', 'MiscFeature',
    'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'FireplaceQu', 'Fence',
]
for col in none_filled:
    all_data[col] = all_data[col].fillna('None')

# Columns in which a missing value is replaced by the column's most frequent value.
mode_filled = ['Electrical', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Functional', 'SaleType']
for col in mode_filled:
    most_common = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)

# KitchenQual: a missing value is replaced by 'TA'.
all_data['KitchenQual'] = all_data['KitchenQual'].fillna('TA')

#### Individual features

In [None]:
# Display the categorical column names again as a reference for the encodings below.
cat_columns

In [None]:
# Integer codes for four categorical columns; a value that is not listed in a
# column's dictionary becomes NaN.
simple_codes = {
    'Street': {'Pave': 1, 'Grvl': 0},
    'Alley': {'Pave': 1, 'Grvl': 0, 'None': -1},
    'LotShape': {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3},
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},
}
for col, codes in simple_codes.items():
    all_data[col] = all_data[col].map(codes)

# HouseStyle: integer code per dwelling style; a style missing from the dictionary becomes NaN.
# The key for two-and-a-half-story finished houses is '2.5Fin' (with a dot, like
# '1.5Fin' and '2.5Unf'); the former '2,5Fin' key never matched, so those rows became NaN.
# NOTE(review): 'SFoyer' shares code 4 with '2Story' -- confirm this is intended.
house_style_codes = {
    '1Story': 1,
    '1.5Unf': 2,
    '1.5Fin': 3,
    '2Story': 4,
    '2.5Unf': 5,
    '2.5Fin': 6,
    'SFoyer': 4,
    'SLvl': 7,
}
all_data['HouseStyle'] = all_data['HouseStyle'].map(house_style_codes)

# Quality/condition columns share one scale: 'None' -> -1, then 'Po' (0) up to 'Ex' (4).
quality_codes = {'None': -1, 'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
quality_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
]
for col in quality_columns:
    all_data[col] = all_data[col].map(quality_codes)

# BsmtExposure: 'None' -> -1, then 'No' (0) up to 'Gd' (3).
exposure_codes = {'None': -1, 'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3}
all_data['BsmtExposure'] = all_data['BsmtExposure'].map(exposure_codes)

# BsmtFinType1 / BsmtFinType2: 'None' -> -1, then 'Unf' (0) up to 'GLQ' (5).
finish_type_codes = {'None': -1, 'Unf': 0, 'LwQ': 1, 'Rec': 2, 'BLQ': 3, 'ALQ': 4, 'GLQ': 5}
for col in ('BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].map(finish_type_codes)

# CentralAir: 'N' -> 0, 'Y' -> 1.
all_data['CentralAir'] = all_data['CentralAir'].map({'N': 0, 'Y': 1})

def _garage_type_code(kind):
    """Return 2 for 'Attchd'/'BuiltIn', 0 for 'None' and 1 for any other garage type."""
    if kind == 'None':
        return 0
    if kind in ('Attchd', 'BuiltIn'):
        return 2
    return 1


def _garage_finish_code(finish):
    """Return 2 for 'Fin'/'RFn', 1 for 'Unf' and 0 for any other value."""
    if finish == 'Unf':
        return 1
    if finish in ('Fin', 'RFn'):
        return 2
    return 0


all_data['GarageType'] = all_data['GarageType'].apply(_garage_type_code)
all_data['GarageFinish'] = all_data['GarageFinish'].apply(_garage_finish_code)

# PavedDrive: 'N' -> -1, 'P' -> 0, 'Y' -> 1.
paved_drive_codes = {'N': -1, 'P': 0, 'Y': 1}
all_data['PavedDrive'] = all_data['PavedDrive'].map(paved_drive_codes)

# SaleType: 1 for 'WD', 'New', 'CWD' and 'Con'; 0 for 'Oth'; 2 for every other sale type.
# The dataset spells the new-home code 'New'; the former 'NEW' entry never matched,
# so new-home sales fell into the catch-all group 2.
def _sale_type_code(sale_type):
    """Return the group code (0, 1 or 2) of a SaleType value."""
    if sale_type in ('WD', 'New', 'CWD', 'Con'):
        return 1
    if sale_type == 'Oth':
        return 0
    return 2


all_data['SaleType'] = all_data['SaleType'].apply(_sale_type_code)

In [None]:
# drop Utilities
