In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew #for some statistics
%matplotlib inline

In [2]:
test_df = pd.read_csv('test.csv' )
train_df = pd.read_csv('train.csv' )
test_df_Id = test_df['Id'] 

In [3]:
train_df = train_df[train_df.GrLivArea < 4500]
train_df.reset_index(drop=True, inplace=True)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])
y = train_df['SalePrice'].reset_index(drop=True)

data = pd.concat([train_df['SalePrice'], train_df['OverallQual']], axis=1)

upperlimit = np.percentile(train_df.SalePrice.values, 99.5)
train_df['SalePrice'].loc[train_df['SalePrice']>upperlimit] = upperlimit

train_df = train_df.drop(train_df[(train_df['GarageCars']>3)
                                  & (train_df['SalePrice']<350000)].index).reset_index(drop=True)

In [4]:
train_features = train_df.drop(['SalePrice'], axis=1)
test_features = test_df
total_features = pd.concat([train_features, test_features]).reset_index(drop=True)

In [5]:
# Since these column are actually a category , using a numerical number will lead the model to assume
# that it is numerical , so we convert to string .
total_features['MSSubClass'] = total_features['MSSubClass'].apply(str)
total_features['YrSold'] = total_features['YrSold'].astype(str)
total_features['MoSold'] = total_features['MoSold'].astype(str)



## Filling these columns With most suitable value for these columns
total_features['Functional'] = total_features['Functional'].fillna('Typ')
total_features['Electrical'] = total_features['Electrical'].fillna("SBrkr")
total_features['KitchenQual'] = total_features['KitchenQual'].fillna("TA")
total_features["PoolQC"] = total_features["PoolQC"].fillna("None")



## Filling these with MODE , i.e. , the most frequent value in these columns .
total_features['Exterior1st'] = total_features['Exterior1st'].fillna(total_features['Exterior1st'].mode()[0])
total_features['Exterior2nd'] = total_features['Exterior2nd'].fillna(total_features['Exterior2nd'].mode()[0])
total_features['SaleType'] = total_features['SaleType'].fillna(total_features['SaleType'].mode()[0])

In [6]:
## Missing data in GarageYrBit most probably means missing Garage , so replace NaN with zero .

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    total_features[col] = total_features[col].fillna(0)

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    total_features[col] = total_features[col].fillna('None')


## Same with basement features

for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    total_features[col] = total_features[col].fillna('None')

In [15]:
total_features['MSZoning'] = total_features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
objects = []
for i in total_features.columns:
    if total_features[i].dtype == object:
        objects.append(i)
total_features.update(total_features[objects].fillna('None'))

[['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']]


In [16]:
# We are still filling up missing values
total_features['LotFrontage'] = total_features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in total_features.columns:
    if total_features[i].dtype in numeric_dtypes:
        numerics.append(i)
total_features.update(total_features[numerics].fillna(0))

['LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2']

In [17]:
#MSSubClass=The building class
total_features['MSSubClass'] = total_features['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
total_features['OverallCond'] = total_features['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
total_features['YrSold'] = total_features['YrSold'].astype(str)
total_features['MoSold'] = total_features['MoSold'].astype(str)

In [18]:
total_features = total_features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond','HeatingQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive',  'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(total_features[c].values))
    total_features[c] = lbl.transform(list(total_features[c].values))

Shape all_data: (2912, 77)


In [19]:
# Adding new features . Make sure that you understand this.

total_features['YrBltAndRemod']=total_features['YearBuilt']+total_features['YearRemodAdd']
total_features['TotalSF']=total_features['TotalBsmtSF'] + total_features['1stFlrSF'] + total_features['2ndFlrSF']

total_features['Total_sqr_footage'] = (total_features['BsmtFinSF1'] + total_features['BsmtFinSF2'] +
                                       total_features['1stFlrSF'] + total_features['2ndFlrSF'])

total_features['Total_Bathrooms'] = (total_features['FullBath'] + (0.5 * total_features['HalfBath']) +
                                     total_features['BsmtFullBath'] + (0.5 * total_features['BsmtHalfBath']))

total_features['Total_porch_sf'] = (total_features['OpenPorchSF'] + total_features['3SsnPorch'] +
                                    total_features['EnclosedPorch'] + total_features['ScreenPorch'] +
                                    total_features['WoodDeckSF'])


(2912, 82)

In [20]:
## For ex, if PoolArea = 0 , Then HasPool = 0 too

total_features['haspool'] = total_features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
total_features['has2ndfloor'] = total_features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasgarage'] = total_features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasbsmt'] = total_features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
total_features['hasfireplace'] = total_features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,...,YrBltAndRemod,TotalSF,Total_sqr_footage,Total_Bathrooms,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,1,10,RL,65.0,8450,1,3,Lvl,Inside,0,...,4006,2566.0,2416.0,3.5,61,0,1,1,1,0
1,2,5,RL,80.0,9600,1,3,Lvl,FR2,0,...,3952,2524.0,2240.0,2.5,298,0,0,1,1,1
2,3,10,RL,68.0,11250,1,0,Lvl,Inside,0,...,4003,2706.0,2272.0,3.5,42,0,1,1,1,1
3,4,11,RL,60.0,9550,1,0,Lvl,Corner,0,...,3885,2473.0,1933.0,2.0,307,0,1,1,1,1
4,5,10,RL,84.0,14260,1,0,Lvl,FR2,0,...,4000,3343.0,2853.0,3.5,276,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2907,2915,2,RM,21.0,1936,1,3,Lvl,Inside,0,...,3940,1638.0,1092.0,1.5,0,0,1,0,1,0
2908,2916,2,RM,21.0,1894,1,3,Lvl,Inside,0,...,3940,1638.0,1344.0,1.5,24,0,1,1,1,0
2909,2917,5,RL,160.0,20000,1,3,Lvl,Inside,0,...,3956,2448.0,2448.0,2.0,474,0,0,1,1,1
2910,2918,14,RL,62.0,10441,1,3,Lvl,Inside,0,...,3984,1882.0,1307.0,1.5,112,0,0,0,1,0


In [21]:
numeric_feats = total_features.dtypes[total_features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = total_features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

skewness = pd.DataFrame({'Skew' :skewed_feats})


Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,21.920854
PoolArea,17.673354
haspool,15.481305
LotArea,13.150188
LowQualFinSF,12.073977
3SsnPorch,11.362112
LandSlope,4.990731
KitchenAbvGr,4.333199
BsmtFinSF2,4.140401
EnclosedPorch,4.008192


In [22]:
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    total_features[feat] = boxcox1p(total_features[feat], lam)

There are 67 skewed numerical features to Box Cox transform


In [23]:
numeric_feats = total_features.dtypes[total_features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = total_features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})


Skew in numerical features: 



Unnamed: 0,Skew
PoolArea,15.745853
haspool,15.481305
3SsnPorch,8.913714
LowQualFinSF,8.733244
MiscVal,5.589718
LandSlope,4.545628
BsmtHalfBath,3.796244
KitchenAbvGr,3.718111
ScreenPorch,2.973691
BsmtFinSF2,2.560119


In [24]:
final_features = pd.get_dummies(total_features).reset_index(drop=True)

(2912, 228)

In [26]:
#spliting the data into train and test datasets
train_data=final_features.iloc[:1453]
test_data=final_features.iloc[1453:]
print(train_data.shape)
test_data.shape

X_train = train_data
y_train = train_df['SalePrice']
X_test = test_data

(1453, 228)


((1453, 228), (1453,), (1459, 228))