# Feature Engineering

### Importing Data and Dependencies

In [1]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
from scipy import stats
from scipy.stats import norm, skew

#Data 
train = pd.read_csv('Processed_DataSet/Cleaned_train_data.csv')
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,12.247699
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,12.109016
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,12.317171
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,11.849405
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,12.42922


### Adding feature

In [2]:
# Adding total sqfootage feature 
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']

### Transforming some numerical variables that are really categorical

In [3]:
#MSSubClass=The building class
train['MSSubClass'] = train['MSSubClass'].apply(str)

#Changing OverallCond into a categorical variable
train['OverallCond'] = train['OverallCond'].astype(str)

#Year and month sold are transformed into categorical features.
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)

### Label Encoding some categorical variables that may contain information in their ordering set

In [4]:
from sklearn.preprocessing import LabelEncoder

cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(train[c].unique().tolist())
    train[c] = lbl.transform(list(train[c].values))

In [5]:
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TotalSF
0,1,9,RL,65.0,8450,1,1,3,Lvl,AllPub,...,3,4,,0,4,2,WD,Normal,12.247699,2566
1,2,4,RL,80.0,9600,1,1,3,Lvl,AllPub,...,3,4,,0,7,1,WD,Normal,12.109016,2524
2,3,9,RL,68.0,11250,1,1,0,Lvl,AllPub,...,3,4,,0,11,2,WD,Normal,12.317171,2706
3,4,10,RL,60.0,9550,1,1,0,Lvl,AllPub,...,3,4,,0,4,0,WD,Abnorml,11.849405,2473
4,5,9,RL,84.0,14260,1,1,0,Lvl,AllPub,...,3,4,,0,3,2,WD,Normal,12.42922,3343


### Skewed Feature

In [6]:
#Seperating Columns for Skew check
y_train = train.SalePrice
train.drop(['SalePrice','Id'],axis=1,inplace=True)

In [7]:
numeric_f = train.dtypes[train.dtypes != "object"].index

# Check the skew of all numerical features

skewed_f = train[numeric_f].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew in train data' :skewed_f})
skewness.head(20)

Unnamed: 0,Skew in train data
MiscVal,24.434913
PoolArea,15.932532
LotArea,12.560986
3SsnPorch,10.28651
LowQualFinSF,8.995688
LandSlope,4.805032
KitchenAbvGr,4.480268
BsmtFinSF2,4.24755
ScreenPorch,4.11469
BsmtHalfBath,4.095895


In [8]:
#Transforming train Data

skewness = skewness[abs(skewness) > 0.75]
print("There were {} skewed numerical features in train data which were Box Cox transformed".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    train[feat] = boxcox1p(train[feat], lam)

There were 59 skewed numerical features in train data which were Box Cox transformed


### Getting dummy variables

In [9]:
#Getting Dummy Variables
train = pd.get_dummies(train)
train['SalePrice'] = y_train
#Checking Shapes
train.shape

(1458, 223)

# Saving the Final Train DataSet

In [10]:
train.to_csv('Processed_DataSet/Final_train.csv',index=False)