## Preprocessing pipeline for a house data set from Kaggle; the goal is to predict the value of houses

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer

In [2]:

# Absolute path of the data folder, resolved against the current working directory.
# The path is built from separate components so that os.path.join inserts the
# separator of the running OS; the previous 'appmodule\data' literal only worked
# on Windows and relied on the invalid escape sequence '\d'.
DATA_PATH = os.path.abspath(os.path.join(os.getcwd(), 'appmodule', 'data'))


In [3]:
# Load the training data; os.path.join picks the separator of the running OS
# instead of concatenating a hard-coded Windows backslash.
df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the summary.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
# Separate the feature names according to data type: numeric vs. everything else
# (strings). select_dtypes is the public pandas API for this and replaces the
# private DataFrame._get_numeric_data(). 'bool' is listed explicitly so that
# boolean columns, which 'number' alone leaves out, stay on the numeric side.
X_numeric_labels = set(df.select_dtypes(include=['number', 'bool']).columns)

# Every column that is not numeric is treated as categorical.
X_categorical_labels = set(df.columns) - X_numeric_labels

In [5]:
# Report which feature names ended up in each of the two groups.
for template, labels in (
    ('These are numerical features {} \n', X_numeric_labels),
    ('These are categorical features {} \n', X_categorical_labels),
):
    print(template.format(labels))


These are numerical features {'OverallQual', 'YrSold', '2ndFlrSF', 'GarageArea', 'KitchenAbvGr', 'BsmtHalfBath', 'LowQualFinSF', 'YearRemodAdd', 'MiscVal', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch', 'GarageYrBlt', 'Fireplaces', 'GarageCars', 'PoolArea', '1stFlrSF', 'BsmtFinSF2', 'WoodDeckSF', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 'SalePrice', 'BsmtFinSF1', 'Id', 'BsmtUnfSF', 'BsmtFullBath', 'GrLivArea', 'MoSold', 'LotFrontage', 'MSSubClass', 'OverallCond', 'MasVnrArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea', 'HalfBath'} 

These are categorical features {'RoofMatl', 'Condition2', 'PoolQC', 'Exterior2nd', 'GarageQual', 'Foundation', 'LandSlope', 'Fence', 'ExterCond', 'CentralAir', 'Exterior1st', 'Heating', 'BsmtCond', 'Condition1', 'GarageFinish', 'Street', 'GarageCond', 'BsmtQual', 'ExterQual', 'Functional', 'PavedDrive', 'MasVnrType', 'GarageType', 'LandContour', 'FireplaceQu', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'SaleCondition', 'RoofStyle', 'Electric

### Some considerations


* <p> Page 107 of "our book":  "After executing the preceding code, the first column of the NumPy array X now holds the new
colour values, which are encoded as follows:
blue to 0
green to 1
red to 2
If we stop at this point and feed the array to our classifier, <em>we will make one of the most common 
mistakes in dealing with categorical data</em>. Can you spot the problem?
<em>Although the colour values do not come in any particular order, a learning algorithm will now assume
that green is larger than blue, and red is larger than green. 
Although this assumption is incorrect, the algorithm could still produce useful results. 
However, those results would not be optimal.</em>"</p>

* <p> Among the numerical features there may be features that are actually categorical but encoded as numbers. How can we identify this kind of feature?
 I think the main problem is which strategy to use to replace missing values: median, mode or mean? </p>
 

In [6]:
# Let's check how many missing values each numerical feature has.
# select_dtypes (public API) replaces the private _get_numeric_data(); it already
# returns only the numeric columns, so re-indexing df by their names is not needed.
print(df.select_dtypes(include=['number', 'bool']).isnull().sum())




Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64


In [7]:
# Preview the numerical features without ['Id', 'LotFrontage', 'GarageYrBlt', 'SalePrice'].
# drop() returns a new frame and the result is not assigned, so df itself is unchanged.
# `columns=` replaces the positional axis argument `1`, which pandas 2.0 rejects
# (every drop() argument except `labels` is keyword-only there).
df.select_dtypes(include=['number', 'bool']).drop(
    columns=['Id', 'LotFrontage', 'GarageYrBlt', 'SalePrice']
)

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,548,0,61,0,0,0,0,0,2,2008
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,460,298,0,0,0,0,0,0,5,2007
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,608,0,42,0,0,0,0,0,9,2008
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,642,0,35,272,0,0,0,0,2,2006
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,836,192,84,0,0,0,0,0,12,2008
5,50,14115,5,5,1993,1995,0.0,732,0,64,...,480,40,30,0,320,0,0,700,10,2009
6,20,10084,8,5,2004,2005,186.0,1369,0,317,...,636,255,57,0,0,0,0,0,8,2007
7,60,10382,7,6,1973,1973,240.0,859,32,216,...,484,235,204,228,0,0,0,350,11,2009
8,50,6120,7,5,1931,1950,0.0,0,0,952,...,468,90,0,205,0,0,0,0,4,2008
9,190,7420,5,6,1939,1950,0.0,851,0,140,...,205,0,4,0,0,0,0,0,1,2008


In [8]:
# Count the missing entries of every categorical feature in the data set
categorical_columns = list(X_categorical_labels)
print(df.loc[:, categorical_columns].isna().sum())

RoofMatl            0
Condition2          0
PoolQC           1453
Exterior2nd         0
GarageQual         81
Foundation          0
LandSlope           0
Fence            1179
ExterCond           0
CentralAir          0
Exterior1st         0
Heating             0
BsmtCond           37
Condition1          0
GarageFinish       81
Street              0
GarageCond         81
BsmtQual           37
ExterQual           0
Functional          0
PavedDrive          0
MasVnrType          8
GarageType         81
LandContour         0
FireplaceQu       690
KitchenQual         0
BsmtFinType1       37
BsmtFinType2       38
SaleCondition       0
RoofStyle           0
Electrical          1
SaleType            0
HeatingQC           0
LotConfig           0
BldgType            0
MiscFeature      1406
HouseStyle          0
Alley            1369
LotShape            0
Utilities           0
Neighborhood        0
BsmtExposure       38
MSZoning            0
dtype: int64


In [51]:
# Identify the categorical features that have no missing values.
# The first statement builds the list of those feature names.
categorical_missing_counts = df[list(X_categorical_labels)].isnull().sum()
X_categorical_not_missing_values_labels = (
    categorical_missing_counts[categorical_missing_counts == 0].index.tolist()
)

# Display what is left after removing the complete features, i.e. the categorical
# features that DO contain missing values. The result is not assigned, so df is
# unchanged. `columns=` replaces the positional axis argument `1`, which
# pandas 2.0 rejects.
df[list(X_categorical_labels)].drop(columns=X_categorical_not_missing_values_labels)

Unnamed: 0,PoolQC,GarageQual,Fence,BsmtCond,GarageFinish,GarageCond,BsmtQual,MasVnrType,GarageType,FireplaceQu,BsmtFinType1,BsmtFinType2,Electrical,MiscFeature,Alley,BsmtExposure
0,,TA,,TA,RFn,TA,Gd,BrkFace,Attchd,,GLQ,Unf,SBrkr,,,No
1,,TA,,TA,RFn,TA,Gd,,Attchd,TA,ALQ,Unf,SBrkr,,,Gd
2,,TA,,TA,RFn,TA,Gd,BrkFace,Attchd,TA,GLQ,Unf,SBrkr,,,Mn
3,,TA,,Gd,Unf,TA,TA,,Detchd,Gd,ALQ,Unf,SBrkr,,,No
4,,TA,,TA,RFn,TA,Gd,BrkFace,Attchd,TA,GLQ,Unf,SBrkr,,,Av
5,,TA,MnPrv,TA,Unf,TA,Gd,,Attchd,,GLQ,Unf,SBrkr,Shed,,No
6,,TA,,TA,RFn,TA,Ex,Stone,Attchd,Gd,GLQ,Unf,SBrkr,,,Av
7,,TA,,TA,RFn,TA,Gd,Stone,Attchd,TA,ALQ,BLQ,SBrkr,Shed,,Mn
8,,Fa,,TA,Unf,TA,TA,,Detchd,TA,Unf,Unf,FuseF,,,No
9,,Gd,,TA,RFn,TA,TA,,Attchd,TA,GLQ,Unf,SBrkr,,,No


In [26]:
# Separating categorical features in two sets: nominal and ordinal.
# A feature is considered nominal when none of its values appears in the list of
# values associated with ordinal features: [Ex, Gd, GLQ, GdPrv, Fin] ... etc.

# These are not all of the values. All the possible values of the categorical
# ordinal features must be included here.
categorical_ordinal_list_values = ['Ex', 'Gd', 'GLQ', 'GdPrv', 'Fin']

# Categorical features left out of the nominal/ordinal split.
# NOTE(review): 'BsmtCond' and 'Electrical' also contain missing values (see the
# missing-value counts above) but are not in this list - confirm that is intended.
categorical_features_to_skip = [
    'GarageCond', 'BsmtFinType1', 'BsmtQual', 'MiscFeature', 'Alley', 'GarageType',
    'BsmtFinType2', 'BsmtExposure', 'FireplaceQu', 'MasVnrType', 'GarageQual',
    'GarageFinish', 'Fence', 'PoolQC',
]

# Per feature: True when at least one of its values is a known ordinal value.
# `columns=` replaces the positional axis argument `1`, which pandas 2.0 rejects.
has_ordinal_value = (
    df[list(X_categorical_labels)]
    .drop(columns=categorical_features_to_skip)
    .isin(categorical_ordinal_list_values)
    .any()
)

# Kept as a one-element list wrapping the Index (what Series.axes returned before),
# because the following cells read the names through X_categorical_nominal_labels[0].
X_categorical_nominal_labels = [has_ordinal_value[~has_ordinal_value].index]

# The previous instruction returns the index of the categorical nominal features.
# I think we could include the following features (their values look ordered):
# LandSlope has values: Gtl -> Gentle slope; Mod -> Moderate Slope; Sev -> Severe Slope
# Functional has values: Typ -> Typical Functionality; Min1 -> Minor Deductions 1; Min2 -> Minor Deductions 2; Mod -> Moderate Deductions; Maj1 -> Major Deductions 1; Maj2 -> Major Deductions 2; Sev -> Severely Damaged; Sal -> Salvage only
# PavedDrive has values: Y -> Paved; P -> Partial Pavement; N -> Dirt/Gravel



In [27]:
# Prints a one-element list wrapping the pandas Index of nominal feature names;
# the names themselves are at X_categorical_nominal_labels[0].
print(X_categorical_nominal_labels)

[Index(['RoofMatl', 'Condition2', 'Exterior2nd', 'Foundation', 'LandSlope',
       'CentralAir', 'Exterior1st', 'Heating', 'Condition1', 'Street',
       'Functional', 'PavedDrive', 'LandContour', 'SaleCondition', 'RoofStyle',
       'Electrical', 'SaleType', 'LotConfig', 'BldgType', 'HouseStyle',
       'LotShape', 'Utilities', 'Neighborhood', 'MSZoning'],
      dtype='object')]


In [52]:
# Unwrap the Index held in the one-element list and show its names as a plain list.
print(list(X_categorical_nominal_labels[0]))

['RoofMatl', 'Condition2', 'Exterior2nd', 'Foundation', 'LandSlope', 'CentralAir', 'Exterior1st', 'Heating', 'Condition1', 'Street', 'Functional', 'PavedDrive', 'LandContour', 'SaleCondition', 'RoofStyle', 'Electrical', 'SaleType', 'LotConfig', 'BldgType', 'HouseStyle', 'LotShape', 'Utilities', 'Neighborhood', 'MSZoning']


### Remarks
<p> The features in **X_categorical_nominal_labels** are removed from the set **X_categorical_not_missing_values_labels**; 
what remains is the set **X_categorical_ordinal_labels**.</p>

* On the set of **X_categorical_ordinal_labels** we can apply the **mapping strategy** (page 104)

* On the set of **X_categorical_nominal_labels** we can apply the **one-hot encoding strategy** (page 106)

In [56]:
# Ordinal features = categorical features without missing values, minus the nominal ones.
complete_categorical_names = set(X_categorical_not_missing_values_labels)
nominal_names = set(X_categorical_nominal_labels[0].tolist())
X_categorical_ordinal_labels = complete_categorical_names - nominal_names

In [57]:
# Show the features left after removing the nominal labels from the categorical
# features that have no missing values.
print(X_categorical_ordinal_labels)

{'ExterQual', 'KitchenQual', 'ExterCond', 'HeatingQC'}


In [58]:
# Display the columns of the ordinal categorical features.
df.loc[:, list(X_categorical_ordinal_labels)]

Unnamed: 0,ExterQual,KitchenQual,ExterCond,HeatingQC
0,Gd,Gd,TA,Ex
1,TA,TA,TA,Ex
2,Gd,Gd,TA,Ex
3,TA,Gd,TA,Gd
4,Gd,Gd,TA,Ex
5,TA,TA,TA,Ex
6,Gd,Gd,TA,Ex
7,TA,TA,TA,Ex
8,TA,TA,TA,Gd
9,TA,TA,TA,Ex
