## Feature selection

Create your own functions to filter features based on the following criteria:

* variance is lower than *x*
* percentage of missing values is greater than *x* %
* one feature from each pair of features whose correlation with each other is greater than *x*

**Data Source:** output of the feature engineering exercise from last week.

In [1]:
import pandas as pd
import numpy as np 
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, SelectKBest

In [2]:
# Load the numeric feature table (per the notebook intro, the output of last
# week's feature engineering exercise) from the working directory.
df = pd.read_csv('df_numeric.csv')


In [8]:
def variance_filter(df, var):
    """Return the columns of ``df`` that pass a variance threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter. It is left untouched.
    var : float
        Threshold handed to scikit-learn's ``VarianceThreshold``; features
        whose variance does not exceed it are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained columns, in their
        original order and with their original dtypes.

    Notes
    -----
    Any error raised by ``VarianceThreshold.fit`` (for example when the
    input cannot be interpreted as numeric) propagates to the caller.
    """
    selector = VarianceThreshold(threshold=var)
    # Only the fitted support mask is needed. The previous version stored the
    # transformed array as an ad-hoc attribute on the caller's DataFrame,
    # which mutated the input and triggered a pandas warning.
    selector.fit(df)
    return df.loc[:, selector.get_support()]

In [16]:
# Example: apply the variance filter with a threshold of 0.1 and display the result.
variance_filter(df, 0.1)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,LotFrontage_missing_ind,1stFlr_2ndFlr_SF,OverallGrade
0,60,65.0,8450,4,7,5,2003,2003,196.0,4,...,0,0,0,0,2,2008,208500,0,1710,35
1,20,80.0,9600,4,6,8,1976,1976,0.0,3,...,0,0,0,0,5,2007,181500,0,1262,48
2,60,68.0,11250,3,7,5,2001,2002,162.0,4,...,0,0,0,0,9,2008,223500,0,1786,35
3,70,60.0,9550,3,7,5,1915,1970,0.0,3,...,0,0,0,0,2,2006,140000,0,1717,35
4,60,84.0,14260,3,8,5,2000,2000,350.0,4,...,0,0,0,0,12,2008,250000,0,2198,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60,62.0,7917,4,6,5,1999,2000,0.0,3,...,0,0,0,0,8,2007,175000,0,1647,30
1454,20,85.0,13175,4,6,6,1978,1988,119.0,3,...,0,0,0,0,2,2010,210000,0,2073,36
1455,70,66.0,9042,4,7,9,1941,2006,0.0,5,...,0,0,0,2500,5,2010,266500,0,2340,63
1456,20,68.0,9717,4,5,6,1950,1996,0.0,3,...,0,0,0,0,4,2010,142125,0,1078,30


In [54]:
def nan_pert(df, per):
    """Return the columns of ``df`` whose share of missing values is at most ``per``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter. It is left untouched.
    per : float
        Maximum tolerated fraction of missing values per column, on a 0-1
        scale (``0.8`` means 80 %). Columns with a strictly larger fraction
        are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained columns, in their
        original order.
    """
    # Compute the per-column missing fraction once, not once per column.
    missing_fraction = df.isna().mean()
    # Negating "> per" (instead of testing "<= per") keeps columns whose
    # fraction is NaN, e.g. for a frame without rows. The mask is applied
    # positionally so integer or duplicated column labels are handled too.
    keep = ~missing_fraction.gt(per).to_numpy()
    return df.loc[:, keep]

In [55]:
# Example: drop the columns in which more than 80% of the values are missing.
nan_pert(df,0.8)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,...,MoSold,YrSold,SalePrice,GarageYrBlt_missing_ind,LotFrontage_missing_ind,MasVnrArea_missing_ind,1stFlrSF_log,1stFlr_2ndFlr_SF,OverallGrade,SimplGarageQual
0,60,65.0,8450,2,4,4,3,7,5,2003,...,2,2008,208500,0,0,0,6.752270,1710,35,1
1,20,80.0,9600,2,4,4,3,6,8,1976,...,5,2007,181500,0,0,0,7.140453,1262,48,1
2,60,68.0,11250,2,3,4,3,7,5,2001,...,9,2008,223500,0,0,0,6.824374,1786,35,1
3,70,60.0,9550,2,3,4,3,7,5,1915,...,2,2006,140000,0,0,0,6.867974,1717,35,1
4,60,84.0,14260,2,3,4,3,8,5,2000,...,12,2008,250000,0,0,0,7.043160,2198,40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60,62.0,7917,2,4,4,3,6,5,1999,...,8,2007,175000,0,0,0,6.859615,1647,30,1
1454,20,85.0,13175,2,4,4,3,6,6,1978,...,2,2010,210000,0,0,0,7.636752,2073,36,1
1455,70,66.0,9042,2,4,4,3,7,9,1941,...,5,2010,266500,0,0,0,7.080026,2340,63,1
1456,20,68.0,9717,2,4,4,3,5,6,1950,...,4,2010,142125,0,0,0,6.982863,1078,30,1


In [58]:
# Fraction of missing values in each column (0-1 scale, not a percentage),
# rounded to four decimal places.
df.isna().mean().round(4)

MSSubClass                 0.0
LotFrontage                0.0
LotArea                    0.0
Street                     0.0
LotShape                   0.0
Utilities                  0.0
LandSlope                  0.0
OverallQual                0.0
OverallCond                0.0
YearBuilt                  0.0
YearRemodAdd               0.0
MasVnrArea                 0.0
ExterQual                  0.0
ExterCond                  0.0
BsmtQual                   0.0
BsmtCond                   0.0
BsmtFinType1               0.0
BsmtFinSF1                 0.0
BsmtFinType2               0.0
BsmtFinSF2                 0.0
BsmtUnfSF                  0.0
TotalBsmtSF                0.0
HeatingQC                  0.0
1stFlrSF                   0.0
2ndFlrSF                   0.0
LowQualFinSF               0.0
GrLivArea                  0.0
BsmtFullBath               0.0
BsmtHalfBath               0.0
FullBath                   0.0
HalfBath                   0.0
BedroomAbvGr               0.0
KitchenA

In [61]:
def remove_correlated(df, x):
    """Drop one feature from every pair whose absolute correlation exceeds ``x``.

    Pairs are visited in column order (upper triangle of the correlation
    matrix, row by row). For each pair still intact, the later column is
    removed. A pair is skipped once either member has been removed, so a
    feature is never dropped merely because it correlated with a column
    that is already gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter. It is NOT modified; earlier versions
        dropped the columns from the caller's frame in place.
    x : float
        Threshold on the absolute value of ``df.corr()``; pairs strictly
        above it are considered redundant.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns.
    """
    corr = df.corr().abs()
    labels = corr.columns

    # k=1 keeps only entries above the diagonal: each unordered pair appears
    # exactly once and self-correlations are ignored. NaN correlations
    # compare as False and are therefore never treated as redundant.
    above_threshold = np.triu(corr.to_numpy() > x, k=1)

    dropped = set()
    for row, col in zip(*np.nonzero(above_threshold)):
        first, second = labels[row], labels[col]
        if first in dropped or second in dropped:
            # The pair is already broken up; nothing more to remove for it.
            continue
        dropped.add(second)

    return df.drop(columns=list(dropped))

In [63]:
# Example: filter out redundant features using an absolute-correlation threshold of 0.7.
remove_correlated(df, 0.7)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,LotFrontage_missing_ind,MasVnrArea_missing_ind
0,60,65.0,8450,2,4,4,3,7,5,2003,...,61,0,0,0,0,0,2,2008,0,0
1,20,80.0,9600,2,4,4,3,6,8,1976,...,0,0,0,0,0,0,5,2007,0,0
2,60,68.0,11250,2,3,4,3,7,5,2001,...,42,0,0,0,0,0,9,2008,0,0
3,70,60.0,9550,2,3,4,3,7,5,1915,...,35,272,0,0,0,0,2,2006,0,0
4,60,84.0,14260,2,3,4,3,8,5,2000,...,84,0,0,0,0,0,12,2008,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60,62.0,7917,2,4,4,3,6,5,1999,...,40,0,0,0,0,0,8,2007,0,0
1454,20,85.0,13175,2,4,4,3,6,6,1978,...,0,0,0,0,0,0,2,2010,0,0
1455,70,66.0,9042,2,4,4,3,7,9,1941,...,60,0,0,0,0,2500,5,2010,0,0
1456,20,68.0,9717,2,4,4,3,5,6,1950,...,0,112,0,0,0,0,4,2010,0,0
