## Feature selection

**Create three of your own functions to filter out features based on the following criteria:**

* variance lower than *x*
* percentage of missing values greater than *x*
* one feature from each pair of features whose correlation with each other is greater than *x*

**Data Source:** output of the feature engineering exercise from last week.

___________________

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
# Load data/train.csv (path is relative to the notebook's working directory)
# and preview its first rows.
# NOTE(review): the functions defined below also use `x` as a parameter name,
# which shadows this global inside them.
x = pd.read_csv('data/train.csv')
x.head()

## Remove features whose share of missing values is greater than x

In [None]:
def remove_missing_values(x, missing_percent_drop_threshold=0.5):
    """Drop columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame. It is not modified.
    missing_percent_drop_threshold : float, default 0.5
        Fraction of missing values expressed out of 1 (0.5 means 50 %). A
        column is dropped only when its missing fraction is strictly greater
        than this value, so a column sitting exactly on the threshold is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns in their original order.
    """
    # isnull().mean() is the per-column fraction of missing cells
    # (missing count divided by the number of rows).
    missing_fraction = x.isnull().mean()
    to_drop = missing_fraction[missing_fraction > missing_percent_drop_threshold].index
    # Return the result of a non-inplace drop: drop(..., inplace=True) returns
    # None and would also alter the caller's frame.
    return x.drop(columns=to_drop)

## Remove features with small variance
(within column)

In [24]:
# VarianceThreshold is used by remove_small_variance, defined in the next cell.
from sklearn.feature_selection import VarianceThreshold
# Load the frame used in the variance and correlation examples below
# (presumably all-numeric, per its name — confirm against the file).
df_numeric =pd.read_csv("data/df_numeric.csv")

In [21]:
# Assumptions - target variable removed, df is numeric
# import:
# from sklearn.feature_selection import VarianceThreshold 
def remove_small_variance(x, variance_threshold = 0.1):
    """Filter out low-variance columns using scikit-learn's VarianceThreshold.

    Assumes ``x`` is an all-numeric DataFrame with the target column already
    removed, and that ``VarianceThreshold`` has been imported from
    ``sklearn.feature_selection`` before the call.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame. It is not modified.
    variance_threshold : float, default 0.1
        Threshold handed to ``VarianceThreshold``; columns that do not meet it
        are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns (original names and order) and
        the same row index as ``x``. The values are taken from the selector's
        transformed output rather than from ``x`` itself.
    """
    vt = VarianceThreshold(variance_threshold)
    kept_values = vt.fit_transform(x)
    # get_support() is a boolean mask over x's columns marking the kept ones.
    kept_columns = x.columns[vt.get_support()]
    # Reuse the input's index: without it the rows are renumbered 0..n-1 and no
    # longer line up with x or with a target held in a separate object.
    return pd.DataFrame(kept_values, columns=kept_columns, index=x.index)

In [35]:
# df_numeric.shape

In [26]:
# Apply the variance filter with its default threshold (0.1) and display the result.
# NOTE(review): the output below still lists SalePrice and an 'Unnamed: 0' column,
# so the target and what is presumably a leftover CSV index column appear to be
# in df_numeric — confirm whether they should be dropped before filtering.
remove_small_variance(df_numeric)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,LotFrontage_missing_ind,1stFlr_2ndFlr_SF,OverallGrade
0,60.0,65.0,8450.0,4.0,7.0,5.0,2003.0,2003.0,196.0,4.0,...,0.0,0.0,0.0,0.0,2.0,2008.0,208500.0,0.0,1710.0,35.0
1,20.0,80.0,9600.0,4.0,6.0,8.0,1976.0,1976.0,0.0,3.0,...,0.0,0.0,0.0,0.0,5.0,2007.0,181500.0,0.0,1262.0,48.0
2,60.0,68.0,11250.0,3.0,7.0,5.0,2001.0,2002.0,162.0,4.0,...,0.0,0.0,0.0,0.0,9.0,2008.0,223500.0,0.0,1786.0,35.0
3,70.0,60.0,9550.0,3.0,7.0,5.0,1915.0,1970.0,0.0,3.0,...,0.0,0.0,0.0,0.0,2.0,2006.0,140000.0,0.0,1717.0,35.0
4,60.0,84.0,14260.0,3.0,8.0,5.0,2000.0,2000.0,350.0,4.0,...,0.0,0.0,0.0,0.0,12.0,2008.0,250000.0,0.0,2198.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60.0,62.0,7917.0,4.0,6.0,5.0,1999.0,2000.0,0.0,3.0,...,0.0,0.0,0.0,0.0,8.0,2007.0,175000.0,0.0,1647.0,30.0
1454,20.0,85.0,13175.0,4.0,6.0,6.0,1978.0,1988.0,119.0,3.0,...,0.0,0.0,0.0,0.0,2.0,2010.0,210000.0,0.0,2073.0,36.0
1455,70.0,66.0,9042.0,4.0,7.0,9.0,1941.0,2006.0,0.0,5.0,...,0.0,0.0,0.0,2500.0,5.0,2010.0,266500.0,0.0,2340.0,63.0
1456,20.0,68.0,9717.0,4.0,5.0,6.0,1950.0,1996.0,0.0,3.0,...,0.0,0.0,0.0,0.0,4.0,2010.0,142125.0,0.0,1078.0,30.0


## Remove correlated features
(between columns)


In [36]:
def remove_correlated_features(df, correlation_threshold=0.8):
    """Drop one column from every pair of highly correlated columns.

    Assumes ``df`` is all numeric and that the target column has already been
    removed.

    All pairs are evaluated on the correlation matrix of the full input, before
    anything is dropped. For each pair whose absolute correlation is strictly
    greater than ``correlation_threshold``, the column that appears later in
    ``df`` is removed and the earlier one is kept. A column is therefore
    removed as soon as it is correlated with any earlier column, even if that
    earlier column is itself removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    correlation_threshold : float, default 0.8
        Cut-off applied to the absolute value of the correlation coefficient.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.
    """
    corr = df.corr().abs()

    # Strict upper triangle (k=1): each unordered pair is looked at once and
    # the diagonal (a column against itself) is skipped.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    too_correlated = (corr.to_numpy() > correlation_threshold) & upper

    # In the upper triangle the row is the earlier column, so reducing over
    # rows flags every column that has a highly correlated predecessor.
    to_drop = corr.columns[too_correlated.any(axis=0)]

    # Non-inplace drop so the caller's frame keeps all of its columns.
    return df.drop(columns=to_drop)

In [28]:
# Shape before the correlation filter, for comparison with the filtered result below.
df_numeric.shape

(1458, 60)

In [31]:
# Filter df_numeric with the default correlation threshold (0.8).
df = remove_correlated_features(df_numeric)

In [32]:
# Shape after the correlation filter; the row count should be unchanged, only columns are removed.
df.shape

(1458, 52)