# Feature Extraction and Selection Techniques
By: Hugo D. Lopes
## Objective of the notebook
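Understand PCA and check how standardization, robust scaling and min-max scaling affect it. Evaluate the classification results of a Random Forest (RF) and a Logistic Regression (Logit) for each preprocessing variant.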

In [30]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

import matplotlib.pyplot as plt
import seaborn as sns

from codefiles.datagen import random_xy, x_plus_noise, data_3d
from codefiles.dataplot import plot_principal_components, plot_3d, plot_2d
%matplotlib inline
# %matplotlib notebook

# Load Credit Default dataset from Kaggle

In [2]:
def load_example_data(n_rows=10000, csv_path='cs-training.csv', random_state=1):
    """
    Load the credit-default CSV and return features, target and a train/test split.

    Rows containing any NaN are dropped, then only the first ``n_rows``
    remaining rows (in file order, no shuffling) are kept.

    Parameters
    ----------
    n_rows : int
        Maximum number of rows to keep after dropping incomplete rows.
    csv_path : str
        Path of the CSV file; its first column is used as the index.
    random_state : int
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X, y, X_train, y_train, X_test, y_test
        The reduced feature frame and target series, followed by their
        80% / 20% train/test partitions.
    """
    # dropna() returns a new frame; no in-place mutation of the loaded data.
    data = pd.read_csv(csv_path, index_col=0).dropna()

    # Get target column
    y = data['SeriousDlqin2yrs']
    # Get features columns: everything after the first column, selected by
    # position. `.iloc` replaces the removed `.ix` indexer and, like the
    # original, assumes the target is the first column of the file.
    X = data.iloc[:, 1:]

    # Reduce dataset to the first n_rows rows
    X = X.iloc[:n_rows]
    y = y.iloc[:n_rows]
    print('Number of features (columns):', X.shape[1])
    print('Number of loans (rows):', X.shape[0])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=random_state)
    return X, y, X_train, y_train, X_test, y_test

In [71]:
# Load up to 40,000 rows and unpack the full data plus its train/test split.
X, y, X_train, y_train, X_test, y_test = load_example_data(n_rows=40000)
# Display the first 8 rows of the training features as the cell output.
X_train.head(8)

Number of features (columns): 10
Number of loans (rows): 40000


Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
27155,0.154235,58,1,0.982948,4104.0,13,0,2,0,0.0
19493,0.3976,59,0,0.86964,4080.0,11,0,3,0,0.0
35990,0.163177,56,0,0.011945,6529.0,1,0,0,0,3.0
241,0.973339,61,1,0.209962,5500.0,11,0,0,1,1.0
45892,0.082601,35,0,0.114269,5066.0,8,0,0,0,0.0
47558,0.309823,41,0,0.350112,5800.0,10,0,1,0,1.0
8916,0.235751,48,0,0.752071,3500.0,11,0,2,0,0.0
42000,0.567135,44,0,0.197518,2900.0,4,0,0,0,0.0


# Configs

In [72]:
# Number of trees used by every RandomForestClassifier in this notebook.
n_estimators = 300
# Number of principal components requested from every PCA fit below.
# NOTE(review): the recorded explained_variance_ratio_ outputs list 10 values,
# which suggests they were produced with a different setting — confirm.
n_components = 9 # for PCA

# No feature engineering

In [73]:
# Baseline: fit both models on the raw features (no scaling, no PCA).
# random_state is fixed so the forest produces the same AUC on every run.
clf_base = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_train, y_train)
# Column 1 of predict_proba is the score of the second class in clf.classes_.
print('RF AU ROC = %1.4f' % roc_auc_score(y_test, clf_base.predict_proba(X_test)[:, 1]))

# NOTE: clf_base is rebound here; after this cell it refers to the Logit model.
clf_base = LogisticRegression().fit(X_train, y_train)
print('Logit AU ROC = %1.4f' % roc_auc_score(y_test, clf_base.predict_proba(X_test)[:, 1]))

RF AU ROC = 0.8281
Logit AU ROC = 0.6809


# PCA Without Scaling

In [74]:
# Fit PCA directly on the unscaled training features (fit() returns the
# estimator itself), then display the variance share of each component.
pca_no_scale = PCA(n_components=n_components).fit(X_train)
pca_no_scale.explained_variance_ratio_

array([  9.97691574e-01,   1.89592555e-03,   4.08737375e-04,
         2.82343600e-06,   5.64783457e-07,   3.38781026e-07,
         1.75785693e-08,   1.28492405e-08,   4.14142207e-09,
         1.55078696e-09])

In [75]:
# Project each split once with the train-fitted PCA and reuse it for both models.
X_train_pca_raw = pca_no_scale.transform(X_train)
X_test_pca_raw = pca_no_scale.transform(X_test)

# random_state is fixed so the forest produces the same AUC on every run.
clf_pure_pca = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_train_pca_raw, y_train)
print('RF AU ROC = %1.4f' % roc_auc_score(y_test, clf_pure_pca.predict_proba(X_test_pca_raw)[:, 1]))

# NOTE: clf_pure_pca is rebound here; after this cell it refers to the Logit model.
clf_pure_pca = LogisticRegression().fit(X_train_pca_raw, y_train)
print('Logit AU ROC = %1.4f' % roc_auc_score(y_test, clf_pure_pca.predict_proba(X_test_pca_raw)[:, 1]))

RF AU ROC = 0.8223
Logit AU ROC = 0.6874


# PCA With Standardization

In [76]:
# Fit the standardizer on the training split only; the test split is later
# transformed with these same fitted statistics.
scaler = StandardScaler().fit(X_train)

In [77]:
# Fit PCA on the standardized training features and display the variance
# share of each component.
pca_scaled = PCA(n_components=n_components).fit(scaler.transform(X_train))
pca_scaled.explained_variance_ratio_

array([ 0.29868149,  0.16014121,  0.12267758,  0.10299721,  0.10004885,
        0.08495137,  0.07259661,  0.05491852,  0.0021651 ,  0.00082204])

In [78]:
# NOTE: `scaler` and `pca_scaled` are rebound by later sections of this
# notebook; run this cell right after the StandardScaler and PCA cells above.
# Scale + project each split once and reuse the result for both models.
X_train_std_pca = pca_scaled.transform(scaler.transform(X_train))
X_test_std_pca = pca_scaled.transform(scaler.transform(X_test))

# random_state is fixed so the forest produces the same AUC on every run.
clf_stdz = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_train_std_pca, y_train)
print('RF AU ROC = %1.4f' % roc_auc_score(y_test, clf_stdz.predict_proba(X_test_std_pca)[:, 1]))

# NOTE: clf_stdz is rebound here; after this cell it refers to the Logit model.
clf_stdz = LogisticRegression().fit(X_train_std_pca, y_train)
print('Logit AU ROC = %1.4f' % roc_auc_score(y_test, clf_stdz.predict_proba(X_test_std_pca)[:, 1]))

RF AU ROC = 0.7955
Logit AU ROC = 0.6874


# PCA With Robust Standardization

In [85]:
# Fit the robust scaler on the training split only.
# NOTE: this rebinds `scaler`, replacing the StandardScaler fitted earlier.
scaler = RobustScaler().fit(X_train)

In [86]:
# Build a whitening PCA, fit it on the robust-scaled training features and
# display the variance share of each component.
# NOTE(review): whiten=True is only used in this section — confirm it is intended.
pca_scaled = PCA(n_components=n_components, whiten=True)
pca_scaled.fit(scaler.transform(X_train))
pca_scaled.explained_variance_ratio_

array([  9.23409059e-01,   7.65555424e-02,   3.13530306e-05,
         2.36476943e-06,   6.18812374e-07,   3.69731762e-07,
         2.26675476e-07,   2.14177987e-07,   1.65670381e-07,
         8.60648938e-08])

In [87]:
# NOTE: `scaler` and `pca_scaled` are shared names across sections; run this
# cell right after the RobustScaler and PCA cells above.
# Scale + project each split once and reuse the result for both models.
X_train_rob_pca = pca_scaled.transform(scaler.transform(X_train))
X_test_rob_pca = pca_scaled.transform(scaler.transform(X_test))

# random_state is fixed so the forest produces the same AUC on every run.
clf_rob_stdz = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_train_rob_pca, y_train)
print('RF AU ROC = %1.4f' % roc_auc_score(y_test, clf_rob_stdz.predict_proba(X_test_rob_pca)[:, 1]))

# NOTE: clf_rob_stdz is rebound here; after this cell it refers to the Logit model.
clf_rob_stdz = LogisticRegression().fit(X_train_rob_pca, y_train)
print('Logit AU ROC = %1.4f' % roc_auc_score(y_test, clf_rob_stdz.predict_proba(X_test_rob_pca)[:, 1]))

RF AU ROC = 0.8342
Logit AU ROC = 0.6872


In [None]:
# Output apparently kept from an earlier run of the robust-scaling cell above:
# RF AU ROC = 0.8336
# Logit AU ROC = 0.6872

# PCA With Min-Max Scaling

In [82]:
# Fit the min-max scaler on the training split only; the test split is later
# transformed with the same fitted minimum/maximum.
min_max_scaler = MinMaxScaler().fit(X_train)

In [83]:
# Fit PCA on the min-max-scaled training features and display the variance
# share of each component.
pca_scaled = PCA(n_components=n_components).fit(min_max_scaler.transform(X_train))
pca_scaled.explained_variance_ratio_

array([  6.46422127e-01,   1.65358158e-01,   8.60246430e-02,
         6.33043733e-02,   2.83101376e-02,   4.30603564e-03,
         3.41502540e-03,   1.98375497e-03,   6.37256975e-04,
         2.38487467e-04])

In [84]:
# NOTE: `pca_scaled` is a name shared across sections; run this cell right
# after the MinMaxScaler and PCA cells above.
# Scale + project each split once and reuse the result for both models.
X_train_mm_pca = pca_scaled.transform(min_max_scaler.transform(X_train))
X_test_mm_pca = pca_scaled.transform(min_max_scaler.transform(X_test))

# random_state is fixed so the forest produces the same AUC on every run.
clf_minmax = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_train_mm_pca, y_train)
print('RF AU ROC = %1.4f' % roc_auc_score(y_test, clf_minmax.predict_proba(X_test_mm_pca)[:, 1]))

# NOTE: clf_minmax is rebound here; after this cell it refers to the Logit model.
clf_minmax = LogisticRegression().fit(X_train_mm_pca, y_train)
print('Logit AU ROC = %1.4f' % roc_auc_score(y_test, clf_minmax.predict_proba(X_test_mm_pca)[:, 1]))

RF AU ROC = 0.7958
Logit AU ROC = 0.6348


# Main conclusions
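- PCA with Min-Max scaling gives the worst Logit result (0.6348) and an RF result (0.7958) no better than PCA with standardization (0.7955), both below the baseline without PCA. Do not do this.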
- Robust Scaler gives better results than Standard Scaler
- The baseline test (no scaling, no PCA) is still very good, and the small, occasional gains in performance probably do not justify the use of PCA with this dataset.