In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

# Silence every warning for the remainder of the session
warnings.filterwarnings('ignore')

# Global matplotlib defaults, applied in one update
mpl.rcParams.update({
    'savefig.dpi': 128,
    'figure.dpi': 128,
    'figure.figsize': (14, 7),     # plot size 14" x 7"
    'font.size': 14,
    'axes.spines.top': False,      # do not draw the top frame line
    'axes.spines.right': False,    # do not draw the right frame line
    'axes.grid': False,            # no grid lines
    'axes.facecolor': 'white',     # white background
})

In [2]:
# Read the raw data file (path is relative to the notebook's working directory)
csv_path = "2009-2017.csv"
df = pd.read_csv(csv_path)

In [3]:
# Cast the loan amount column to integers, in place on df
loan_col = 'loan amount'
df[loan_col] = df[loan_col].astype(int)

## Get features and target

In [4]:
from sklearn.preprocessing import LabelEncoder

# Name of the label column; every other column of df is treated as a feature.
target = 'enquiry status'

# Feature frame: all columns of df except the label column.
X = df.loc[:, df.columns != target]

# Encode the label values as integer class ids. The fitted encoder stays in
# `le` so the ids can be mapped back to the original labels via le.classes_.
le = LabelEncoder()
y = le.fit_transform(df[target])

## Encoding categorical data

In [5]:
# One-hot encode the categorical feature columns; X is rebound to the encoded frame
X = pd.get_dummies(data=X)

In [6]:
# (rows, columns) of the feature matrix after encoding
X.shape

(268639, 60)

## Handling Highly Correlated Features

In [7]:
# Absolute pairwise correlations between the numeric columns of df.
# Non-numeric columns are excluded explicitly: older pandas dropped them
# silently inside corr(), whereas pandas >= 2.0 raises on them.
# NOTE(review): this inspects the raw frame `df`, not the one-hot encoded
# feature matrix `X` - confirm that this is the intended input.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_df.corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so that every
# pair of columns is considered once. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with some earlier column is greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [8]:
# Columns flagged by the 0.95 correlation threshold
to_drop

[]

## Reducing features by maximizing class separability

## Recursively eliminating features

In [9]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# The selector is fitted on the notebook's own X / y. No synthetic data is
# generated here, and `target` is deliberately not reassigned: it holds the
# name of the label column.

# Linear regression used as the estimator inside the selector.
# NOTE(review): `y` holds label-encoded classes, yet the estimator is a linear
# regression scored by negative MSE - confirm a classifier is not what is wanted.
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation, one feature per step
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")

# Fit the selector and keep only the columns it selected
test = rfecv.fit_transform(X, y)

In [10]:
# Feature matrix reduced to the columns kept by RFECV
test

array([[12,  6,  0, ...,  0,  0,  1],
       [ 1,  6,  3, ...,  0,  0,  1],
       [ 7, 27,  2, ...,  0,  0,  1],
       ..., 
       [ 1, 31,  6, ...,  0,  0,  1],
       [ 1, 31,  6, ...,  1,  0,  0],
       [ 1, 31,  6, ...,  0,  0,  1]])

## Removing irrelevant features for classification

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Keep the five features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=5)
chi2_selector.fit(X, y)
features_kbest = chi2_selector.transform(X)

# Show results
n_original = X.shape[1]
n_reduced = features_kbest.shape[1]
print("Original number of features:", n_original)
print("Reduced number of features:", n_reduced)

Original number of features: 60
Reduced number of features: 5


In [12]:
# Boolean mask over X's columns marking the ones the selector kept
mask = chi2_selector.get_support(indices=False)
# Names of the kept columns
new_features = X.columns[mask]

In [13]:
# Values of the five columns kept by the chi-squared selector
features_kbest

array([[304000,      0,      1,      0,      0],
       [250000,      0,      0,      0,      0],
       [325000,      0,      1,      0,      0],
       ..., 
       [440000,      0,      0,      0,      0],
       [304000,      0,      0,      1,      0],
       [500000,      0,      0,      0,      0]])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as the test set. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0, stratify=y
)