In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import matplotlib
from distutils.version import LooseVersion
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer

Load CSV and extract a few (>2) features using PCA method

In [33]:
train_data = pd.read_csv('Kaggle-Give-Me-Some-Data/cs-training.csv')
train_data.drop('Unnamed: 0', axis=1) # delete ranks
train_data = train_data.fillna(train_data.mean())

In [34]:
# Split the dataset into features and target variable
y_train = train_data['SeriousDlqin2yrs']
X_train = train_data.drop('SeriousDlqin2yrs', axis=1)

### Standardize the train and test sample

In [35]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.fit_transform(X_train)

### Build pipelines for PCA & LR/SVM/Decision Tree

#### LR
1. I build a pipeline that consists of a pca and a logisitic regression classifier sequentially.
2. Then I apply the `GridSearchCV` to find the best hyper-parameter (here for LR is `C`). Note that I had applied 5-fold cross-validation by setting the `cv` as `5`.

In [41]:
pipe_lr = make_pipeline(PCA(n_components=4),
                        LogisticRegression(random_state=1, solver='lbfgs'))

param_lr = {'logisticregression__C': [10 ** -2, 10 ** -1, 10 ** 0, 10 ** 1, 10 ** 2]}
gs = GridSearchCV(estimator=pipe_lr,
                  param_grid=param_lr,
                  scoring='accuracy',
                  cv=5)
scores = cross_val_score(gs, X_train, y_train, 
                         scoring='accuracy', cv=5)
print(scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores),
                                      np.std(scores)))


[0.93313333 0.93306667 0.93333333 0.9336     0.93346667]
CV accuracy: 0.933 +/- 0.000


#### SVM

For svm, I didn't employ the `cross_val_score` function as it takes me about $1300$ minutes to complete the grid search for only two values of hyperparameter `C`. Please check the following screenshot to find more details.

![1](./svm_pain.png)

In [45]:
pipe_svc = make_pipeline(PCA(n_components=4),
                        SVC(random_state=1))

param_range = [0.1, 1.0]

param_grid = {'svc__C': param_range, 
               'svc__kernel': ['linear']}

gs = GridSearchCV(estimator=pipe_svc, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# scores = cross_val_score(gs, X_train, y_train, 
#                          scoring='accuracy', cv=5)
# print(scores)
# print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores),
#                                       np.std(scores)))

0.9331800000000001
{'svc__C': 0.1, 'svc__kernel': 'linear'}


#### Decision Tree

In [44]:
pipe_dt = make_pipeline(PCA(n_components=4),
                        DecisionTreeClassifier(random_state=1))
print(pipe_dt)

param_dt = [{'decisiontreeclassifier__max_depth': [1, 2, 3, 4, 5, 6, 7, None]}]

gs = GridSearchCV(estimator=pipe_dt,
                  param_grid=param_dt,
                  scoring='accuracy',
                  cv=5)

gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

scores = cross_val_score(gs, X_train, y_train, 
                         scoring='accuracy', cv=5)
print(scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores),
                                      np.std(scores)))


Pipeline(steps=[('pca', PCA(n_components=4)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(random_state=1))])
0.9359
{'decisiontreeclassifier__max_depth': 4}
[0.9352     0.93583333 0.93563333 0.93713333 0.93613333]
CV accuracy: 0.936 +/- 0.001
