In [298]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz


In [299]:
#Load data file 
df = pd.read_csv('/Users/james1/Documents/data.csv', skiprows=[0])
#drop first column - ID
df = df.iloc[: , 1:]
#view data
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [300]:
#Inspect data type

df.dtypes

LIMIT_BAL                     int64
SEX                           int64
EDUCATION                     int64
MARRIAGE                      int64
AGE                           int64
PAY_0                         int64
PAY_2                         int64
PAY_3                         int64
PAY_4                         int64
PAY_5                         int64
PAY_6                         int64
BILL_AMT1                     int64
BILL_AMT2                     int64
BILL_AMT3                     int64
BILL_AMT4                     int64
BILL_AMT5                     int64
BILL_AMT6                     int64
PAY_AMT1                      int64
PAY_AMT2                      int64
PAY_AMT3                      int64
PAY_AMT4                      int64
PAY_AMT5                      int64
PAY_AMT6                      int64
default payment next month    int64
dtype: object

In [301]:
# Data preprocessing

## Drop 0 value for education and marriage - no 0 described in variable documentation. Total amount of data dropped is 399 entries, %1.33 data

df = df[df['EDUCATION'] != 0]
df = df[df['EDUCATION'] != 5]
df = df[df['EDUCATION'] != 6]
df = df[df['MARRIAGE'] != 0]

## Make X all predictors and y the target variable

X = df.iloc[:,:-1]
y = df.iloc[:,-1]


## Independent Variables

### numerical: X1, X5, X6-11, X12-17, X18-23
### binary: X2
### categorical with >2 categories: X3, X4 -> change into dummies

X = pd.get_dummies(data=X, columns= ["SEX", "EDUCATION", "MARRIAGE"], drop_first = True)

# Split model intro training and test

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.3, random_state = 3)

In [302]:
X.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 29601 entries, 0 to 29999
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   LIMIT_BAL    29601 non-null  int64
 1   AGE          29601 non-null  int64
 2   PAY_0        29601 non-null  int64
 3   PAY_2        29601 non-null  int64
 4   PAY_3        29601 non-null  int64
 5   PAY_4        29601 non-null  int64
 6   PAY_5        29601 non-null  int64
 7   PAY_6        29601 non-null  int64
 8   BILL_AMT1    29601 non-null  int64
 9   BILL_AMT2    29601 non-null  int64
 10  BILL_AMT3    29601 non-null  int64
 11  BILL_AMT4    29601 non-null  int64
 12  BILL_AMT5    29601 non-null  int64
 13  BILL_AMT6    29601 non-null  int64
 14  PAY_AMT1     29601 non-null  int64
 15  PAY_AMT2     29601 non-null  int64
 16  PAY_AMT3     29601 non-null  int64
 17  PAY_AMT4     29601 non-null  int64
 18  PAY_AMT5     29601 non-null  int64
 19  PAY_AMT6     29601 non-null  int64
 20  SEX_2 

In [303]:
## Linear SVC classifier, Standard Scaler

pipeline_LinearSVC = Pipeline([('scaler', StandardScaler()), ('LinearSVC', LinearSVC())])
paramaters_SVC_grid = {'LinearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipeline_LinearSVC, param_grid = paramaters_SVC_grid, cv = 5)
grid.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('LinearSVC', LinearSVC())]),
             param_grid={'LinearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [304]:
print(f"Best cross-validation accuracy: {grid.best_score_}")
print(f"Test set score: {grid.score(X_test, y_test)}")
print(f"Best parameters: {grid.best_params_}")     

Best cross-validation accuracy: 0.8005791505791505
Test set score: 0.802724918365049
Best parameters: {'LinearSVC__C': 1}


In [305]:
# Linear SVC classifier, MinMax Scaler

pipeline_LinearSVC = Pipeline([('scaler', MinMaxScaler()), ('LinearSVC', LinearSVC())])
paramaters_SVC_grid = {'LinearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid= GridSearchCV(pipeline_LinearSVC, param_grid = paramaters_SVC_grid, cv = 5)
grid.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('LinearSVC', LinearSVC())]),
             param_grid={'LinearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [306]:
print(f"Best cross-validation accuracy: {grid.best_score_}")
print(f"Test set score: {grid.score(X_test, y_test)}")
print(f"Best parameters: {grid.best_params_}")

Best cross-validation accuracy: 0.8003861003861005
Test set score: 0.8024997185001689
Best parameters: {'LinearSVC__C': 1}


In [307]:
# Linear SVC, unscaled

paramaters_SVC_unscaled_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
LinearSVC_unscaled = LinearSVC()
LinearSVC_unscaled_cv = GridSearchCV(LinearSVC_unscaled,paramaters_SVC_unscaled_grid, cv = 5)
LinearSVC_unscaled_cv.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [308]:
print(f"Best cross-validation accuracy: {LinearSVC_unscaled_cv.best_score_}")
print("Test set score: {:.2f}".format(LinearSVC_unscaled_cv.score(X_test, y_test)))
print(f"Best parameters: {LinearSVC_unscaled_cv.best_params_}")

Best cross-validation accuracy: 0.7324324324324324
Test set score: 0.77
Best parameters: {'C': 0.001}


# Report

Scaling improves test score, 0.77 without vs 0.802724918365049 for Standard Scaler and 0.8024997185001689 for Minmax.  MinMax and Standard scaler both have optimal C of 1, then unscaled has C = 0.001. This makes sense, as unscaled model will have higher variance in value of coefficients and thus likely require more regularisation. 


In [309]:
# Logistic Regression, Standard Scaler

pipeline_LogReg = Pipeline([('scaler', StandardScaler()), ('LogReg', LogisticRegression())])
paramaters_LogReg_grid = {'LogReg__C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipeline_LogReg, param_grid = paramaters_LogReg_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('LogReg', LogisticRegression())]),
             param_grid={'LogReg__C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [310]:
print(f"Best cross-validation accuracy: {grid.best_score_}")
print(f"Test set score: {grid.score(X_test, y_test)}")
print(f"Best parameters: {grid.best_params_}")

Best cross-validation accuracy: 0.8072876447876448
Test set score: 0.8125211124873325
Best parameters: {'LogReg__C': 1}


In [311]:
#Logistic Regression, MinMax

pipeline_LogReg = Pipeline([('scaler', MinMaxScaler()), ('LogReg', LogisticRegression())])
paramaters_LogReg_grid = {'LogReg__C': [0.001,0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipeline_LogReg, param_grid=paramaters_LogReg_grid, cv=5)
grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('LogReg', LogisticRegression())]),
             param_grid={'LogReg__C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [312]:
print(f"Best cross-validation accuracy: {grid.best_score_}")
print(f"Test set score: {grid.score(X_test, y_test)}")
print(f"Best parameters: {grid.best_params_}")

Best cross-validation accuracy: 0.8077220077220078
Test set score: 0.8122959126224524
Best parameters: {'LogReg__C': 10}


In [313]:
# Logistic Regression, No Scaler

paramaters_LogReg_unscaled_grid = {'C': [0.001,0.01, 0.1, 1, 10, 100]}
LogReg_unscaled = LogisticRegression()
grid = GridSearchCV(LogReg_unscaled,paramaters_LogReg_unscaled_grid, cv=5)
grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [314]:
print(f"Best cross-validation accuracy: {grid.best_score_}")
print(f"Test set score: {grid.score(X_test, y_test)}")
print(f"Best parameters: {grid.best_params_}")

Best cross-validation accuracy: 0.7767857142857142
Test set score: 0.7767143339713997
Best parameters: {'C': 0.1}


# Report

 The StandardScaler gives the highest test set score of 0.8122959126224524, compared to 0.8125211124873325 for MinMax and 0.7767143339713997 for unscaled. The StandardScaler has optimal parameter of 1, for MinMax this is 10 and for unscaled model it is also 0.1.

In [315]:
# Knn Neighbors Classifier, Standard Scaler

pipeline_Knn = Pipeline([('scaler', StandardScaler()), ('Knn', KNeighborsClassifier())])
paramaters_Knn_grid = {'Knn__n_neighbors': np.arange(1,50)}
grid = GridSearchCV(pipeline_Knn, param_grid = paramaters_Knn_grid, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('Knn', KNeighborsClassifier())]),
             param_grid={'Knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [316]:
print(f"Best parameters: {grid.best_params_}")
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))

Best parameters: {'Knn__n_neighbors': 31}
Test set score: 0.81


In [317]:
# Knn Neighbors Classifier, MinMax

pipeline_Knn = Pipeline([('scaler', MinMaxScaler()), ('Knn', KNeighborsClassifier())])
paramaters_Knn_grid = {'Knn__n_neighbors': np.arange(1,50)}
grid = GridSearchCV(pipeline_Knn, param_grid = paramaters_Knn_grid, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('Knn', KNeighborsClassifier())]),
             param_grid={'Knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [318]:
print(f"Best parameters: {grid.best_params_}")
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))

Best parameters: {'Knn__n_neighbors': 21}
Test set score: 0.81


In [319]:
# Knn Neighbours Classifier, Unscaled

paramaters_Knn_unscaled_grid = {'n_neighbors': np.arange(1,50)}
Knn_unscaled = KNeighborsClassifier()
Knn_unscaled_cv = GridSearchCV(Knn_unscaled,paramaters_Knn_unscaled_grid, cv = 5)
Knn_unscaled_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [320]:
print(f"Best parameters: {Knn_unscaled_cv.best_params_}")
print("Test set score: {:.2f}".format(Knn_unscaled_cv.score(X_test, y_test)))

Best parameters: {'n_neighbors': 34}
Test set score: 0.78


# Report

Standard scaler and MinMax both have test set score accuracy of 0.81, whilst with unscaled the accuracy score is less, 0.78. For Standard Scaler, the optimal K was 31 whilst it was lower for for MinMax, with K = 21. The unscaled model had optimal K of 34. So scaling meant both higher accuracy and lower value for parameter K.



In [321]:
# Decision tree

# a)/b)

tree = DecisionTreeClassifier(random_state = 0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.999
Accuracy on test set: 0.726


In [None]:
## c

Decision tree models are prone to overfitting when their depth and number of features are not 
restricted. This model considers all features and continues all consequtive questions until all 
leaves are pure, which means that the tree is 100% accurate on the training set and therefore 
overfits the data. This can be prevented by finding a suitable depth and attribute count for the
model.

In [322]:
## d

tree_para = {'max_features':["auto","sqrt", "log2"],
             'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
clf = GridSearchCV(DecisionTreeClassifier(), tree_para, cv=5)
clf.fit(X_train, y_train)

print(f"Best parameters: {clf.best_params_}")
print(f"Accuracy on train: {clf.score(X_train, y_train)}")
print(f"Accuracy on test: {clf.score(X_test, y_test)}")

Best parameters: {'max_depth': 5, 'max_features': 'sqrt'}
Accuracy on train: 0.8166023166023166
Accuracy on test: 0.8160117103929737


# Report

When using the optimal values for max_depth and max_features - 5 and sqrt respectively - , the number of consecutive calls and features is restricted, and the model doesn’t overfit the train set anymore. As expected, the train set accuracy drops and the test set accuracy increases.

In [323]:
# Random Forest

## a and b

forest = RandomForestClassifier(n_estimators = 5, random_state = 2)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)
forest_pred = forest.predict(X_test)

print(f"Accuracy on train: {forest.score(X_train, y_train)}")
print(f"Accuracy on test: {forest.score(X_test, y_test)}")

Accuracy on train: 0.9701254826254826
Accuracy on test: 0.7816687309987614


In [None]:
## c.i 


The RandomForestClassifier is a collection of DecisionTrees, where the result is averaged. If n_estimators = 1, RandomForest = DecisionTree. This helps against overfitting of the data.

## c.ii

RandomForest is harder to visualize than DecisionTrees. So when presenting the model, it might be prefered to show one DecisionTree


In [None]:
# c.iii

Y = 1 is around 78% of the data. This is pretty close to the accuracies estimated from our models. Therefore accuracy would not be the best metric to evaluate our models.



In [325]:
#c.iv

forest_grid = RandomForestClassifier()
forest_estimators = {'n_estimators': [100, 250, 500], 
                     'max_features': ["auto", "sqrt", "log2"], 
                     'max_depth':  [5, 8, 15]}

forest_estimators_grid = GridSearchCV(forest_grid, forest_estimators)

forest_estimators_grid.fit(X_train, y_train)
print(f"Best parameters: {forest_estimators_grid.best_params_}")
print(f"Accuracy on train: {forest_estimators_grid.score(X_train, y_train)}")
print(f"Accuracy on test: {forest_estimators_grid.score(X_test, y_test)}")

Best parameters: {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 100}
Accuracy on train: 0.8382722007722008
Accuracy on test: 0.8217543069474158


# Report 

From the above models, we can conclude that the training accuracy dropped after using the optimal parameter values, however, test accuracy improved significantly. This is a positive  and was expected. As the number of features increases, the model uses smaller sample data to model the trees which counters overfitting and thus test accuracy improved.