# Classification with Python

In [None]:
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

## Download and read the dataset

In [None]:
# Download the training data and store it in the working directory as loan_train.csv.
!wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv

In [None]:
# Load the downloaded training CSV into a DataFrame and preview the first rows.
df = pd.read_csv('loan_train.csv')
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

## Convert to date time object

In [None]:
# Parse both date columns into pandas datetime values, then preview the frame.
for date_col in ('effective_date', 'due_date'):
    df[date_col] = pd.to_datetime(df[date_col])
df.head()

## Data visualisation and analysis

In [None]:
# Number of loans per target class (shows how balanced the classes are).
df['loan_status'].value_counts()

In [None]:
# Number of loans per distinct principal amount.
df['Principal'].value_counts()

In [None]:
# Histogram of the principal, one panel per gender, coloured by loan status.
principal_bins = np.linspace(0, df.Principal.max(), 10)
g = sns.FacetGrid(df, col='Gender', hue='loan_status', col_wrap=2)
g.map(plt.hist, 'Principal', bins=principal_bins, ec='k')
# The legend is attached to the last panel only.
g.axes[-1].legend()
plt.show()

In [None]:
# Histogram of borrower age, one panel per gender, coloured by loan status.
age_bins = np.linspace(df.age.min(), df.age.max(), 10)
g = sns.FacetGrid(df, col='Gender', hue='loan_status', col_wrap=2)
g.map(plt.hist, 'age', bins=age_bins, ec='k')
# The legend is attached to the last panel only.
g.axes[-1].legend()
plt.show()

## Pre-processing: Feature selection/extraction

## The day of the week people get the loan

In [None]:
# Day of the week on which the loan became effective (Monday=0, Sunday=6).
df['dayofweek'] = df['effective_date'].dt.dayofweek

# One bin per weekday, centred on the integer values 0..6.
# np.linspace(0, 6, 7) would yield only six bins and lump Saturday (5) and
# Sunday (6) together in the last one.
weekday_bins = np.arange(8) - 0.5

g = sns.FacetGrid(df, col="Gender", hue="loan_status", col_wrap=2)
g.map(plt.hist, 'dayofweek', bins=weekday_bins, ec="k")
g.axes[-1].legend()
plt.show()

According to the histogram above, people who get the loan on weekends seem more likely to default on loans, so we divide the days of the week into two groups: 1 stands for Friday, Saturday, and Sunday, and 0 stands for the rest.

In [None]:
# Flag loans taken Friday through Sunday (dayofweek 4, 5, 6) with 1, all other days with 0.
is_weekend = df['dayofweek'] > 3
df['weekend'] = is_weekend.astype(int)
df.head()

## Convert Categorical features to numerical values

In [None]:
# Share of each loan status within every gender (normalize=True gives proportions).
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

Convert male to 0 and female to 1:

In [None]:
# Encode gender numerically: male -> 0, female -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection df['Gender']: that chained
# in-place form is deprecated and does not modify df once pandas
# Copy-on-Write is active.
# Values not present in the mapping (including already-encoded 0/1) are kept
# as they are, so re-running the cell is harmless.
gender_codes = {'male': 0, 'female': 1}
df['Gender'] = df['Gender'].map(lambda g: gender_codes.get(g, g))
df.head()

In [None]:
# Share of each loan status within every education level (normalize=True gives proportions).
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

Feature before One Hot Encoding

In [None]:
# Columns used as model inputs before the education dummies are appended.
base_columns = ['Principal', 'terms', 'age', 'Gender', 'weekend']
Feature = df[base_columns]
Feature.head()

In [None]:
# One-hot encode the education level and append the indicator columns to the features.
education_dummies = pd.get_dummies(df['education'])
Feature = pd.concat([Feature, education_dummies], axis=1)
Feature.head()

## Feature selection

In [None]:
# Feature matrix: X refers to the same frame as Feature (no copy is made).
X = Feature
X[0:5]

In [None]:
# Target labels: the loan_status column as an array.
y = df['loan_status'].values
y[0:5]

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)
for split_label, features, labels in (('Train set:', X_train, y_train),
                                      ('Test set:', X_test, y_test)):
    print(split_label, features.shape, labels.shape)

## Normalise data

Data standardisation gives the data zero mean and unit variance (technically, the scaler should be fitted after the train/test split).

In [None]:
# Standardise every feature to zero mean and unit variance.
# X_train and X_test were split off before this cell, so scaling X alone would
# leave every model below training and validating on unscaled data. The same
# fitted scaler is therefore applied to both splits as well.
# NOTE: the statistics are estimated on all rows of X, as before; fitting on
# X_train only would avoid using validation rows for the estimate.
scaler = preprocessing.StandardScaler().fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X = scaler.transform(X)
X[0:5]

## Classification

## K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

## Training

Let's start the algorithm with k=4 for now.

In [None]:
k = 4
# Build the k-nearest-neighbours classifier, then fit it on the training split.
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)
neigh

## Predicting

Use the model to predict the test set.

In [None]:
# Predicted labels for the held-out split; show the first five.
yhat = neigh.predict(X_test)
yhat[0:5]

## Accuracy evaluation

In [None]:
from sklearn import metrics
# Accuracy on the data the model was fitted on versus the held-out split.
train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

In [None]:
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = []
# Fit one KNN model per k in 1..Ks-1 and record its accuracy on the held-out split.
for slot, n in enumerate(range(1, Ks)):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)

    # Slot n-1 holds the result for k=n.
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)

    # Standard deviation of the per-sample hit indicator divided by sqrt(sample count).
    hits = yhat == y_test
    std_acc[slot] = np.std(hits) / np.sqrt(yhat.shape[0])

mean_acc

In [None]:
# Accuracy per k with a shaded band of one standard deviation around it.
k_values = range(1, Ks)
plt.plot(k_values, mean_acc, 'g')
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.1)
# The band drawn above is +/- 1 std, so the legend must say so (it used to claim 3x).
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

In [None]:
# mean_acc[n-1] stores the score for k=n, so argmax()+1 converts the index back to k.
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

### Rebuild the model with optimal K

In [None]:
# Index of the best score plus one is the corresponding k.
k = mean_acc.argmax() + 1
# Refit the classifier with that k on the training split.
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)
neigh

In [None]:
# Predictions of the refitted model on the held-out split; show the first five.
yhat = neigh.predict(X_test)
yhat[0:5]

In [None]:
from sklearn import metrics
# Accuracy of the refitted model on the training data and on the held-out split.
train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

## Accuracy evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are left as zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    The function draws on the current matplotlib figure and returns None.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true samples has a zero row total; dividing by
        # it would fill the row with NaN (and make the colour threshold NaN),
        # so such rows are kept at 0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so that the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, yhat, labels=['COLLECTION', 'PAIDOFF'])
np.set_printoptions(precision=2)

print (classification_report(y_test, yhat))

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['COLLECTION', 'PAIDOFF'],normalize= False,  title='Confusion matrix')

### Jaccard index

In [None]:
# `sklearn.metrics.jaccard_similarity_score` no longer exists in current
# scikit-learn releases, so importing it raises ImportError. `jaccard_score`
# is its replacement.
from sklearn.metrics import jaccard_score


def jaccard_similarity_score(y_true, y_pred, pos_label='PAIDOFF'):
    """Jaccard index of `pos_label` between true and predicted labels.

    Thin wrapper that keeps the name used by the cells further down in this
    notebook while delegating to `sklearn.metrics.jaccard_score`.

    NOTE: the removed function reported plain accuracy for binary targets,
    whereas this is the actual Jaccard index of the positive class, so the
    numbers differ from results produced with old scikit-learn versions.
    """
    return jaccard_score(y_true, y_pred, pos_label=pos_label)


jaccard_similarity_score(y_test, yhat)

### F1-score

In [None]:
from sklearn.metrics import f1_score
# F1 of the KNN predictions, averaged over the classes with average='weighted'.
f1_score(y_test, yhat, average='weighted') 

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Entropy-based decision tree limited to depth 5.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features randomly at each split; without a seed the fitted
# tree, and every score derived from it, can change between runs.
loan_tree = DecisionTreeClassifier(criterion="entropy", max_depth = 5, random_state = 4)
loan_tree

In [None]:
# Fit the tree on the training split.
loan_tree.fit(X_train, y_train)

In [None]:
# Tree predictions for the held-out split.
pred_tree = loan_tree.predict(X_test)

In [None]:
# First five predictions next to the first five true labels.
print (pred_tree[0:5])
print (y_test[0:5])

## Evaluation

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Share of held-out samples the tree classifies correctly.
tree_accuracy = metrics.accuracy_score(y_test, pred_tree)
print("DecisionTrees's Accuracy: ", tree_accuracy)

### Confusion matrix

In [None]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, pred_tree, labels=['COLLECTION', 'PAIDOFF'])
np.set_printoptions(precision=2)

print (classification_report(y_test, pred_tree))

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['COLLECTION', 'PAIDOFF'],normalize= False,  title='Confusion matrix')

### Jaccard index

In [None]:
# Jaccard score of the decision-tree predictions on the held-out split.
jaccard_similarity_score(y_test, pred_tree)

### F1-score

In [None]:
# F1 of the decision-tree predictions, averaged over the classes with average='weighted'.
f1_score(y_test, pred_tree, average='weighted') 

## Support Vector Machine

In [None]:
from sklearn import svm
# Support vector classifier with an RBF kernel; note that the model is stored under the name `loan`.
loan = svm.SVC(kernel='rbf')
loan.fit(X_train, y_train) 

In [None]:
# SVM predictions for the held-out split; show the first five.
y_svm = loan.predict(X_test)
y_svm[0:5]

### Confusion matrix

In [None]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_svm, labels=['COLLECTION', 'PAIDOFF'])
np.set_printoptions(precision=2)

print (classification_report(y_test, y_svm))

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['COLLECTION', 'PAIDOFF'],normalize= False,  title='Confusion matrix')

### Jaccard index

In [None]:
# Jaccard score of the SVM predictions on the held-out split.
jaccard_similarity_score(y_test, y_svm)

### F1-score

In [None]:
# F1 of the SVM predictions, averaged over the classes with average='weighted'.
f1_score(y_test, y_svm, average='weighted') 

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with C=0.01 and the liblinear solver, fitted on the training split.
LR = LogisticRegression(C=0.01, solver='liblinear')
LR.fit(X_train, y_train)
LR

In [None]:
# Predicted class labels for the held-out split.
y_LR = LR.predict(X_test)

In [None]:
# Predicted class probabilities for the held-out split (used for the log loss below).
y_LR_prob = LR.predict_proba(X_test)

### Confusion matrix

In [None]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_LR, labels=['COLLECTION', 'PAIDOFF'])
np.set_printoptions(precision=2)

print (classification_report(y_test, y_LR))

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['COLLECTION', 'PAIDOFF'],normalize= False,  title='Confusion matrix')

### Jaccard index

In [None]:
# Jaccard score of the logistic-regression predictions on the held-out split.
jaccard_similarity_score(y_test, y_LR)

## F1-score

In [None]:
# F1 of the logistic-regression predictions, averaged over the classes with average='weighted'.
f1_score(y_test, y_LR, average='weighted') 

## log loss

In [None]:
from sklearn.metrics import log_loss
# Log loss is computed from the predicted probabilities, not from the hard labels.
log_loss(y_test, y_LR_prob)

## Model Evaluation using Test set

In [None]:
# Download the separate evaluation data and store it in the working directory as loan_test.csv.
!wget -O loan_test.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv

In [None]:
# Load the downloaded evaluation CSV and preview the first rows.
df_test = pd.read_csv('loan_test.csv')
df_test.head()

In [None]:
# Apply the same preprocessing steps that were used for the training frame.

# convert to date time object
df_test['effective_date'] = pd.to_datetime(df_test['effective_date'])
df_test['due_date'] = pd.to_datetime(df_test['due_date'])

# Weekday of the effective date and the Friday-to-Sunday flag (dayofweek > 3).
df_test['dayofweek'] = df_test['effective_date'].dt.dayofweek
df_test['weekend'] = df_test['dayofweek'].apply(lambda x: 1 if (x > 3) else 0)

# Encode gender (male -> 0, female -> 1) by assigning the result back to the
# column. Calling replace(..., inplace=True) on the selection df_test['Gender']
# is deprecated chained assignment and does not modify df_test once pandas
# Copy-on-Write is active. Values missing from the mapping are kept unchanged.
gender_codes = {'male': 0, 'female': 1}
df_test['Gender'] = df_test['Gender'].map(lambda g: gender_codes.get(g, g))

Feature_test = df_test[['Principal','terms','age','Gender','weekend']]

Feature_test = pd.concat([Feature_test, pd.get_dummies(df_test['education'])], axis = 1)

# get_dummies only creates columns for the education levels present in this
# file. Align to the training feature columns so the models receive the same
# columns in the same order: missing indicators become 0, unseen ones are dropped.
Feature_test = Feature_test.reindex(columns=Feature.columns, fill_value=0)

Feature_test.head()

In [None]:
# Standardise the evaluation features with the mean/variance of the training
# features (`Feature`), not with statistics estimated from the evaluation file
# itself: the models must see evaluation data on the same scale as the data
# they were fitted on.
X_testset = preprocessing.StandardScaler().fit(Feature).transform(Feature_test)
X_testset[0:5]

In [None]:
# True labels of the evaluation file.
y_testset = df_test['loan_status'].values
y_testset[0:5]

## KNN Evaluation

In [None]:
# Score the KNN model on the evaluation file.
yhat_test = neigh.predict(X_testset)
knn_jaccard = jaccard_similarity_score(y_testset, yhat_test)
knn_f1 = f1_score(y_testset, yhat_test, average='weighted')
print('KNN Jaccard index: {}'.format(knn_jaccard))
print('KNN F1-score: {}'.format(knn_f1))

## Decision Tree Evaluation

In [None]:
# Score the decision tree on the evaluation file.
tree_test = loan_tree.predict(X_testset)
tree_jaccard = jaccard_similarity_score(y_testset, tree_test)
tree_f1 = f1_score(y_testset, tree_test, average='weighted')
print('Decision Tree Jaccard index: {:.2f}'.format(tree_jaccard))
print('Decision Tree F1-score: {:.2f}'.format(tree_f1))

## SVM Evaluation

In [None]:
# Score the SVM on the evaluation file.
# `loan` is the fitted svm.SVC model. This cell previously predicted with
# `loan_tree`, so the "SVM" numbers were just the decision-tree scores again.
svm_test = loan.predict(X_testset)
print('SVM Jaccard index: {:.2f}'.format(jaccard_similarity_score(y_testset, svm_test)))
print('SVM F1-score: {:.2f}'.format(f1_score(y_testset, svm_test, average='weighted')))

## Logistic Regression Evaluation

In [None]:
# Score the logistic regression on the evaluation file.
LR_test = LR.predict(X_testset)
# Log loss needs the predicted probabilities rather than the hard labels.
LR_test_prob = LR.predict_proba(X_testset)
print('LR Jaccard index: {:.2f}'.format(jaccard_similarity_score(y_testset, LR_test)))
print('LR F1-score: {:.2f}'.format(f1_score(y_testset, LR_test, average='weighted')))
# This line reports the log loss; it used to be mislabelled as a second F1-score.
print('LR LogLoss: {:.2f}'.format(log_loss(y_testset, LR_test_prob)))

## Report