In [108]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [109]:
loan = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/loan_train.csv',
                  parse_dates=['due_date','effective_date'])

In [110]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346 entries, 0 to 345
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Unnamed: 0      346 non-null    int64         
 1   Unnamed: 0.1    346 non-null    int64         
 2   loan_status     346 non-null    object        
 3   Principal       346 non-null    int64         
 4   terms           346 non-null    int64         
 5   effective_date  346 non-null    datetime64[ns]
 6   due_date        346 non-null    datetime64[ns]
 7   age             346 non-null    int64         
 8   education       346 non-null    object        
 9   Gender          346 non-null    object        
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 27.2+ KB


In [111]:
loan['loan_status'].value_counts()

PAIDOFF       260
COLLECTION     86
Name: loan_status, dtype: int64

In [112]:
loan.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male
3,4,4,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female
4,6,6,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male


## Weekend

In [113]:
loan['dayofweek'] = loan['effective_date'].dt.dayofweek

In [114]:
loan['weekend'] = loan['dayofweek'].apply(lambda x : 1 if x>3 else 0)

In [115]:
loan['Gender'].replace(['male','female'], value=[0,1], inplace=True)

## Feature

In [116]:
feature = loan[['Principal','terms','age','Gender','weekend']]

### Hot Encoding Education

In [117]:
feature = pd.concat([feature, pd.get_dummies(loan['education'])],axis=1)
feature.drop('Master or Above', axis=1, inplace=True)

In [118]:
X = feature

In [119]:
y = loan['loan_status']

## Normalize Data

In [120]:
X = preprocessing.StandardScaler().fit(X).transform(X)

# Model

### KNN

In [121]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (276, 8) (276,)
Test set: (70, 8) (70,)


In [123]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
mean_acc

array([0.64285714, 0.58571429, 0.74285714, 0.7       , 0.74285714,
       0.71428571, 0.8       , 0.75714286, 0.74285714])

#### best classifier N = 6

### Evaluation

In [124]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.14285714285714285
0.7313432835820896


In [125]:
f1_score(y_test, yhat, average = 'weighted')

0.7173645320197043

# Decision Tree

In [126]:
from sklearn.tree import DecisionTreeClassifier

In [127]:
loan_tree = DecisionTreeClassifier(criterion='entropy', max_depth=4)
loan_tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [128]:
loan_pred = loan_tree.predict(X_test)

### Evaluation

In [129]:
print(jaccard_score(y_test, loan_pred,pos_label='COLLECTION'))
print(jaccard_score(y_test, loan_pred,pos_label='PAIDOFF'))

0.20588235294117646
0.5714285714285714


In [130]:
f1_score(y_test, loan_pred, average='weighted')

0.6445993031358885

# SVM

In [131]:
from sklearn import svm

In [132]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC()

In [133]:
yhat = clf.predict(X_test)

### evaluation

In [134]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.18181818181818182
0.7272727272727273


In [135]:
f1_score(y_test, yhat, average='weighted')

0.7275882012724117

# Logistic Regression

In [136]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
LR

LogisticRegression(C=0.01, solver='liblinear')

In [137]:
yhat = LR.predict(X_test)
yhat[0:5]

array(['COLLECTION', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF'],
      dtype=object)

In [138]:
yhat_prob = LR.predict_proba(X_test)
yhat_prob[0:5]

array([[0.5034238 , 0.4965762 ],
       [0.45206111, 0.54793889],
       [0.30814132, 0.69185868],
       [0.34259428, 0.65740572],
       [0.32025894, 0.67974106]])

### evaluation

In [139]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.08333333333333333
0.6764705882352942


In [140]:
f1_score(y_test, yhat, average = 'weighted')

0.6670522459996144