In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [2]:
loan = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/loan_train.csv',
                  parse_dates=['due_date','effective_date'])

In [3]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346 entries, 0 to 345
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Unnamed: 0      346 non-null    int64         
 1   Unnamed: 0.1    346 non-null    int64         
 2   loan_status     346 non-null    object        
 3   Principal       346 non-null    int64         
 4   terms           346 non-null    int64         
 5   effective_date  346 non-null    datetime64[ns]
 6   due_date        346 non-null    datetime64[ns]
 7   age             346 non-null    int64         
 8   education       346 non-null    object        
 9   Gender          346 non-null    object        
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 27.2+ KB


In [4]:
loan['loan_status'].value_counts()

PAIDOFF       260
COLLECTION     86
Name: loan_status, dtype: int64

In [5]:
loan.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male
3,4,4,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female
4,6,6,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male


## Weekend

In [6]:
loan['dayofweek'] = loan['effective_date'].dt.dayofweek

In [7]:
loan['weekend'] = loan['dayofweek'].apply(lambda x : 1 if x>3 else 0)

In [8]:
loan['Gender'].replace(['male','female'], value=[0,1], inplace=True)

## Feature

In [9]:
feature = loan[['Principal','terms','age','Gender','weekend']]

### Hot Encoding Education

In [10]:
feature = pd.concat([feature, pd.get_dummies(loan['education'])],axis=1)
feature.drop('Master or Above', axis=1, inplace=True)

In [11]:
X = feature

In [12]:
y = loan['loan_status']

## Normalize Data

In [13]:
X = preprocessing.StandardScaler().fit(X).transform(X)

# Model

### KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (276, 8) (276,)
Test set: (70, 8) (70,)


In [16]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
    print('K:{},mean_acc:{}'.format(n,mean_acc[n-1]))

K:1,mean_acc:0.6428571428571429
K:2,mean_acc:0.5857142857142857
K:3,mean_acc:0.7428571428571429
K:4,mean_acc:0.7
K:5,mean_acc:0.7428571428571429
K:6,mean_acc:0.7142857142857143
K:7,mean_acc:0.8
K:8,mean_acc:0.7571428571428571
K:9,mean_acc:0.7428571428571429


#### best classifier N = 7

### Evaluation

In [23]:
neigh = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
yhat = neigh.predict(X_test)

In [24]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.36363636363636365
0.7741935483870968


In [25]:
f1_score(y_test, yhat, average = 'weighted')

0.8

# Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
for n in range(1,10):
    loan_tree = DecisionTreeClassifier(criterion='entropy', max_depth=n).fit(X_train, y_train)
    loan_pred = loan_tree.predict(X_test)
    print('K:{}--->{}'.format(n,accuracy_score(y_test, loan_pred)))

K:1--->0.7857142857142857
K:2--->0.7857142857142857
K:3--->0.6142857142857143
K:4--->0.6142857142857143
K:5--->0.6428571428571429
K:6--->0.7714285714285715
K:7--->0.7571428571428571
K:8--->0.7571428571428571
K:9--->0.6571428571428571


### best depth = 1 or 2

### Evaluation

In [68]:
loan_tree = DecisionTreeClassifier(criterion='entropy', max_depth=2).fit(X_train, y_train)
loan_pred = loan_tree.predict(X_test)

In [69]:
print(jaccard_score(y_test, loan_pred,pos_label='COLLECTION'))
print(jaccard_score(y_test, loan_pred,pos_label='PAIDOFF'))

0.0
0.7857142857142857


In [70]:
f1_score(y_test, loan_pred, average='weighted')

0.6914285714285714

# SVM

In [71]:
from sklearn import svm

In [72]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC()

In [73]:
yhat = clf.predict(X_test)

### evaluation

In [74]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.18181818181818182
0.7272727272727273


In [75]:
f1_score(y_test, yhat, average='weighted')

0.7275882012724117

# Logistic Regression

In [109]:
from sklearn.linear_model import LogisticRegression
c = [100, 10, 1, 0.1, 0.01]

for k in c:
    LR = LogisticRegression(C=k, solver='liblinear').fit(X_train, y_train)
    yhat = LR.predict(X_test)
    print('C:{},accuracy:{}'.format(k,accuracy_score(y_test, yhat)))

C:100,accuracy:0.7142857142857143
C:10,accuracy:0.7142857142857143
C:1,accuracy:0.7142857142857143
C:0.1,accuracy:0.7428571428571429
C:0.01,accuracy:0.6857142857142857


### best C=0.1

In [114]:
LR = LogisticRegression(C=0.1, solver='liblinear').fit(X_train, y_train)
yhat = LR.predict(X_test)
yhat_prob = LR.predict_proba(X_test)
yhat_prob[0:5]

array([[0.51024982, 0.48975018],
       [0.40295067, 0.59704933],
       [0.0979815 , 0.9020185 ],
       [0.12802981, 0.87197019],
       [0.09898222, 0.90101778]])

### evaluation

In [115]:
print(jaccard_score(y_test, yhat, pos_label='COLLECTION'))
print(jaccard_score(y_test, yhat, pos_label='PAIDOFF'))

0.1
0.7352941176470589


In [116]:
f1_score(y_test, yhat, average = 'weighted')

0.7048206031256878