# Inspired from Kaggle : AMOL DESHMUKH

In [14]:
#### Dealing with Warnings ####
import warnings
warnings.filterwarnings('ignore')

In [15]:
#### Dependencies ####
import numpy as np
import pandas as pd
from math import ceil
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,f1_score,confusion_matrix
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA,LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [16]:
train_loan_data = pd.read_csv('Data/train.csv').drop('Id',axis=1)
test_loan_data = pd.read_csv('Data/test.csv').drop('Id',axis=1)

In [17]:
train_loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [18]:
#### Report on Missing Values ####
def missing_values_report(data,train=False):
    features = data.columns
    if train :
        print('----Missing Values in Training Data----')
    else:
        print('----Missing Values in Test Data----')
    for feature in features:
        if data[feature].dtype in ['int64','float64']:
            missing_values = data[feature].isnull().sum()
            if missing_values > 0:
                print(f'{feature} : {missing_values}')
        else:
            # dealing with str input
            missing_values = data[feature].isnull().sum() + data[data[feature] == ""].shape[0]
            if missing_values > 0:
                print(f'{feature} : {missing_values}')
        
missing_values_report(train_loan_data,train=True)
missing_values_report(test_loan_data)

----Missing Values in Training Data----
Annual Income : 1557
Years in current job : 371
Months since last delinquent : 4081
Bankruptcies : 14
Credit Score : 1557
----Missing Values in Test Data----
Annual Income : 513
Years in current job : 86
Months since last delinquent : 1358
Bankruptcies : 3
Credit Score : 513


# Dealing with missing values :
    - Years in current job : if nan then -1, if < 1 year then 0, if 10+ then 10
    - Annual Income : if nan then if years in current job != 0 then median income else 0
    - Months since last delinquent : if nan then median
    - Bankruptcies : if nan then median of people with the same number of credit default
    - Credit Score : if nan then median

In [19]:
### Years in current job ###
train_loan_data['Years in current job'].unique()
years_dict = {
    '-1' : -1,
    '< 1 year' : 0,
    '1 year' : 1,
    '2 years' : 2,
    '3 years' : 3,
    '4 years' : 4,
    '5 years' : 5,
    '6 years' : 6,
    '7 years' : 7,
    '8 years' : 8,
    '9 years' : 9,
    '10+ years' : 10,
}

train_loan_data['Years in current job'] = train_loan_data['Years in current job'].fillna('-1').map(years_dict)
test_loan_data['Years in current job'] = test_loan_data['Years in current job'].fillna('-1').map(years_dict)

In [20]:
### Annual Income ###
train_loan_data['Annual Income'][train_loan_data['Years in current job']==-1] = train_loan_data['Annual Income'][train_loan_data['Years in current job']==-1].fillna(0)
income_med = train_loan_data['Annual Income'].dropna().median()
train_loan_data['Annual Income'][train_loan_data['Years in current job']!=-1] = train_loan_data['Annual Income'][train_loan_data['Years in current job']!=-1].fillna(income_med)
train_loan_data.head()

test_loan_data['Annual Income'][test_loan_data['Years in current job']==-1] = test_loan_data['Annual Income'][test_loan_data['Years in current job']==-1].fillna(0)
income_med = test_loan_data['Annual Income'].dropna().median()
test_loan_data['Annual Income'][test_loan_data['Years in current job']!=-1] = test_loan_data['Annual Income'][test_loan_data['Years in current job']!=-1].fillna(income_med)

In [21]:
### Months Since Last Deliquent ###
delinquent_med = train_loan_data['Months since last delinquent'].median()
train_loan_data['Months since last delinquent'] = train_loan_data['Months since last delinquent'].fillna(delinquent_med)

delinquent_med = test_loan_data['Months since last delinquent'].median()
test_loan_data['Months since last delinquent'] = test_loan_data['Months since last delinquent'].fillna(delinquent_med)

In [22]:
train_loan_data['Number of Credit Problems'].corr(train_loan_data['Bankruptcies'])

0.730750619475721

High correlation between number of credit pbs and bankruptcies

In [23]:
credit_pbs_to_bankrupt_dict = { i : ceil(train_loan_data['Bankruptcies'].dropna()[train_loan_data['Number of Credit Problems']==i].median()) for i in range(8)}
train_loan_data['Bankruptcies'] = train_loan_data['Bankruptcies'].fillna(train_loan_data['Number of Credit Problems'].map(credit_pbs_to_bankrupt_dict))

credit_pbs_to_bankrupt_dict = { i : ceil(test_loan_data['Bankruptcies'].dropna()[test_loan_data['Number of Credit Problems']==i].median()) for i in [0,1,2,3,4,5,7]}
test_loan_data['Bankruptcies'] = test_loan_data['Bankruptcies'].fillna(test_loan_data['Number of Credit Problems'].map(credit_pbs_to_bankrupt_dict))

In [24]:
### Credit Score ###
credit_med = train_loan_data['Credit Score'].median()
train_loan_data['Credit Score'] = train_loan_data['Credit Score'].fillna(credit_med)

credit_med = test_loan_data['Credit Score'].median()
test_loan_data['Credit Score'] = test_loan_data['Credit Score'].fillna(credit_med)

In [25]:
### Testing results ###
missing_values_report(train_loan_data,True)
missing_values_report(test_loan_data)

----Missing Values in Training Data----
----Missing Values in Test Data----


In [26]:
train_loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int64  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

# Categorical Variables encoding
    - Nominal variables : Home Ownership, Purpose and Term (dummy encoding)

In [27]:
#### Train Test Split ####
num_features = train_loan_data.drop('Credit Default',axis=1).select_dtypes(include=np.number).columns
cat_features = train_loan_data.select_dtypes(include=['object']).columns
X = train_loan_data.drop('Credit Default',axis=1)
y = train_loan_data['Credit Default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [28]:
#### Transformation Pipelines ####
numeric_transformer = Pipeline(steps=[
    ('scaler',StandardScaler()),
    #('pca', PCA(n_components=100))
])

categorical_transformer = Pipeline(steps=[
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

In [29]:
#### Naive Classifier #####
### Put everything to 1
basic_tp = y_test.value_counts().get(1,1)
basic_fn = 0
basic_fp = y_test.value_counts().get(1,0)
basic_recall = basic_tp / (basic_tp + basic_fn)
basic_precision = basic_tp / (basic_tp + basic_fp)
basic_f1 = 2 / (1/basic_recall + 1/basic_precision)
print("Basic Classifier : ",basic_f1)

Basic Classifier :  0.6666666666666666


In [30]:
#### Logistic Regression ####
pipe = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',LogisticRegression())
])

model = pipe.fit(X_train,y_train)
y_pred = model.predict(X_test)
target_names = y_test.unique().astype(str)

In [31]:
print(classification_report(y_test, y_pred, target_names=target_names))
print(f1_score(y_true=y_test, y_pred=y_pred))
print(pd.DataFrame(confusion_matrix(y_true=y_test,y_pred=y_pred)))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86      1613
           1       0.79      0.25      0.39       637

    accuracy                           0.77      2250
   macro avg       0.78      0.61      0.62      2250
weighted avg       0.78      0.77      0.72      2250

0.38525564803804996
      0    1
0  1571   42
1   475  162


In [32]:
submission_pred = pd.DataFrame({
    'Id' : range(7500,10000),
    'Credit Default' : model.predict(test_loan_data)
})


# Write the DataFrame to a CSV file
submission_pred.to_csv('Data/LogRegSubmission.csv', index=False)

Worst result than Naive Classifier, needs improvments ...

In [43]:
#### Trying various models ####
results = pd.DataFrame(columns=['Name','f1','f1_test','StdDev(%)','Time(s)'])
for model in [
    LogisticRegression, 
    GaussianNB,
    KNeighborsClassifier,
    SVC,
    RandomForestClassifier, 
    XGBClassifier,
    LDA,
    QDA
]:
    pipe = make_pipeline(preprocessor, model())
    start_time = time()
    kfold = StratifiedKFold(n_splits=4)
    scores = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=kfold)
    pipe.fit(X_train, y_train)
    scores_test = f1_score(y_true=y_test, y_pred=pipe.predict(X_test))
    time_mod = time() - start_time
    new_row = pd.DataFrame([{
        'Name' : model.__name__, 
        'f1' : round(scores.mean(), 4), 
        'f1_test' : round(scores_test, 4), 
        'StdDev(%)' : round(1e2*scores.std(), 2), 
        'Time(s)': round(time_mod, 2)
    }])
    results = pd.concat([results,new_row],ignore_index=True)
    del pipe
    print('Analyzed {}.'.format(model.__name__))
print('Done!')

results = results.sort_values('f1', ascending=False).reset_index(drop=True)

Analyzed LogisticRegression.
Analyzed GaussianNB.
Analyzed KNeighborsClassifier.
Analyzed SVC.
Analyzed RandomForestClassifier.
Analyzed XGBClassifier.
Analyzed LinearDiscriminantAnalysis.
Analyzed QuadraticDiscriminantAnalysis.
Done!


In [44]:
results

Unnamed: 0,Name,f1,f1_test,StdDev(%),Time(s)
0,GaussianNB,0.4864,0.4982,0.89,0.07
1,XGBClassifier,0.4753,0.4813,3.46,0.55
2,QuadraticDiscriminantAnalysis,0.4656,0.4746,2.27,0.1
3,RandomForestClassifier,0.4304,0.4181,3.46,2.92
4,KNeighborsClassifier,0.423,0.43,1.8,0.17
5,LogisticRegression,0.4017,0.3853,2.64,0.18
6,LinearDiscriminantAnalysis,0.3499,0.3363,1.85,0.15
7,SVC,0.3163,0.3095,2.16,3.5


GNB seems to outstand other algorithms

In [45]:
#### Trying GaussianNB ####
GNB = GaussianNB()
pipe = make_pipeline(preprocessor,GNB)
pipe.fit(X_train,y_train)
preds = pipe.predict(X_test)
print(f1_score(y_true=y_test,y_pred=preds))

submission_pred = pd.DataFrame({
    'Id' : range(7500,10000),
    'Credit Default' : pipe.predict(test_loan_data)
})


# Write the DataFrame to a CSV file
submission_pred.to_csv('Data/GNBSubmission.csv', index=False)

0.49817739975698666


# Examination of the f1_score : 
    - Imbalanced dataset : around 28% of positive cases (q)
    - Comparison to random model (flip coin with proba p of positive pred) :
        f_1 = 2*q*p / (q+p) which is max for p = 1

In [49]:
q = train_loan_data['Credit Default'].value_counts().get(1,1) / len(train_loan_data)
p = 1
f1_coin = 2 * q * p / (q + p)
print('Share of positive default : ',q)
print('Max F1_score with random model : ',f1_coin)

Share of positive default :  0.28173333333333334
Max F1_score with random model :  0.4396130240299594
