In [1]:
# Automatically reload imported modules when their source changes, so edits to
# local helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [25]:
import pandas as pd
import numpy as np
from IPython.display import display             # Allows the use of display() for DataFrames

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.combine import SMOTEENN, _smote_enn
from imblearn.pipeline import pipeline
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import DataConversionWarning 
warnings.filterwarnings(action='ignore',category=DataConversionWarning) #to supress dataconversion warnings from scaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import joblib

# Import the classes
from libs.utils import *

from imblearn.pipeline import make_pipeline

In [3]:
# Load the three raw tables (demographics, loan performance, previous loans).
# Each file's first row is used as the header.
traindemog_df, trainperf_df, trainprev_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        "./data/traindemographics.csv",
        "./data/trainperf.csv",
        "./data/trainprevloans.csv",
    )
)

In [4]:
# Keep a single demographics row per customerid and discard the
# bank_name_clients column.
# (Disabled alternative: keep the column and replace the spaces in each bank
#  name with underscores.)
traindemog_df = (
    traindemog_df
    .drop_duplicates(subset=['customerid'])
    .drop(columns=['bank_name_clients'])
)

# Left-join demographics onto performance so that every performance row is
# kept, whether or not a demographics record exists for the customer.
merged_df = trainperf_df.merge(traindemog_df, on=['customerid'], how='left')

In [5]:
# (name, transformer) pairs computed from the previous-loans table.
# The step names are reused below as the column labels of the resulting frame,
# so names and columns cannot drift apart.
prev_feature_steps = [
    ('prev_termdaysmean', GetMean('termdays')),
    ('prev_daysearlymean', DaydeltaTransformer('firstduedate', 'firstrepaiddate', 'daysearly')),
    ('prev_dayslatemean', DaydeltaTransformer('firstrepaiddate', 'firstduedate', 'dayslate')),
    ('prev_loanamountmean', GetMean('loanamount')),
    ('prev_wait_timemean', TimedeltaTransformer('approveddate', 'creationdate', 'b4approval')),
    ('prev_referredcount', GetUnique('referredby')),
    ('customerid', GetUid('customerid')),
]
trainprev_features = FeatureUnion(prev_feature_steps)

# One column per union step, in step order.
trainprev_df_ = pd.DataFrame(
    data=trainprev_features.fit_transform(trainprev_df),
    columns=[step_name for step_name, _ in prev_feature_steps],
)

In [6]:
# Attach the previous-loan features to the merged frame (left join, so every
# row of merged_df is kept), then cast the new feature columns to float64.
prev_feature_cols = [
    'prev_termdaysmean',
    'prev_daysearlymean',
    'prev_dayslatemean',
    'prev_loanamountmean',
    'prev_wait_timemean',
    'prev_referredcount',
]
merged_df = (
    merged_df
    .merge(trainprev_df_, on=['customerid'], how='left')
    .astype({col: 'float64' for col in prev_feature_cols})
)

In [7]:
# Imputation pipeline, fitted on the full merged dataset.
# NOTE(review): every StatFillNa step receives the target column
# 'good_bad_flag' — confirm how the helper uses it, since filling features
# from label-dependent statistics before the train/test split can leak
# information.
mean_imputed_cols = [
    'prev_termdaysmean',
    'prev_daysearlymean',
    'prev_dayslatemean',
    'prev_loanamountmean',
    'prev_wait_timemean',
    'prev_referredcount',
]
subpipe = make_pipeline(
    AgeYears('birthdate', 'approveddate', 'age_years'),
    *(StatFillNa(col, 'good_bad_flag', 'Mean') for col in mean_imputed_cols),
    StatFillNa('age_years', 'good_bad_flag', 'Median'),
)
merged_df = subpipe.fit_transform(merged_df)

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
from sklearn.svm import SVC

In [50]:
# Columns handed to ColumnDropTransformer: identifiers, the target itself, and
# features that are not fed to the model.
to_drop = [
    'customerid',
    'good_bad_flag',
    'systemloanid',
    'totaldue',
    'referredby',
    'bank_branch_clients',
    'level_of_education_clients',
    'longitude_gps',
    'latitude_gps',
]

# Columns handed to Encoder ('bank_name_clients' is currently excluded).
categorical_features = [
    'bank_account_type',
    'employment_status_clients',
]

In [51]:
# Preprocessing steps applied, in order, to the merged frame.
# Disabled steps kept for reference:
#   - VarFillNa('bank_name_clients', 'Unknown')   (after the other VarFillNa steps)
#   - SMOTEENN(random_state=42, ratio=0.8)        (before the scaler)
#   - GradientBoostingClassifier(random_state=42) (as the final estimator)
mainpipe_steps = [
    ReferredTransformer('referredby', 'referred_status'),
    ApprovalPeriod('approveddate', 'creationdate', 'b4approval_sec'),
    VarFillNa('bank_account_type', 'Other'),
    # NOTE(review): the fill value ' Unknown' starts with a space — confirm
    # whether that is intentional.
    VarFillNa('employment_status_clients', ' Unknown'),
    Encoder(categorical_features),
    ColumnDropTransformer(to_drop),
    StandardScaler(),
]
mainpipe = make_pipeline(*mainpipe_steps)

In [58]:
# save the prepared data to test with tensorflow
np.savetxt('./data/ytest.csv', y_test.values, delimiter=',')
np.savetxt('./data/ytrain.csv', y_train.values, delimiter=',')

In [43]:
y = merged_df['good_bad_flag'] # Target class column
X = mainpipe.fit_transform(merged_df)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(*shuffle(X, (y=='Good').astype(int)),
                                                     test_size=0.30, random_state=42)

In [59]:
# save the prepared data to test with tensorflow
np.savetxt('./data/xtrain.csv', X_train,delimiter=',')
np.savetxt('./data/xtest.csv', X_test, delimiter=',')

In [None]:
#clfModel = mainpipe.fit(X_train, y_train)

In [36]:
predictions = clfModel.predict(X_test)
trainingtest = clfModel.predict(X_train)

In [37]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

In [38]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predictions))

# Flatten the 2x2 confusion matrix into its four counts.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')

# Accuracy on both splits, plus F1 on the held-out split.
print('Training Accuracy score:', accuracy_score(y_train, trainingtest))
print('Test Accuracy score:', accuracy_score(y_test, predictions))
print('Test F1 score:', f1_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.49      0.26      0.34       281
           1       0.82      0.92      0.87      1030

   micro avg       0.78      0.78      0.78      1311
   macro avg       0.65      0.59      0.61      1311
weighted avg       0.75      0.78      0.76      1311

tn: 74, fp: 207, fn: 78, tp: 952
Training Accuracy score: 0.8475629702322538
Test Accuracy score: 0.782608695652174
Test F1 score: 0.8698035632709


In [None]:
#clfModel.named_steps

In [12]:
# Disabled experiment: the whole cell is a single string literal, so none of
# the code inside it runs; the cell only evaluates to (and displays) the text.
"""
mainpipe = Pipeline([
    #('referred_status', ReferredTransformer('referredby', 'referred_status')),
    #('b4approval_sec', ApprovalPeriod('approveddate','creationdate','b4approval_sec')),
    #('bank_account', VarFillNa('bank_account_type', 'Other')),
    #('employment', VarFillNa('employment_status_clients',' Unknown')),
    #('bank_name', VarFillNa('bank_name_clients', 'Unknown')),
    ('age_years', StatFillNa('age_years', 'good_bad_flag', 'Median')),
    #('encoder', Encoder(categorical_features)),
    ('drop', ColumnDropTransformer(to_drop)),
    #('smote', SMOTE(random_state=42, ratio = 0.9)),
    #('enn', EditedNearestNeighbours(random_state=42)),
    #('scaler', StandardScaler()),
    #('clf', GradientBoostingClassifier(n_estimators=500, random_state=42))
])
mainpipe.fit_transform(merged_df).info()
#clfModel
"""

"\nmainpipe = Pipeline([\n    #('referred_status', ReferredTransformer('referredby', 'referred_status')),\n    #('b4approval_sec', ApprovalPeriod('approveddate','creationdate','b4approval_sec')),\n    #('bank_account', VarFillNa('bank_account_type', 'Other')),\n    #('employment', VarFillNa('employment_status_clients',' Unknown')),\n    #('bank_name', VarFillNa('bank_name_clients', 'Unknown')),\n    ('age_years', StatFillNa('age_years', 'good_bad_flag', 'Median')),\n    #('encoder', Encoder(categorical_features)),\n    ('drop', ColumnDropTransformer(to_drop)),\n    #('smote', SMOTE(random_state=42, ratio = 0.9)),\n    #('enn', EditedNearestNeighbours(random_state=42)),\n    #('scaler', StandardScaler()),\n    #('clf', GradientBoostingClassifier(n_estimators=500, random_state=42))\n])\nmainpipe.fit_transform(merged_df).info()\n#clfModel\n"