In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# import usual suspects

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
# import other useful stuff

from IPython.display import display
from sklearn.metrics import mutual_info_score

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [6]:
!wget https://github.com/KursadE/zoomcamp-HW7/blob/main/traindata_creditcard.csv

--2021-11-01 10:31:07--  https://github.com/KursadE/zoomcamp-HW7/blob/main/traindata_creditcard.csv
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘traindata_creditcard.csv.1’

traindata_creditcar     [ <=>                ] 131,42K  --.-KB/s    in 0,07s   

2021-11-01 10:31:09 (1,93 MB/s) - ‘traindata_creditcard.csv.1’ saved [134572]



In [7]:
# put data into data frame
df = pd.read_csv("traindata_creditcard.csv")

In [8]:
df.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


# Data Preparation and Data Cleaning

In [None]:
# check column data types

df.dtypes

In [None]:
# lower case column names

df.columns = df.columns.str.lower()

In [None]:
# define categorical and numerical columns

categorical = ['gender', 'region_code', 'occupation', 'channel_code', 'credit_product', 'is_active']
numerical = ['age', 'vintage', 'avg_account_balance']

In [None]:
# lower case and eliminate space in data records

for col in df[categorical].columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [None]:
# check null values

df.isnull().sum()

In [None]:
# assign new category 'Unk' to null values

df['credit_product'] = df['credit_product'].fillna('Unk')

In [None]:
# check unique entries in categorical features

for col in df[categorical].columns:
    print(col)
    print(df[col].unique())
    print(df[col].nunique())
    print()

In [None]:
# check number of unique entries represented in data

for col in df[categorical].columns:
    print(col)
    print(df[col].value_counts())
    print()

# Train, Validation, Test Split

In [None]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [None]:
df_full_train.head()

In [None]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
y_train = df_train['is_lead']
y_val = df_val['is_lead']
y_test = df_test['is_lead']

In [None]:
y_full_train = df_full_train['is_lead']
del df_full_train['is_lead']

del df_test['is_lead']

# EDA & Feature Importance Analysis

In [None]:
# run EDA on train set
# check distribution of target - 'is_lead'
# it seems to be an unbalanced dataset

df_train['is_lead'].value_counts(normalize=False)

In [None]:
# calculate the weight of target

is_lead_overall = df_train['is_lead'].mean()
is_lead_overall

In [None]:
# check categories' target distribution to gain insight about data
# gender and occupation can be mildly important
# whereas some of the regions seem important
# as well as channel code  
# and not having a credit product

for c in categorical:
    df_group = df_train.groupby(c)['is_lead'].agg(['mean', 'count'])
    df_group['opportunity'] = df_group['mean'] / is_lead_overall
    display(df_group.sort_values(by='mean', ascending=False))
    print()
    print()

In [None]:
# check mutual_info_score

def mutual_info_lead_score(series):
    return mutual_info_score(series, df_train['is_lead'])

In [None]:
# having a credit product and channel code are important features
# though region itself do not seem to matter 

mi = df_train[categorical].apply(mutual_info_lead_score)
mi.sort_values(ascending=False)

In [None]:
# check distribution of numerical values

df_train[numerical].describe()

In [None]:
sns.histplot(data=df_train, x="age")

In [None]:
sns.histplot(data=df_train, x="vintage")

In [None]:
sns.histplot(data=df_train, x="avg_account_balance")

In [None]:
# there is correlation between 'age' and 'vintage'
# yet these are different features
# therefore, there is no need to eliminate one of the features

sns.heatmap(df_train[numerical].corr(),annot = True)

In [None]:
# all numerical features seem to have effect on target

df_train[numerical].corrwith(df_train['is_lead']).abs()

In [None]:
# eliminate less important features and redefine categorical columns

categorical = ['occupation', 'channel_code', 'credit_product']

In [None]:
# make baseline prediction with Logistic Regression to decide to continue feature engineering

# define train function
def train(df, y, C=1.0):
    dicts = df[train_columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='lbfgs', C=C, max_iter=1000)
    model.fit(X_train, y)
    
    return dv, model

In [None]:
# define train function

def predict(df, dv, model):
    dicts = df[train_columns].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [None]:
# make baseline prediction & return roc_auc_score

train_columns = categorical + numerical
dv, model = train(df_train, y_train)
y_pred = predict(df_val, dv, model)
#accuracy_score(y_val, y_pred >= 0.5)
roc_auc_score(y_val, y_pred)

In [None]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)

In [None]:
# poor performance of the base model

plt.figure(figsize=(5, 5))

plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

# Feature Engineering

In [None]:
# group 'age' under 'age_cat' feature
# group 'vintage' under 'vintage_cat' feature
# group 'avg_account_balance' under 'avg_account_balance_cat' feature

def cat(df):
    df['age_cat'] = pd.cut(df['age'],bins=[20,29,39,49,59,69,79,89],
       labels=['20-29','30-39','40-49','50-59','60-69','70-79','80-89'])
    df['vintage_cat'] = pd.cut(df['vintage'],bins=[0,19,39,59,79,99,119,139],
       labels=['0-19','20-39','40-59','60-79','80-99','100-119','120-139'])
    df['avg_account_balance_cat'] = pd.qcut(df['avg_account_balance'], q=10, 
                                                  labels=['0','1','2','3','4','5','6','7','8','9'])

In [None]:
cat(df_train)
cat(df_val)
cat(df_test)
cat(df_full_train)

In [None]:
# redefine categorical columns

categorical = ['occupation', 'channel_code', 'credit_product', 'age_cat','vintage_cat','avg_account_balance_cat']

In [None]:
mi = df_train[categorical].apply(mutual_info_lead_score)
mi.sort_values(ascending=False)

In [None]:
categorical = ['occupation', 'channel_code', 'credit_product', 'age_cat','vintage_cat']

In [None]:
# make new prediction with new categorical features 
# to see the effect of feature engineering

train_columns = categorical
dv, model = train(df_train, y_train)
y_pred = predict(df_val, dv, model)
#accuracy_score(y_val, y_pred >= 0.5)
roc_auc_score(y_val, y_pred) # roc_auc_score improved significantly

In [None]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)

In [None]:
# display improvement with new plot

plt.figure(figsize=(5, 5))

plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

# Train Another Model - Decision Tree

In [None]:
train_columns=categorical
train_dicts = df_train[train_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [None]:
val_dicts = df_val[train_columns].to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)

y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

In [None]:
#from sklearn.tree import export_text
#print(export_text(dt, feature_names=dv.get_feature_names()))

# Fine Tune Models

In [None]:
# Fine tune Logistic Regression parameter C

train_columns=categorical
C_scores = []

for C in [0.01, 0.1, 1, 10]:

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred).round(4)
    C_scores.append((C, auc))

In [None]:
# choose max auc, if equal choose min C

columns = ['C', 'auc']
df_C_scores = pd.DataFrame(C_scores, columns=columns)
df_C_scores.sort_values(["auc"], ascending = (False))

In [None]:
# Fine tune Decision Tree parameters
# Start with 'Tree Depth'

depths = [1, 2, 3, 4, 5, 6, 10, 15, 20, None]

for depth in depths: 
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_train, y_train)
    
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    print('%4s -> %.3f' % (depth, auc))

In [None]:
# Decide 'Tree Depth' and 'Min Leaf Samples'

scores = []

for depth in [5, 6, 10, 15]: # choose max auc
    for s in [1, 5, 10, 15, 20, 100, 200, 500]:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=s)
        dt.fit(X_train, y_train)

        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        
        scores.append((depth, s, auc))

In [None]:
columns = ['max_depth', 'min_samples_leaf', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
df_scores_pivot = df_scores.pivot(index='min_samples_leaf', columns='max_depth', values=['auc'])
df_scores_pivot.round(3) # max_depth = 10 with 20 samples in leaf is choosen

# Selecting Decision Tree as Best Model

In [None]:
def train(df, y, max_depth, min_samples_leaf):
    dicts = df[train_columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    model.fit(X_train, y)
    
    return dv, model

In [None]:
def predict(df, dv, model):
    dicts = df[train_columns].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [None]:
# decision tree with max_depth = 10 and 20 samples in leaf is chosen

train_columns=categorical
max_depth = 10
min_samples_leaf = 20


full_train_dicts = df_full_train[train_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(full_train_dicts)

test_dicts = df_test[train_columns].to_dict(orient='records')
X_test = dv.transform(test_dicts)


dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
dt.fit(X_full_train, y_full_train)
    
y_pred = dt.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
#y_pred
auc

In [None]:
train_columns = categorical
max_depth = 10
min_samples_leaf = 20

dv, model = train(df_full_train, y_full_train, max_depth, min_samples_leaf)
y_pred = predict(df_test, dv, model)
#accuracy_score(y_val, y_pred >= 0.5)
roc_auc_score(y_test, y_pred) # roc_auc_score improved significantly

# Save the Model

In [None]:
output_file = f'tree_model_depth={max_depth}.bin'
output_file

In [None]:
import pickle
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)