In [480]:
import pandas as pd
import random
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [351]:
import warnings
warnings.filterwarnings("ignore")

## Read data

In [352]:
train = pd.read_csv('hw3_train.csv')
test = pd.read_csv('hw3_test_data.csv')
test_sample = pd.read_csv('hw3_test_sample.csv')

In [353]:
print('train: {}'.format(train.shape))
print('test: {}'.format(train.shape))
print('test_sample: {}'.format(train.shape))

train: (7500, 29)
test: (7500, 29)
test_sample: (7500, 29)


In [355]:
train.head()

Unnamed: 0,customer_id,customer_residence_code,gender,age,is_new_customer,seniority_month,customer_type,customer_relation_type,residence_same_as_bank,country_same_as_bank,...,use_payroll_account,use_short_deposits,use_medium_deposits,use_long_deposits,use_e_account,use_loans,use_taxes,use_credit_card,use_pensions,use_direct_debit
0,107620,1,2,47,0.0,36,1.0,0,2,0,...,1,0,0,0,0,0,0,0,1.0,0
1,107775,1,2,52,0.0,36,1.0,0,2,0,...,0,0,0,1,1,0,0,1,0.0,0
2,112208,1,0,24,0.0,36,1.0,1,2,2,...,0,0,0,0,0,0,0,0,0.0,0
3,112270,1,0,26,0.0,36,1.0,1,2,0,...,0,0,0,0,0,0,0,0,0.0,0
4,112332,1,0,26,0.0,36,1.0,1,2,0,...,0,0,0,0,0,0,0,0,0.0,0


In [356]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 29 columns):
customer_id                7500 non-null int64
customer_residence_code    7500 non-null int64
gender                     7500 non-null int64
age                        7500 non-null int64
is_new_customer            7500 non-null float64
seniority_month            7500 non-null int64
customer_type              7500 non-null float64
customer_relation_type     7500 non-null int64
residence_same_as_bank     7500 non-null int64
country_same_as_bank       7500 non-null int64
join_channel               7500 non-null int64
adress_type                7500 non-null float64
activity_index             7500 non-null float64
household_income           7500 non-null float64
segment                    7500 non-null int64
use_savings                7500 non-null int64
use_guarantees             7500 non-null int64
use_current_accounts       7500 non-null int64
use_derivada_account       7500 non-n

## Preprocess data

In [357]:
def convert_age(age):
    if   age < 18: return 1
    elif age < 30 and age >= 18: return 2
    elif age < 50 and age >= 30: return 3
    elif age < 70 and age >= 50: return 4
    else: return 5

In [396]:
def preproc_func(data):
    temp = data.copy()
    
    #temp = temp[temp['gender']!=1]
    temp['age_cat']   = temp['age'].apply(convert_age)
    
    temp['age_cat_1'] = temp['age_cat'].apply(lambda x: 1 if x==1 else 0)
    temp['age_cat_2'] = temp['age_cat'].apply(lambda x: 1 if x==2 else 0)
    temp['age_cat_3'] = temp['age_cat'].apply(lambda x: 1 if x==3 else 0)
    temp['age_cat_4'] = temp['age_cat'].apply(lambda x: 1 if x==4 else 0)
    temp['age_cat_5'] = temp['age_cat'].apply(lambda x: 1 if x==5 else 0)
    
    temp['channel_cat_10']    = temp['join_channel'].apply(lambda x: 1 if x==10 else 0)
    temp['channel_cat_6']     = temp['join_channel'].apply(lambda x: 1 if x==6 else 0)
    temp['channel_cat_1']     = temp['join_channel'].apply(lambda x: 1 if x==1 else 0)
    temp['channel_cat_5']     = temp['join_channel'].apply(lambda x: 1 if x==5 else 0)
    temp['channel_cat_9']     = temp['join_channel'].apply(lambda x: 1 if x==9 else 0)
    temp['channel_cat_other'] = temp['join_channel'].apply(lambda x: 1 if x not in [10,6,1,5,9] else 0)
    
    temp['segm_cat_0'] = temp['segment'].apply(lambda x: 1 if x==0 else 0)
    temp['segm_cat_1'] = temp['segment'].apply(lambda x: 1 if x==1 else 0)
    temp['segm_cat_2'] = temp['segment'].apply(lambda x: 1 if x==2 else 0)
    
    temp['activity_index'] = temp['activity_index'].astype(int)
    cols_to_drop = ['customer_residence_code', 'residence_same_as_bank', 
                    'country_same_as_bank', 'adress_type', 'use_savings', 
                    'use_guarantees', 'use_derivada_account', 'use_short_deposits', 
                    'use_medium_deposits', 'use_loans', 'age_cat', 'age', 
                    'is_new_customer', 'customer_type', 'join_channel', 'segment']
    
    temp.drop(cols_to_drop, axis=1, inplace=True)
    
    return temp

In [443]:
prepr_train_1 = train[train['gender']!=1]

In [446]:
prepr_train = preproc_func(prepr_train_1.copy()).reset_index(drop=True)

In [447]:
prepr_train.head()

Unnamed: 0,customer_id,gender,seniority_month,customer_relation_type,activity_index,household_income,use_current_accounts,use_payroll_account,use_long_deposits,use_e_account,...,age_cat_5,channel_cat_10,channel_cat_6,channel_cat_1,channel_cat_5,channel_cat_9,channel_cat_other,segm_cat_0,segm_cat_1,segm_cat_2
0,107620,2,36,0,1,47164.86,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,107775,2,36,0,1,68262.93,1,0,1,1,...,0,0,1,0,0,0,0,1,0,0
2,112208,0,36,1,0,47451.24,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,112270,0,36,1,0,19386.48,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,112332,0,36,1,0,102460.89,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1


## Train_test_split

In [361]:
X = prepr_train.drop(['customer_id', 'use_direct_debit'], axis=1)
y = prepr_train['use_direct_debit']

In [362]:
X.shape

(7442, 26)

In [363]:
def train_test_split(X, y, test_size, random_state=1):
    
    random_gen = np.random.RandomState(random_state)
    size = X.shape[0]
    batch_size = round(size*test_size)
    
    rand_indices = list(random_gen.choice(size, batch_size))
   
    return X.drop(rand_indices, axis=0), X.loc[rand_indices], y.drop(rand_indices, axis=0), y.loc[rand_indices]

In [364]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=1)

In [478]:
print('train: {}'.format(X_train.shape))
print('valid: {}'.format(X_valid.shape))

train: (5530, 26)
valid: (2233, 26)


## Train

In [437]:
tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=9, criterion='gini', class_weight={0: 0.3, 1:0.8})

In [463]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.8}, criterion='gini',
            max_depth=4, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [465]:
pred_train = tree.predict(X_train)
pred_valid = tree.predict(X_valid)

In [466]:
print('train precision {}'.format(round(precision_score(y_train, pred_train), 3)))
print('valid precision {}'.format(round(precision_score(y_valid, pred_valid), 3)))

train precision 0.649
valid precision 0.671


In [467]:
print('train recall {}'.format(round(recall_score(y_train, pred_train), 3)))
print('valid recall {}'.format(round(recall_score(y_valid, pred_valid), 3)))

train recall 0.425
valid recall 0.398


In [468]:
print('train f1_score {}'.format(round(f1_score(y_train, pred_train), 3)))
print('valid f1_score {}'.format(round(f1_score(y_valid, pred_valid), 3)))

train f1_score 0.513
valid f1_score 0.5


In [485]:
export_graphviz(tree).replace('\\n','''
''')

'digraph Tree {\nnode [shape=box] ;\n0 [label="X[0] <= 0.5\ngini = 0.376\nsamples = 5530\nvalue = [1473.9, 493.6]"] ;\n1 [label="gini = 0.276\nsamples = 5126\nvalue = [1431.3, 284.0]"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="gini = 0.281\nsamples = 404\nvalue = [42.6, 209.6]"] ;\n0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;\n}'

In [483]:
with open("tree_baseline.txt", "w") as f:
    f = export_graphviz(tree, out_file=f)

In [488]:
!dot -Tpng tree_baseline.txt -o tree_baseline.png

In [490]:
for x in range(1,10):
    tree = DecisionTreeClassifier(max_depth=x, min_samples_leaf=2, criterion='gini', class_weight={0: 0.3, 1:0.8})
    tree.fit(X_train, y_train)
    pred_train = tree.predict(X_train)
    pred_valid = tree.predict(X_valid)
    print(x)
    print('train f1_score {}'.format(round(f1_score(y_train, pred_train), 3)))
    print('valid f1_score {}'.format(round(f1_score(y_valid, pred_valid), 3)))
    print('-----------')

1
train f1_score 0.513
valid f1_score 0.5
-----------
2
train f1_score 0.513
valid f1_score 0.5
-----------
3
train f1_score 0.513
valid f1_score 0.486
-----------
4
train f1_score 0.539
valid f1_score 0.518
-----------
5
train f1_score 0.557
valid f1_score 0.53
-----------
6
train f1_score 0.562
valid f1_score 0.525
-----------
7
train f1_score 0.588
valid f1_score 0.555
-----------
8
train f1_score 0.609
valid f1_score 0.568
-----------
9
train f1_score 0.627
valid f1_score 0.537
-----------


In [492]:
scores = tree.feature_importances_ 

In [493]:
feature_imp = []
for score, name in zip(scores, X_train.columns):
        feature_imp.append((score, name))

In [494]:
sorted(feature_imp, key=lambda x: x[0], reverse=True)

[(0.4820209329556004, 'use_payroll_account'),
 (0.21557371904621736, 'customer_relation_type'),
 (0.057281741791799234, 'household_income'),
 (0.050380137917419146, 'age_cat_3'),
 (0.03884650266847925, 'seniority_month'),
 (0.02535809156479439, 'activity_index'),
 (0.023834619970986667, 'use_credit_card'),
 (0.018357303867311096, 'use_current_accounts'),
 (0.016718882920289373, 'use_pensions'),
 (0.016544766236973405, 'age_cat_2'),
 (0.011863070291505476, 'channel_cat_10'),
 (0.011253147020408461, 'use_e_account'),
 (0.007830789521856485, 'use_taxes'),
 (0.007526909495751394, 'use_long_deposits'),
 (0.0035471251686299992, 'channel_cat_other'),
 (0.003146814912896446, 'segm_cat_0'),
 (0.002419268982540582, 'age_cat_4'),
 (0.0019720941168559956, 'segm_cat_1'),
 (0.0017258226836002525, 'channel_cat_5'),
 (0.0015340592826463373, 'channel_cat_6'),
 (0.0015277400702282354, 'gender'),
 (0.0007364595132100482, 'channel_cat_1'),
 (0.0, 'age_cat_1'),
 (0.0, 'age_cat_5'),
 (0.0, 'channel_cat_9'),

## Feature importance

In [449]:
scores = tree.feature_importances_ 

In [451]:
feature_imp = []
for score, name in zip(scores, X_train.columns):
        feature_imp.append((score, name))

In [454]:
sorted(feature_imp, key=lambda x: x[0], reverse=True)

[(0.6028896606287074, 'use_payroll_account'),
 (0.2696297140444509, 'customer_relation_type'),
 (0.06363302774762708, 'age_cat_3'),
 (0.019392519752389058, 'age_cat_2'),
 (0.016418072919044964, 'use_credit_card'),
 (0.012394122506281613, 'use_current_accounts'),
 (0.007411761270165454, 'seniority_month'),
 (0.004659190361123825, 'activity_index'),
 (0.0016890451074769062, 'use_e_account'),
 (0.0015709470502900007, 'household_income'),
 (0.00031193861244275395, 'age_cat_4'),
 (0.0, 'gender'),
 (0.0, 'use_long_deposits'),
 (0.0, 'use_taxes'),
 (0.0, 'use_pensions'),
 (0.0, 'age_cat_1'),
 (0.0, 'age_cat_5'),
 (0.0, 'channel_cat_10'),
 (0.0, 'channel_cat_6'),
 (0.0, 'channel_cat_1'),
 (0.0, 'channel_cat_5'),
 (0.0, 'channel_cat_9'),
 (0.0, 'channel_cat_other'),
 (0.0, 'segm_cat_0'),
 (0.0, 'segm_cat_1'),
 (0.0, 'segm_cat_2')]

## Custom KFold and GridSearch

In [367]:
idx = list(range(X.shape[0]))
random.shuffle(idx)

In [368]:
params = {'max_depth': [2,3,4,5,6,7,8],
          'min_samples_leaf': [2,4,5,7,9,11,13,15],
          'criterion': ['gini', 'entropy'],
          'class_weight': [{0: 1, 1:2}, {0: 0.2, 1:0.8}, {0: 0.4, 1:0.6}]   
}

In [369]:
dict_of_params = {}
for k, v in params.items():
    dict_of_params[k] = np.random.choice(params[k])
    
dict_of_params

{'max_depth': 7,
 'min_samples_leaf': 2,
 'criterion': 'gini',
 'class_weight': {0: 0.4, 1: 0.6}}

## Predict

In [469]:
prepr_test = preproc_func(test.copy())

In [470]:
X_test = prepr_test.drop(['customer_id'], axis=1).reset_index(drop=True)

In [471]:
X_test_tmp  = X_test[['use_payroll_account']]

In [473]:
pred_test_tmp = tree.predict(X_test_tmp)

In [475]:
test_sample['use_direct_debit']= pred_test_tmp

In [476]:
test_sample.head()

Unnamed: 0,customer_id,use_direct_debit
0,107651,0
1,112177,0
2,112239,0
3,112301,0
4,112363,0


In [412]:
test_sample.to_csv('submission_2.csv', index=None)