In [42]:
import pandas as pd
import random
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [6]:
import warnings
warnings.filterwarnings("ignore")

## Read data

In [7]:
train = pd.read_csv('hw3_train.csv')
test = pd.read_csv('hw3_test_data.csv')
test_sample = pd.read_csv('hw3_test_sample.csv')

In [8]:
print('train: {}'.format(train.shape))
print('test: {}'.format(train.shape))
print('test_sample: {}'.format(train.shape))

train: (7500, 29)
test: (7500, 29)
test_sample: (7500, 29)


In [9]:
train.head()

Unnamed: 0,customer_id,customer_residence_code,gender,age,is_new_customer,seniority_month,customer_type,customer_relation_type,residence_same_as_bank,country_same_as_bank,...,use_payroll_account,use_short_deposits,use_medium_deposits,use_long_deposits,use_e_account,use_loans,use_taxes,use_credit_card,use_pensions,use_direct_debit
0,107620,1,2,47,0.0,36,1.0,0,2,0,...,1,0,0,0,0,0,0,0,1.0,0
1,107775,1,2,52,0.0,36,1.0,0,2,0,...,0,0,0,1,1,0,0,1,0.0,0
2,112208,1,0,24,0.0,36,1.0,1,2,2,...,0,0,0,0,0,0,0,0,0.0,0
3,112270,1,0,26,0.0,36,1.0,1,2,0,...,0,0,0,0,0,0,0,0,0.0,0
4,112332,1,0,26,0.0,36,1.0,1,2,0,...,0,0,0,0,0,0,0,0,0.0,0


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 29 columns):
customer_id                7500 non-null int64
customer_residence_code    7500 non-null int64
gender                     7500 non-null int64
age                        7500 non-null int64
is_new_customer            7500 non-null float64
seniority_month            7500 non-null int64
customer_type              7500 non-null float64
customer_relation_type     7500 non-null int64
residence_same_as_bank     7500 non-null int64
country_same_as_bank       7500 non-null int64
join_channel               7500 non-null int64
adress_type                7500 non-null float64
activity_index             7500 non-null float64
household_income           7500 non-null float64
segment                    7500 non-null int64
use_savings                7500 non-null int64
use_guarantees             7500 non-null int64
use_current_accounts       7500 non-null int64
use_derivada_account       7500 non-n

## Preprocess data

In [11]:
def convert_age(age):
    if   age < 18: return 1
    elif age < 30 and age >= 18: return 2
    elif age < 50 and age >= 30: return 3
    elif age < 70 and age >= 50: return 4
    else: return 5

In [12]:
def preproc_func(data):
    temp = data.copy()
    
    #temp = temp[temp['gender']!=1]
    temp['age_cat']   = temp['age'].apply(convert_age)
    
    temp['age_cat_1'] = temp['age_cat'].apply(lambda x: 1 if x==1 else 0)
    temp['age_cat_2'] = temp['age_cat'].apply(lambda x: 1 if x==2 else 0)
    temp['age_cat_3'] = temp['age_cat'].apply(lambda x: 1 if x==3 else 0)
    temp['age_cat_4'] = temp['age_cat'].apply(lambda x: 1 if x==4 else 0)
    temp['age_cat_5'] = temp['age_cat'].apply(lambda x: 1 if x==5 else 0)
    
    temp['channel_cat_10']    = temp['join_channel'].apply(lambda x: 1 if x==10 else 0)
    temp['channel_cat_6']     = temp['join_channel'].apply(lambda x: 1 if x==6 else 0)
    temp['channel_cat_1']     = temp['join_channel'].apply(lambda x: 1 if x==1 else 0)
    temp['channel_cat_5']     = temp['join_channel'].apply(lambda x: 1 if x==5 else 0)
    temp['channel_cat_9']     = temp['join_channel'].apply(lambda x: 1 if x==9 else 0)
    temp['channel_cat_other'] = temp['join_channel'].apply(lambda x: 1 if x not in [10,6,1,5,9] else 0)
    
    temp['segm_cat_0'] = temp['segment'].apply(lambda x: 1 if x==0 else 0)
    temp['segm_cat_1'] = temp['segment'].apply(lambda x: 1 if x==1 else 0)
    temp['segm_cat_2'] = temp['segment'].apply(lambda x: 1 if x==2 else 0)
    
    temp['activity_index'] = temp['activity_index'].astype(int)
    cols_to_drop = ['customer_residence_code', 'residence_same_as_bank', 
                    'country_same_as_bank', 'adress_type', 'use_savings', 
                    'use_guarantees', 'use_derivada_account', 'use_short_deposits', 
                    'use_medium_deposits', 'use_loans', 'age_cat', 'age', 
                    'is_new_customer', 'customer_type', 'join_channel', 'segment']
    
    temp.drop(cols_to_drop, axis=1, inplace=True)
    
    return temp

In [13]:
prepr_train_1 = train[train['gender']!=1]

In [14]:
prepr_train = preproc_func(prepr_train_1.copy()).reset_index(drop=True)

In [15]:
prepr_train.head()

Unnamed: 0,customer_id,gender,seniority_month,customer_relation_type,activity_index,household_income,use_current_accounts,use_payroll_account,use_long_deposits,use_e_account,...,age_cat_5,channel_cat_10,channel_cat_6,channel_cat_1,channel_cat_5,channel_cat_9,channel_cat_other,segm_cat_0,segm_cat_1,segm_cat_2
0,107620,2,36,0,1,47164.86,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,107775,2,36,0,1,68262.93,1,0,1,1,...,0,0,1,0,0,0,0,1,0,0
2,112208,0,36,1,0,47451.24,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,112270,0,36,1,0,19386.48,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,112332,0,36,1,0,102460.89,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1


## Train_test_split

In [16]:
X = prepr_train.drop(['customer_id', 'use_direct_debit'], axis=1)
y = prepr_train['use_direct_debit']

In [17]:
X.shape

(7442, 26)

In [30]:
def train_test_split(X, y, test_size, random_state=1):
    
    random_gen = np.random.RandomState(random_state)
    size = X.shape[0]
    batch_size = round(size*test_size)
    
    rand_indices = list(random_gen.choice(size, batch_size))
   
    return X.drop(rand_indices, axis=0), X.loc[rand_indices], y.drop(rand_indices, axis=0), y.loc[rand_indices]

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=1)

In [32]:
print('train: {}'.format(X_train.shape))
print('valid: {}'.format(X_valid.shape))

train: (5530, 26)
valid: (2233, 26)


## Train

In [4]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import randint as sp_randint
from scipy.stats import lognorm as sp_lognorm

In [3]:
model = Pipeline([
    ('clf', LogisticRegression())
])

In [21]:
param_grid = {
    'clf__penalty': ['l1', 'l2'],
    'clf__random_state': [42],
    'clf__C': sp_lognorm(4)
}

print(param_grid)

{'clf__penalty': ['l1', 'l2'], 'clf__random_state': [42], 'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10c58ce48>}


In [23]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

In [33]:
# Рассмотрим 20 случайных комбинаций
random_searcher = RandomizedSearchCV(model, param_grid, n_iter=20, 
                                     random_state=42,
                                     scoring='roc_auc', 
                                     n_jobs=-1, cv=cv, 
                                     verbose=2)

random_searcher.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.1s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,
          param_distributions={'clf__penalty': ['l1', 'l2'], 'clf__random_state': [42], 'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10c58ce48>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [34]:
random_searcher.best_params_

{'clf__C': 1.5584478513027344, 'clf__penalty': 'l1', 'clf__random_state': 42}

In [35]:
random_searcher.best_score_

0.9165341811142412

In [36]:
results = pd.DataFrame(random_searcher.cv_results_)

In [37]:
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__penalty,param_clf__random_state,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.847162,0.11542,0.005557,0.003459,7.29257,l1,42,"{'clf__C': 7.292574305740818, 'clf__penalty': ...",0.921869,0.928912,...,0.915774,0.011827,8,0.91819,0.917154,0.925447,0.922695,0.918987,0.920495,0.003102
1,0.055064,0.00304,0.003669,0.000187,0.575189,l2,42,"{'clf__C': 0.5751886502026291, 'clf__penalty':...",0.925815,0.928716,...,0.916392,0.01239,4,0.91789,0.917153,0.925419,0.922645,0.918911,0.920404,0.00314
2,0.039283,0.006967,0.005029,0.001113,0.0117076,l1,42,"{'clf__C': 0.011707560231951408, 'clf__penalty...",0.911483,0.925315,...,0.903358,0.015179,19,0.901558,0.897603,0.908245,0.907326,0.902987,0.903544,0.003898
3,0.458864,0.093747,0.005061,0.001731,3.58088,l1,42,"{'clf__C': 3.5808805675530544, 'clf__penalty':...",0.925307,0.928929,...,0.916439,0.012257,3,0.918142,0.917112,0.925424,0.922738,0.918951,0.920473,0.003121
4,0.09643,0.007779,0.005055,0.003197,553.826,l2,42,"{'clf__C': 553.8263899146515, 'clf__penalty': ...",0.921894,0.928896,...,0.915787,0.011826,5,0.91821,0.917205,0.92549,0.922706,0.919012,0.920525,0.003101


In [38]:
best_model = random_searcher.best_estimator_

In [40]:
pred_valid = best_model.predict_proba(X_valid)

In [44]:
roc_auc_score(y_valid, pred_valid[:, 1])

0.9147888445905511

## Predict

In [46]:
prepr_test = preproc_func(test.copy())

In [47]:
X_test = prepr_test.drop(['customer_id'], axis=1).reset_index(drop=True)

In [49]:
pred_test = best_model.predict_proba(X_test)

In [50]:
test_sample['use_direct_debit'] = pred_test[:, 1]

In [51]:
test_sample.head()

Unnamed: 0,customer_id,use_direct_debit
0,107651,0.319974
1,112177,0.003063
2,112239,0.002199
3,112301,0.096057
4,112363,0.002222


In [52]:
test_sample.to_csv('submission_1.csv', index=None)