In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from collections import Counter
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [2]:
dataset_train = pd.read_csv('train_set.csv', delimiter=',')
dataset_train.head(5)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


In [4]:
dataset_test = pd.read_csv('test_set.csv', delimiter=',')
dataset_test.head(5)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,25318,51,housemaid,married,unknown,no,174,no,no,telephone,29,jul,308,3,-1,0,unknown
1,25319,32,management,married,tertiary,no,6059,yes,no,cellular,20,nov,110,2,-1,0,unknown
2,25320,60,retired,married,primary,no,0,no,no,telephone,30,jul,130,3,-1,0,unknown
3,25321,32,student,single,tertiary,no,64,no,no,cellular,30,jun,598,4,105,5,failure
4,25322,41,housemaid,married,secondary,no,0,yes,yes,cellular,15,jul,368,4,-1,0,unknown


In [5]:
dataset_1 = dataset_train
dataset_2 = dataset_test

In [6]:
dataset_1.dtypes

ID            int64
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [11]:
lh = LabelEncoder()
dataset_train_set = dataset_1
for col in dataset_train_set.columns[dataset_train_set.dtypes == 'object']:
    dataset_train_set[col] = lh.fit_transform(dataset_train_set[col])
dataset_train_set.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,4,1,2,0,291,1,0,2,9,8,150,2,-1,0,3,0
1,2,42,9,0,0,0,5076,1,0,0,7,0,99,1,251,2,1,0
2,3,47,0,1,1,0,104,1,1,0,14,5,77,2,-1,0,3,0
3,4,28,4,2,1,0,-994,1,1,0,18,5,174,2,-1,0,3,0
4,5,42,9,0,1,0,2974,1,0,2,21,8,187,5,-1,0,3,0


In [13]:
dataset_train_set.describe()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0
mean,12659.0,40.935379,4.330687,1.167555,1.226291,0.017696,1357.555082,0.553778,0.160327,0.640163,15.835289,5.523166,257.732393,2.77205,40.248766,0.591737,2.558399,0.116957
std,7308.532719,10.634289,3.269565,0.608091,0.750483,0.131845,2999.822811,0.497109,0.366916,0.897537,8.31948,3.010054,256.975151,3.136097,100.213541,2.568313,0.989615,0.321375
min,1.0,18.0,0.0,0.0,0.0,0.0,-8019.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0
25%,6330.0,33.0,1.0,1.0,1.0,0.0,73.0,0.0,0.0,0.0,8.0,3.0,103.0,1.0,-1.0,0.0,3.0,0.0
50%,12659.0,39.0,4.0,1.0,1.0,0.0,448.0,1.0,0.0,0.0,16.0,6.0,181.0,2.0,-1.0,0.0,3.0,0.0
75%,18988.0,48.0,7.0,2.0,2.0,0.0,1435.0,1.0,0.0,2.0,21.0,8.0,317.0,3.0,-1.0,0.0,3.0,0.0
max,25317.0,95.0,11.0,2.0,3.0,1.0,102127.0,1.0,1.0,2.0,31.0,11.0,3881.0,55.0,854.0,275.0,3.0,1.0


In [12]:
dataset_test_set = dataset_2
for col in dataset_test_set.columns[dataset_test_set.dtypes == 'object']:
    dataset_test_set[col] = lh.fit_transform(dataset_test_set[col])
dataset_test_set.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,25318,51,3,1,3,0,174,0,0,1,29,5,308,3,-1,0,3
1,25319,32,4,1,2,0,6059,1,0,0,20,9,110,2,-1,0,3
2,25320,60,5,1,0,0,0,0,0,1,30,5,130,3,-1,0,3
3,25321,32,8,2,2,0,64,0,0,0,30,6,598,4,105,5,0
4,25322,41,3,1,1,0,0,1,1,0,15,5,368,4,-1,0,3


In [14]:
dataset_test_set.describe()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
count,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0
mean,30743.5,41.040638,4.37265,1.164947,1.226502,0.018153,1393.157298,0.547272,0.160984,0.637486,15.680151,5.509399,257.206137,2.770365,39.954755,0.546443,2.558975
std,3132.846895,10.652369,3.265972,0.602941,0.74899,0.133512,3318.497101,0.497783,0.367533,0.897054,8.302317,3.009708,250.480906,3.063481,99.524056,1.805938,0.993169
min,25318.0,18.0,0.0,0.0,0.0,0.0,-2604.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.0,0.0
25%,28030.75,33.0,1.0,1.0,1.0,0.0,72.0,0.0,0.0,0.0,8.0,3.0,102.0,1.0,-1.0,0.0,3.0
50%,30743.5,39.0,4.0,1.0,1.0,0.0,450.0,1.0,0.0,0.0,16.0,6.0,181.0,2.0,-1.0,0.0,3.0
75%,33456.25,49.0,7.0,2.0,2.0,0.0,1440.25,1.0,0.0,2.0,21.0,8.0,322.0,3.0,-1.0,0.0,3.0
max,36169.0,94.0,11.0,2.0,3.0,1.0,81204.0,1.0,1.0,2.0,31.0,11.0,3102.0,58.0,871.0,51.0,3.0


In [24]:
train_set_x = dataset_train_set[dataset_train_set.columns[1:17]]
train_set_y = dataset_train_set['y']
train_set_id = dataset_train_set['ID']

In [31]:
test_set_x = dataset_test_set[dataset_test_set.columns[1:17]]
test_set_id = dataset_test_set['ID']

In [43]:
dataset_all = pd.concat([train_set_x, test_set_x])
ss = StandardScaler()
dataset_all[dataset_all.columns] = ss.fit_transform(dataset_all[dataset_all.columns])
dataset_all.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,0.191084,-0.105027,-0.27496,1.031508,-0.134747,-0.347627,0.901202,-0.437287,1.516256,-0.816503,0.824275,-0.421799,-0.247734,-0.411587,-0.244403,0.445592
1,0.097094,1.424748,-1.923669,-1.635108,-0.134747,1.196503,0.901202,-0.437287,-0.712483,-1.05705,-1.833644,-0.62177,-0.568823,2.1083,0.601067,-1.573274
2,0.56704,-1.328848,-0.27496,-0.3018,-0.134747,-0.407972,0.901202,2.28683,-0.712483,-0.215138,-0.172444,-0.708032,-0.247734,-0.411587,-0.244403,0.445592
3,-1.218753,-0.105027,1.37375,-0.3018,-0.134747,-0.762299,0.901202,2.28683,-0.712483,0.265955,-0.172444,-0.327695,-0.247734,-0.411587,-0.244403,0.445592
4,0.097094,1.424748,-1.923669,-0.3018,-0.134747,0.518183,0.901202,-0.437287,1.516256,0.626774,0.824275,-0.276722,0.715532,-0.411587,-0.244403,0.445592


In [49]:
train_x = dataset_all.iloc[:25317,:]
test_x = dataset_all.iloc[25317:,:]

In [50]:
x_train, x_val, y_train, y_val = train_test_split(train_x, train_set_y, test_size=0.2, random_state=7)

In [53]:
params={
    'n_estimators':100,
    'max_depth':11,
    'n_jobs':3,
    'random_state':7
}
rf_model = RandomForestClassifier(**params)
rf_model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=3,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [56]:
val_pred = rf_model.predict(x_val)
roc_auc_score(val_pred, y_val)

0.8039308275626338

In [58]:
test_pred = rf_model.predict_proba(test_x)
test_pred

array([[0.94060397, 0.05939603],
       [0.97411951, 0.02588049],
       [0.98667627, 0.01332373],
       ...,
       [0.99489043, 0.00510957],
       [0.98131253, 0.01868747],
       [0.87757937, 0.12242063]])

In [68]:
result_rf_pred = pd.DataFrame(dataset_test_set.iloc[:,:].ID, columns=['ID'])
result_rf_pred['pred'] = test_pred[:,1]
result_rf_pred.to_csv('result_rf_20190327_standard.csv', index=False)

In [73]:
params_deci = {
    'max_depth':13,
    'criterion':'gini',
    'min_samples_leaf':100,
    'random_state':7
}
params_ada = {
    'base_estimator':DecisionTreeClassifier(**params_deci),
    'n_estimators':300,
    'learning_rate':0.005,
    'random_state':7
}
adaboost_model = AdaBoostClassifier(**params_ada)
adaboost_model.fit(x_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=7,
            splitter='best'),
          learning_rate=0.005, n_estimators=300, random_state=7)

In [75]:
val_ada_pred = adaboost_model.predict(x_val)
roc_auc_score(val_ada_pred, y_val)

0.7995833635901533

In [76]:
test_ada_pred = adaboost_model.predict_proba(test_x)
test_ada_pred

array([[9.98728534e-01, 1.27146551e-03],
       [9.99975933e-01, 2.40672464e-05],
       [9.99999495e-01, 5.04873259e-07],
       ...,
       [9.99954721e-01, 4.52794254e-05],
       [9.99999376e-01, 6.23952313e-07],
       [9.74059195e-01, 2.59408048e-02]])

In [77]:
result_ada_pred = pd.DataFrame(dataset_test_set.iloc[:,:].ID, columns=['ID'])
result_ada_pred['pred'] = test_ada_pred[:,1]
result_ada_pred.to_csv('result_adaboost_20190327_standard.csv', index=False)