In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training set and the Kaggle test set.
# test.csv is read without a header row and borrows the training column names.
# NOTE(review): presumably test.csv has no target column, so 'y' ends up NaN for
# those rows — the later isnull()/notnull() split relies on that; confirm.
train_df = pd.read_csv("train.csv", sep=";")
test_df = pd.read_csv("test.csv", sep=";", header=None, names=train_df.columns.values)

# Stack both sets so that cleaning and encoding are applied identically to each.
# DataFrame.append was deprecated and then removed from pandas; pd.concat is the
# equivalent call (same row order, original index labels kept).
all_data_df = pd.concat([train_df, test_df])

In [3]:
## Cleaning: these categorical columns mark missing values with the literal
## string "unknown". Replace it with the most frequent *known* category.
for column_to_cure in ["job", "marital", "education", "loan", "housing", "default"]:
    column = all_data_df[column_to_cure]
    # Exclude "unknown" when looking for the mode; otherwise a column dominated
    # by "unknown" would be "imputed" with itself.
    known_modes = column[column != "unknown"].mode()
    if known_modes.empty:
        # No known value to impute from: leave the column as it is.
        continue
    most_common = known_modes.iloc[0]
    # Assign the result back instead of calling replace(..., inplace=True) on the
    # selected column: the in-place form acts on an intermediate object and is
    # not guaranteed to update all_data_df.
    all_data_df[column_to_cure] = column.replace("unknown", most_common)

In [4]:
## Encode the target: "yes" -> 1.0, "no" -> 0.0 (missing targets stay NaN).
## The result is assigned back rather than replaced in place on the selected
## column, and cast to float explicitly so that 'y' can never be mistaken for a
## string column by the dtype-based one-hot encoding further down. The cast also
## fails loudly if an unexpected label is present.
all_data_df['y'] = all_data_df['y'].replace({"yes": 1.0, "no": 0.0}).astype(float)

In [5]:
## Remove the "default" feature: only a single row has the value "yes",
## so the column carries almost no information.
all_data_df = all_data_df.drop("default", axis=1)

In [6]:
## Feature engineering: one-hot encode every object-dtype (string) column.
## The target 'y' is expected to be numeric by now, so it is not selected.
categorical_columns = all_data_df.select_dtypes(include=["object"]).columns.tolist()
all_data_df = pd.get_dummies(all_data_df, columns=categorical_columns)

In [7]:
# Sanity check: (rows, columns) of the combined frame after one-hot encoding.
all_data_df.shape

(4119, 56)

In [8]:
# Inspect the column dtypes of the encoded frame.
all_data_df.dtypes

age                                int64
duration                           int64
campaign                           int64
pdays                              int64
previous                           int64
emp.var.rate                     float64
cons.price.idx                   float64
cons.conf.idx                    float64
euribor3m                        float64
nr.employed                      float64
y                                float64
job_admin.                       float64
job_blue-collar                  float64
job_entrepreneur                 float64
job_housemaid                    float64
job_management                   float64
job_retired                      float64
job_self-employed                float64
job_services                     float64
job_student                      float64
job_technician                   float64
job_unemployed                   float64
marital_divorced                 float64
marital_married                  float64
marital_single  

### Get the ndarray

In [8]:
## Split the combined frame back into its two parts: rows that have a target
## value form the training set, rows without one form the Kaggle test set.
## (train_df / test_df are rebound here to the cleaned, encoded rows.)
is_labelled = all_data_df['y'].notnull()
train_df = all_data_df[is_labelled]
test_df = all_data_df[~is_labelled]

In [9]:
## Convert to numpy arrays: y is the target vector, X / X_Kaggle hold every
## column except the target.
y = train_df['y'].values
X = train_df.drop('y', axis=1).values
X_Kaggle = test_df.drop('y', axis=1).values

In [10]:
# (n_training_rows, n_features)
X.shape

(2999, 55)

In [11]:
# (n_kaggle_rows, n_features)
X_Kaggle.shape

(1120, 55)

### Using pipelines

In [189]:

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import RFE, SelectPercentile
#classifiers
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,  matthews_corrcoef
# imbalanced data
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

# Shared settings for the model cells below.
r_state = 0  # seed passed as random_state to the samplers and estimators
n_folds = 3  # value given to GridSearchCV's cv= argument
mcc = make_scorer(matthews_corrcoef)  # Matthews correlation coefficient as the grid-search scorer

In [190]:
# Building blocks shared by the pipelines defined in the following cells.
# NOTE(review): the sampler arguments used here and in the grids (random_state on
# EditedNearestNeighbours / TomekLinks, SMOTE's `kind`) match the imbalanced-learn
# version whose reprs appear in the outputs; other releases may reject them —
# confirm before upgrading.

# scalers
minMax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# feature selectors
RFE_transformer = RFE(DecisionTreeClassifier(random_state=r_state))
select_percentile_transformer = SelectPercentile()

# feature transformer
polynomial_transformer = PolynomialFeatures()

# samplers
## over-sampler
smote_sampler = SMOTE(random_state=r_state)

## under-samplers
enn_sampler = EditedNearestNeighbours(random_state=r_state)
tomek_links_sampler = TomekLinks(random_state=r_state)
random_under_sampler = RandomUnderSampler(random_state=r_state)

# estimators
rf = RandomForestClassifier(random_state=r_state)
adaBoost = AdaBoostClassifier(random_state=r_state)
svc = SVC(random_state=r_state)
log_reg = LogisticRegression(random_state=r_state)
mlp = MLPClassifier(random_state=r_state)
gboost = GradientBoostingClassifier(random_state=r_state)

# balanced bagging classifier (seeded from r_state like every other component,
# rather than from a separate hard-coded 0)
balanced_bagging_classfier = BalancedBaggingClassifier(random_state=r_state)


### Neural Nets

In [None]:
# MLP pipeline: scale -> select features -> over-sample -> under-sample -> classify.
neural_net_pipline = Pipeline([('scaler', minMax_scaler),
                               ('feature_extractor', RFE_transformer),
                               ('over_sampling', smote_sampler),
                               ('under_sampling', enn_sampler),
                               ('estimator', mlp)])

# Two sub-grids that differ only in the feature selector (RFE vs. SelectPercentile).
# Fix: the scaler candidates referred to `standart_scaler`, a name that is never
# defined (NameError); the StandardScaler instance is called `standard_scaler`.
neural_net_grid = [
    {'scaler': [minMax_scaler, standard_scaler],
     'feature_extractor': [RFE_transformer],
     'feature_extractor__n_features_to_select': [X.shape[1] // 5, X.shape[1] // 3, X.shape[1] // 2],
     'over_sampling__kind': ['svm', 'borderline1', 'borderline2'],
     'under_sampling': [enn_sampler, tomek_links_sampler]
     },
    {'scaler': [minMax_scaler, standard_scaler],
     'feature_extractor': [select_percentile_transformer],
     'feature_extractor__percentile': [10, 20, 30, 40, 50],
     'over_sampling__kind': ['svm', 'borderline1', 'borderline2'],
     'under_sampling': [enn_sampler, tomek_links_sampler]
     }
]

neural_net = GridSearchCV(neural_net_pipline, neural_net_grid, scoring=mcc, cv=n_folds, n_jobs=1)
neural_net.fit(X, y)
neural_net.best_score_

In [61]:
# Best cross-validated MCC found by the MLP grid search.
neural_net.best_score_

0.55632645627343091

### Linear models

In [None]:
## Logistic regression: grid search over the resampling strategy, the polynomial
## degree and the regularisation strength C.

logreg_pipeline = Pipeline([
    ('scaler', minMax_scaler),
    ('feature_extractor', RFE_transformer),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('feature_interraction', polynomial_transformer),
    ('estimator', log_reg),
])

# Entries common to both sub-grids.
logreg_common = {
    'feature_extractor__n_features_to_select': [X.shape[1] // 3],
    'feature_interraction': [polynomial_transformer],
    'feature_interraction__degree': [1, 2, 3],
    'estimator__C': [0.01, 0.1, 1, 10, 100],
}

logreg_grid = [
    # 1) handle the class imbalance by resampling (SMOTE variant + under-sampler)
    dict(logreg_common, **{
        'scaler': [minMax_scaler],
        'over_sampling': [smote_sampler],
        'over_sampling__kind': ['svm', 'borderline1', 'borderline2'],
        'under_sampling': [enn_sampler, tomek_links_sampler],
        'estimator': [log_reg],
    }),
    # 2) no resampling steps; use class_weight='balanced' on the estimator instead
    dict(logreg_common, **{
        'over_sampling': [None],
        'under_sampling': [None],
        'estimator__class_weight': ['balanced'],
    }),
]

logreg = GridSearchCV(estimator=logreg_pipeline, param_grid=logreg_grid,
                      scoring=mcc, cv=n_folds, n_jobs=-1)
logreg.fit(X, y)
logreg.best_score_

In [104]:
# Hyper-parameter combination that achieved the best score in the logistic-regression search.
logreg.best_params_

{'estimator': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'estimator__C': 100,
 'feature_interraction': PolynomialFeatures(degree=1, include_bias=True, interaction_only=False),
 'feature_interraction__degree': 1,
 'over_sampling': SMOTE(k=None, k_neighbors=5, kind='svm', m=None, m_neighbors=10, n_jobs=1,
    out_step=0.5, random_state=0, ratio='auto', svm_estimator=None),
 'over_sampling__kind': 'svm',
 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

In [110]:
# Linear-kernel SVM: same step layout as the logistic-regression pipeline,
# with SVC as the final estimator.
svm_pipeline = Pipeline([
    ('scaler', minMax_scaler),
    ('feature_extractor', RFE_transformer),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('feature_interraction', polynomial_transformer),
    ('estimator', svc),
])

# Entries common to both sub-grids.
svm_common = {
    'feature_extractor__n_features_to_select': [X.shape[1] // 3],
    'feature_interraction__degree': [1, 2, 3],
    'estimator__C': [0.01, 0.1, 1, 10, 100],
    'estimator__kernel': ['linear'],
}

svm_grid = [
    # 1) rebalance the classes by resampling
    dict(svm_common, **{
        'over_sampling__kind': ['svm', 'borderline1', 'borderline2'],
        'under_sampling': [enn_sampler, tomek_links_sampler],
    }),
    # 2) no resampling steps; class_weight='balanced' on the estimator instead
    dict(svm_common, **{
        'over_sampling': [None],
        'under_sampling': [None],
        'estimator__class_weight': ['balanced'],
    }),
]

svmlinear = GridSearchCV(estimator=svm_pipeline, param_grid=svm_grid,
                         scoring=mcc, cv=n_folds, n_jobs=-1)
svmlinear.fit(X, y)
svmlinear.best_score_

0.55368330616345285

In [111]:
# Hyper-parameter combination that achieved the best score in the linear-SVM search.
svmlinear.best_params_

{'estimator__C': 10,
 'estimator__kernel': 'linear',
 'feature_interraction__degree': 1,
 'over_sampling__kind': 'svm',
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

###  Support Vector Machine models

In [115]:
# RBF-kernel SVM. No polynomial-feature step in this pipeline.
svm_nonlinear_pipeline = Pipeline([
    ('scaler', minMax_scaler),
    ('feature_extractor', RFE_transformer),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('estimator', svc),
])

# Single grid: SMOTE variant x under-sampler x C x gamma.
svm_nonlinear_grid = [dict(
    over_sampling__kind=['svm', 'borderline1', 'borderline2'],
    feature_extractor__n_features_to_select=[X.shape[1] // 3],
    under_sampling=[enn_sampler, tomek_links_sampler],
    estimator__kernel=['rbf'],
    estimator__C=[0.01, 0.1, 1, 10, 100],
    estimator__gamma=[0.01, 0.1, 1, 10, 100],
)]

svm_nonlinear = GridSearchCV(estimator=svm_nonlinear_pipeline,
                             param_grid=svm_nonlinear_grid,
                             scoring=mcc, cv=n_folds, n_jobs=-1)
svm_nonlinear.fit(X, y)
svm_nonlinear.best_score_

0.55330785341998578

In [116]:
# Hyper-parameter combination that achieved the best score in the RBF-SVM search.
svm_nonlinear.best_params_

{'estimator__C': 100,
 'estimator__gamma': 0.01,
 'estimator__kernel': 'rbf',
 'over_sampling__kind': 'borderline2',
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

### Ensemble methods

In [159]:
# Random forest. The pipeline has no scaling step (not needed for this model).
rf_pipline = Pipeline([
    ('feature_extractor', RFE_transformer),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('estimator', rf),
])

# Resampling is pinned to SMOTE(kind='svm') + Tomek links; only the number of
# selected features and the forest's own hyper-parameters are searched.
rf_gride = [dict(
    over_sampling__kind=['svm'],
    under_sampling=[tomek_links_sampler],
    feature_extractor__n_features_to_select=[6, 11, 15],
    estimator__n_estimators=[20, 40, 60],
    estimator__max_features=[0.5, 1.0],
    estimator__max_depth=[5, 10],
)]

random_forest = GridSearchCV(estimator=rf_pipline, param_grid=rf_gride,
                             scoring=mcc, cv=n_folds, n_jobs=-1)
random_forest.fit(X, y)
random_forest.best_score_

0.57668378986939639

In [187]:
# Hyper-parameter combination that achieved the best score in the random-forest search.
random_forest.best_params_
# TODO: try adding 0.2 to the max_features candidates

{'estimator__max_depth': 5,
 'estimator__max_features': 0.5,
 'estimator__n_estimators': 40,
 'feature_extractor__n_features_to_select': 11,
 'over_sampling__kind': 'svm',
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

In [180]:
# AdaBoost over decision trees, with the same feature-selection and resampling
# steps as the random-forest pipeline.
ada_pipline = Pipeline([
    ('feature_extractor', RFE_transformer),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('estimator', adaBoost),
])

# Resampling is pinned to SMOTE(kind='svm') + Tomek links; the search covers the
# number of selected features, the base-tree depth, the boosting algorithm and
# the learning rate.
ada_grid = [dict(
    over_sampling__kind=['svm'],
    under_sampling=[tomek_links_sampler],
    feature_extractor__n_features_to_select=[8, 11, 13],
    estimator__base_estimator=[DecisionTreeClassifier(max_depth=5),
                               DecisionTreeClassifier(max_depth=10)],
    estimator__algorithm=['SAMME', 'SAMME.R'],
    estimator__learning_rate=[1, 0.8],
)]

ada = GridSearchCV(estimator=ada_pipline, param_grid=ada_grid,
                   scoring=mcc, cv=n_folds, n_jobs=-1)
ada.fit(X, y)
ada.best_score_

0.5537669231060447

In [186]:
# Hyper-parameter combination that achieved the best score in the AdaBoost search.
ada.best_params_

{'estimator__algorithm': 'SAMME.R',
 'estimator__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'estimator__learning_rate': 0.8,
 'feature_extractor__n_features_to_select': 11,
 'over_sampling__kind': 'svm',
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

In [252]:
## Gradient boosting. Step order here: feature selection, scaling, over-sampling,
## under-sampling, estimator.
g_boost_pipeline = Pipeline([
    ('feature_extractor', RFE_transformer),
    ('scaler', minMax_scaler),
    ('over_sampling', smote_sampler),
    ('under_sampling', enn_sampler),
    ('estimator', gboost),
])

# Scaler and resampling are pinned (MinMax, SMOTE(kind='svm'), Tomek links) and
# the loss is fixed to 'exponential'; the search covers the number of selected
# features and the booster's size/depth/subsampling settings.
g_boost_grid = [dict(
    scaler=[minMax_scaler],
    over_sampling__kind=['svm'],
    under_sampling=[tomek_links_sampler],
    feature_extractor__n_features_to_select=[12, 13, 14],
    estimator__loss=['exponential'],
    estimator__n_estimators=[100, 150],
    estimator__max_depth=[2, 3, 4],
    estimator__subsample=[.66, 0.5],
    estimator__max_features=[.8, .9],
)]

g_boost = GridSearchCV(estimator=g_boost_pipeline, param_grid=g_boost_grid,
                       scoring=mcc, cv=n_folds, n_jobs=-1)
g_boost.fit(X, y)
g_boost.best_score_

0.59628181717552442

In [241]:
# Hyper-parameter combination that achieved the best score in the gradient-boosting search.
g_boost.best_params_

{'estimator__loss': 'exponential',
 'estimator__max_depth': 3,
 'estimator__max_features': 0.8,
 'estimator__n_estimators': 100,
 'estimator__subsample': 0.66,
 'feature_extractor__n_features_to_select': 13,
 'over_sampling__kind': 'svm',
 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

In [253]:
# Same lookup as the previous cell, from a later execution (In [253]); the output is identical.
g_boost.best_params_

{'estimator__loss': 'exponential',
 'estimator__max_depth': 3,
 'estimator__max_features': 0.8,
 'estimator__n_estimators': 100,
 'estimator__subsample': 0.66,
 'feature_extractor__n_features_to_select': 13,
 'over_sampling__kind': 'svm',
 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'under_sampling': TomekLinks(n_jobs=1, random_state=0, ratio='auto', return_indices=False)}

### balanced bagging classifier

In [66]:
# Class balance of the training target: label -> number of rows.
from collections import Counter
Counter(y)

Counter({0.0: 2668, 1.0: 331})

In [117]:
# Balanced bagging: feature selection followed directly by the classifier.
# There are no separate resampling steps in this pipeline.
bagging_pipeline = Pipeline([('feature_extractor', RFE_transformer),
                             ('estimator', balanced_bagging_classfier)])

bagging_grid = [{
        'feature_extractor__n_features_to_select': [11, 13, 15, 17],
        'estimator__base_estimator': [DecisionTreeClassifier(max_depth=5)],
        'estimator__n_estimators': [80],
        'estimator__max_samples': [.66],
        # Fix: the second candidate was the integer 1, which bagging estimators
        # read as "draw exactly one feature"; the float 1.0 means "all features",
        # matching the fractional candidate next to it.
        'estimator__max_features': [.66, 1.0],
        'estimator__bootstrap': [True],
        'estimator__replacement': [True, False],
        'estimator__oob_score': [True]
    }]

bagging = GridSearchCV(bagging_pipeline, bagging_grid, cv=n_folds, scoring=mcc, n_jobs=-1)
bagging.fit(X, y)
bagging.best_score_

0.54256414529987329

In [101]:
# (Removed a leftover interactive `BalancedBaggingClassifier?` help lookup: it is
# IPython-only syntax and produces nothing the analysis depends on.)

In [118]:
# Hyper-parameter combination that achieved the best score in the balanced-bagging search.
bagging.best_params_

{'estimator__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'estimator__bootstrap': True,
 'estimator__max_features': 0.66,
 'estimator__max_samples': 0.66,
 'estimator__n_estimators': 80,
 'estimator__oob_score': True,
 'estimator__replacement': False,
 'feature_extractor__n_features_to_select': 11}

### Test with Kaggle

In [142]:
# Load the labels for the Kaggle rows and encode them as floats
# ("yes" -> 1.0, "no" -> 0.0).
kaggle_df = pd.read_csv("test_kaggle.csv", sep=",")
# Assign the encoded column back instead of calling replace(..., inplace=True)
# on the selected column, which is not guaranteed to update kaggle_df; the
# explicit float cast fails loudly if an unexpected label is present.
kaggle_df['y'] = kaggle_df['y'].replace({"yes": 1.0, "no": 0.0}).astype(float)
y_kaggle = kaggle_df['y'].values

In [145]:
# Sanity check: one label per Kaggle feature row (expected: True).
y_kaggle.shape[0] == X_Kaggle.shape[0]

True

In [254]:
# MCC of the gradient-boosting search's predictions against the Kaggle labels.
matthews_corrcoef(y_pred=g_boost.predict(X_Kaggle), y_true= y_kaggle)

0.59730759260684463

In [244]:
# Build the submission file name from the model's MCC on the Kaggle labels,
# with every '.' turned into '_'.
kaggle_mcc = matthews_corrcoef(y_true=y_kaggle, y_pred=g_boost.predict(X_Kaggle))
file_name = ("predict_gboost_" + str(kaggle_mcc)).replace('.', '_')
file_name

'predict_gboost_0_597307592607'

In [248]:
def save_submission(file_name, y_test):
    """Write predictions to a CSV submission file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to write.
    y_test : array-like
        Predicted labels, one per test row. Any array-like is accepted (ndarray,
        list, column vector); values are cast to int64, so float labels such as
        0.0 / 1.0 are written as 0 / 1.

    The file has a header row and two columns: "Id" (row number starting at 1)
    and "prediction". Nothing is returned.
    """
    # asarray + ravel lets plain lists and (n, 1) arrays through as well
    predictions = np.asarray(y_test).ravel().astype(np.int64)
    submission_df = pd.DataFrame({"Id": range(1, len(predictions) + 1),
                                  "prediction": predictions},
                                 columns=["Id", "prediction"])  # pin the column order
    submission_df.to_csv(file_name, index=False)

In [249]:
# Write the gradient-boosting predictions for the Kaggle rows to `file_name`.
save_submission(file_name, g_boost.predict(X_Kaggle))