In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_df = pd.read_csv("train.csv", sep=";")
test_df = pd.read_csv("test.csv", sep=";", header=None, names=train_df.columns.values)

# get all the data
all_data_df = train_df.append(test_df)

In [3]:
## cleaning the data, get rid of the NaN values in some 
for column_to_cure in ["job", "marital", "education", "loan", "housing", "default"]:
    most_commun = all_data_df[column_to_cure].mode().iloc[0]
    all_data_df[column_to_cure].replace("unknown", most_commun, inplace=True)

In [4]:
## encode the target value
all_data_df['y'].replace("yes", 1.0, inplace=True)
all_data_df['y'].replace("no", 0.0, inplace=True)

In [5]:
## drop the default column (there is only one yes among all data points)
all_data_df.drop(["default"], axis=1, inplace=True)

In [6]:
## Adding features
## using one-hot-encoding on all the categorical columns except the y which is the target
categorical_columns = list(all_data_df.dtypes[all_data_df.dtypes == "object"].index.values)
all_data_df = pd.get_dummies(all_data_df, columns=categorical_columns)

In [7]:
all_data_df.shape

(4119, 56)

In [8]:
all_data_df.dtypes

age                                int64
duration                           int64
campaign                           int64
pdays                              int64
previous                           int64
emp.var.rate                     float64
cons.price.idx                   float64
cons.conf.idx                    float64
euribor3m                        float64
nr.employed                      float64
y                                float64
job_admin.                       float64
job_blue-collar                  float64
job_entrepreneur                 float64
job_housemaid                    float64
job_management                   float64
job_retired                      float64
job_self-employed                float64
job_services                     float64
job_student                      float64
job_technician                   float64
job_unemployed                   float64
marital_divorced                 float64
marital_married                  float64
marital_single  

### Get the ndarray

In [9]:
## get the Kaggle train and test dataset
test_df = all_data_df[all_data_df['y'].isnull()]
train_df= all_data_df[all_data_df['y'].notnull()]

In [10]:
## getting the numpy equivalent 
y = train_df['y'].values
X = train_df.loc[:, train_df.columns != 'y'].values
X_Kaggle = test_df.loc[:, test_df.columns != 'y'].values

In [11]:
X.shape

(2999, 55)

In [12]:
X_Kaggle.shape

(1120, 55)

In [15]:
# make the train and test set 
from sklearn.model_selection import train_test_split
# the test size 
test_size= 0.3

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size= test_size, random_state=0)

In [16]:
print("train ="+ str(X_train.shape) + ", test ="+ str(X_test.shape))

train =(2099, 55), test =(900, 55)


### Feature selection

In [17]:
# using percentiles and the 50 precent most influent
from sklearn.feature_selection import SelectPercentile

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)

# get the most correlated data
all_data_df.columns.values[all_data_df.columns.values!='y'][select.get_support()]



array(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_management', 'job_retired', 'job_unemployed',
       'education_university.degree', 'contact_cellular',
       'contact_telephone', 'month_dec', 'month_jun', 'month_mar',
       'month_may', 'month_oct', 'month_sep', 'day_of_week_fri',
       'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success'], dtype=object)

In [18]:
# using model based feature selection 
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

select1= SelectFromModel(DecisionTreeClassifier(random_state=0),threshold= "median")
select1.fit(X_train,y_train)

# get the selected features 
all_data_df.columns.values[all_data_df.columns.values != 'y'][select1.get_support()]

array(['age', 'duration', 'campaign', 'pdays', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'job_admin.',
       'job_entrepreneur', 'job_technician', 'job_unemployed',
       'marital_married', 'education_basic.6y', 'education_high.school',
       'education_university.degree', 'housing_no', 'housing_yes',
       'contact_cellular', 'contact_telephone', 'month_jul', 'month_jun',
       'month_oct', 'month_sep', 'day_of_week_fri', 'day_of_week_mon',
       'day_of_week_thu', 'day_of_week_wed'], dtype=object)

In [19]:
# using iterative feature selection 
from sklearn.feature_selection import RFE
select2 = RFE(DecisionTreeClassifier(random_state=0), n_features_to_select=27)
select2.fit(X_train,y_train)

# see the selected features 
all_data_df.columns.values[all_data_df.columns.values != 'y'][select2.get_support()]

array(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_management', 'job_retired',
       'job_self-employed', 'job_technician', 'job_unemployed',
       'marital_married', 'education_high.school', 'housing_yes',
       'contact_telephone', 'month_oct', 'month_sep', 'day_of_week_fri',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_wed'], dtype=object)

In [20]:
# rescaling the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_s1 = scaler.transform(X_train)

In [22]:
# see if the scaling influences the RFE
select2.fit(X_train_s1,y_train)
f1 = all_data_df.columns.values[all_data_df.columns.values != 'y'][select2.get_support()]
select2.fit(X_train,y_train)
f2 = all_data_df.columns.values[all_data_df.columns.values != 'y'][select2.get_support()]

np.setdiff1d(f1,f2)

array([], dtype=object)

In [38]:
# deal with the imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
X_s1, y_s1 = SMOTE(kind='borderline1', ratio='minority', random_state=0).fit_sample(X_train, y_train)
X_s2, y_s2 = SMOTE(kind='borderline2', ratio='minority', random_state=0).fit_sample(X_train, y_train)
X_s3, y_s3 = SMOTE(kind='svm', ratio='minority', random_state=0).fit_sample(X_train, y_train)
X_s4, y_s4 = RandomUnderSampler(random_state=0).fit_sample(X_train,y_train)

Counter({0.0: 232, 1.0: 232})

In [40]:
# see the result of oversampling on rf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

rf  = RandomForestClassifier().fit(X_s4, y_s4)
matthews_corrcoef(y_true=y_test, y_pred= rf.predict(X_test))


0.48257707560021212

### using piplines

In [78]:
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

r_state = 0
n_folds = 5
mcc = make_scorer(matthews_corrcoef)

In [79]:
#select a third of the features (half of the initial features)
n_features_to_select= X.shape[1] // 3

# the scaler
minMax_scaler = MinMaxScaler()
# the feature_selector
RFE_transformer = RFE(DecisionTreeClassifier(random_state=r_state), n_features_to_select )

# the samplers
## over_samplers 
smote_bline1_sampler = SMOTE(kind='borderline1', random_state=r_state)
smote_bline2_sampler = SMOTE(kind='borderline2', random_state=r_state)
smote_svm_sampler = SMOTE(kind='svm', random_state=r_state)

## under samplers
enn_sampler = EditedNearestNeighbours(random_state=r_state)
tomek_links_sampler = TomekLinks(random_state=r_state)

# the estimator
rf = RandomForestClassifier(random_state=r_state)
adaBoost  = AdaBoostClassifier(random_state= r_state)
svc = SVC()
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15,), random_state=r_state)

# the pipeline
pipeline1 = make_pipeline(minMax_scaler, RFE_transformer, smote_svm_sampler, rf)
pipeline2 = make_pipeline(minMax_scaler, RFE_transformer, smote_svm_sampler, enn_sampler, rf)
pipeline3 = make_pipeline(minMax_scaler, RFE_transformer, smote_svm_sampler, adaBoost)
pipeline4 = make_pipeline(minMax_scaler, RFE_transformer, smote_svm_sampler, svc)
pipeline5 = make_pipeline(minMax_scaler, RFE_transformer, smote_svm_sampler, mlp)


In [80]:
# runing cross_validation

cross_val_score(pipeline5, X, y, scoring=mcc, cv=n_folds)

array([ 0.42518695,  0.4165885 ,  0.57709545,  0.46470108,  0.41759908])

### Choose a classifier

Random Forest

In [34]:
## Random forest with a grid search on it
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef


grid_params_rf = {"class_weight": [None, "balanced"],
                  "n_estimators": [_ * 10 for _ in range(1,11)],
                  "max_features": [_ * 10 for _ in range(1,6)] + [X_train.shape[1]] ,
                  "max_depth":    [ 10 + _ * 10 for _ in range(5)],
                  "random_state": [0]}

mcc = make_scorer(matthews_corrcoef)
k_fold = 3

classifier = GridSearchCV(RandomForestClassifier(),grid_params_rf, cv = k_fold, scoring= mcc)

In [35]:
classifier.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [10, 20, 30, 40, 50], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': [10, 20, 30, 40, 50, 57], 'class_weight': [None, 'balanced'], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [52]:
# try it on all the data we have
classifier.fit(X,y)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [10, 20, 30, 40, 50], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': [10, 20, 30, 40, 50, 57], 'class_weight': [None, 'balanced'], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [53]:
classifier.best_score_

0.5533145894959921

In [57]:
classifier.best_params_

{'class_weight': 'balanced',
 'max_depth': 10,
 'max_features': 57,
 'n_estimators': 80,
 'random_state': 0}

In [56]:
matthews_corrcoef(y_pred=classifier.predict(X_test), y_true=y_test)

0.82966606063184034

Googd score for RandomForest, we added data and that's it 

In [58]:
y_prediction = RandomForestClassifier(**classifier.best_params_).fit(X,y).predict(X_Kaggle)

#save the prediction

submission_df = pd.DataFrame(y_prediction.astype(int), columns=["prediction"])
submission_df.index += 1
                              
submission_df.to_csv("rf_prediction", index_label="Id")

SVM

In [26]:
from sklearn.svm import SVC

grid_params_svm = {"C": np.logspace(start=-2, stop=2, base=2,num=10),
                   "kernel": ["rbf"],
                   "gamma": np.logspace(start=-2, stop=2, base=2,num=10),
                   "class_weight": [None, "balanced"]}

classifier_svc = GridSearchCV(SVC(),grid_params_svm, cv = k_fold, scoring= mcc)


In [27]:
classifier_svc.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf'], 'C': array([ 0.25   ,  0.3402 ,  0.46294,  0.62996,  0.85724,  1.16653,
        1.5874 ,  2.16012,  2.93947,  4.     ]), 'gamma': array([ 0.25   ,  0.3402 ,  0.46294,  0.62996,  0.85724,  1.16653,
        1.5874 ,  2.16012,  2.93947,  4.     ]), 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [30]:
classifier_svc.best_params_

{'C': 0.25, 'class_weight': None, 'gamma': 0.25, 'kernel': 'rbf'}

In [31]:
np.logspace(start=-2, stop=2, base=2,num=10)

array([ 0.25      ,  0.3401975 ,  0.46293736,  0.62996052,  0.85724398,
        1.16652904,  1.58740105,  2.16011948,  2.93946898,  4.        ])

AdaBoost

In [33]:
from sklearn.ensemble import AdaBoostClassifier

grid_params_ada_boost={"n_estimators": [50 * _ for _ in range(1,4)],
                       "algorithm":["SAMME", "SAMME.R"],
                        "random_state": [0]}

classifier_ada_boost= GridSearchCV(AdaBoostClassifier(),grid_params_ada_boost, cv = k_fold, scoring= mcc)


In [34]:
classifier_ada_boost.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'random_state': [0], 'n_estimators': [50, 100, 150], 'algorithm': ['SAMME', 'SAMME.R']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [35]:
classifier_ada_boost.best_score_

0.37631141609740365

In [36]:
classifier_ada_boost.best_params_

{'algorithm': 'SAMME', 'n_estimators': 100, 'random_state': 0}