In [151]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## reading data

In [3]:
pd_data = pd.read_csv('Affairs.csv')

In [4]:
print(pd_data.info(memory_usage=True))
pd_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 10 columns):
Unnamed: 0       601 non-null int64
affairs          601 non-null int64
gender           601 non-null object
age              601 non-null float64
yearsmarried     601 non-null float64
children         601 non-null object
religiousness    601 non-null int64
education        601 non-null int64
occupation       601 non-null int64
rating           601 non-null int64
dtypes: float64(2), int64(6), object(2)
memory usage: 47.0+ KB
None


Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,4,0,male,37.0,10.0,no,3,18,7,4
1,5,0,female,27.0,4.0,no,4,14,6,4
2,11,0,female,32.0,15.0,yes,1,12,1,4
3,16,0,male,57.0,15.0,yes,5,18,6,5
4,23,0,male,22.0,0.75,no,2,17,6,3


In [5]:
af = pd_data.affairs.value_counts()
af

0     451
7      42
12     38
1      34
3      19
2      17
Name: affairs, dtype: int64

In [6]:
# 婚外情比例
1 - af[0]/af.sum()

0.24958402662229617

In [7]:
len(pd_data.dropna()) == len(pd_data)

True

## preprocesing

In [8]:
pd_data['gender'] = pd_data.gender.map({'male': 0, 'female': 1})

In [9]:
pd_data['children'] = pd_data.children.map({'no': 0, 'yes': 1})

In [10]:
pd_data = pd.get_dummies(pd_data, columns=['occupation'])

In [11]:
pd_data['affairs_true'] = pd_data.affairs != 0

In [12]:
pd_data.head()

Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,rating,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,affairs_true
0,4,0,0,37.0,10.0,0,3,18,4,0,0,0,0,0,0,1,False
1,5,0,1,27.0,4.0,0,4,14,4,0,0,0,0,0,1,0,False
2,11,0,1,32.0,15.0,1,1,12,4,1,0,0,0,0,0,0,False
3,16,0,0,57.0,15.0,1,5,18,5,0,0,0,0,0,1,0,False
4,23,0,0,22.0,0.75,0,2,17,3,0,0,0,0,0,1,0,False


In [13]:
pd_data[['Unnamed: 0', 'affairs', 'affairs_true']].head()

Unnamed: 0.1,Unnamed: 0,affairs,affairs_true
0,4,0,False
1,5,0,False
2,11,0,False
3,16,0,False
4,23,0,False


In [14]:
x = pd_data.drop(['Unnamed: 0', 'affairs', 'affairs_true'], axis=1).values
y = pd_data.affairs.astype(int).values
yb = pd_data.affairs_true.astype(int).values

In [15]:
print(pd.value_counts(y))
print(pd.value_counts(yb))

0     451
7      42
12     38
1      34
3      19
2      17
dtype: int64
0    451
1    150
dtype: int64


## XGBClassifier

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, yb)
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train)

In [17]:
seed = 0
eval_set = [(x_validation, y_validation)]
model = XGBClassifier()
model.fit(x_train, y_train, early_stopping_rounds=10, 
          eval_metric="auc", eval_set=eval_set)

[0]	validation_0-auc:0.661844
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.693866
[2]	validation_0-auc:0.686921
[3]	validation_0-auc:0.655864
[4]	validation_0-auc:0.659722
[5]	validation_0-auc:0.654128
[6]	validation_0-auc:0.657215
[7]	validation_0-auc:0.658372
[8]	validation_0-auc:0.659144
[9]	validation_0-auc:0.658758
[10]	validation_0-auc:0.649498
[11]	validation_0-auc:0.654321
Stopping. Best iteration:
[1]	validation_0-auc:0.693866



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [18]:
model.score(x_test, y_test)

0.7417218543046358

In [19]:
confusion_matrix(model.predict(x_test), y_test)

array([[106,  28],
       [ 11,   6]])

## GridSearch

In [20]:
param_xgb = {
    'max_depth': range(2,6),
    'learning_rate': [0.01, 0.1, 0.5], 
    'n_estimators': np.arange(3,16)*20,
    'gamma': [10**i for i in range(-2,3)],
    'min_child_weight': range(1, 5),
    'max_delta_step': range(0, 5)
}

In [23]:
model_2 = XGBClassifier(early_stopping_rounds=10, 
                        eval_metric="auc", eval_set=eval_set)
grid_xgb = GridSearchCV(model_2, param_xgb, n_jobs=-1, verbose=True)

In [24]:
grid_xgb.fit(x_train, y_train)

Fitting 3 folds for each of 15600 candidates, totalling 46800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 1054 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 3054 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 5854 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 9454 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 13854 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19054 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 25054 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 31854 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 39454 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 46800 out of 46800 | elapsed:  6.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, eval_metric='auc',
       eval_set=[(array([[ 0., 32., ...,  0.,  0.],
       [ 0., 32., ...,  0.,  1.],
       ...,
       [ 0., 27., ...,  0.,  0.],
       [ 0., 27.,...0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(2, 6), 'learning_rate': [0.01, 0.1, 0.5], 'n_estimators': array([ 60,  80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300]), 'gamma': [0.01, 0.1, 1, 10, 100], 'min_child_weight': range(1, 5), 'max_delta_step': range(0, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [25]:
grid_xgb.score(x_test, y_test)

0.7483443708609272

In [42]:
grid_xgb.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, eval_metric='auc',
       eval_set=[(array([[ 0., 32., ...,  0.,  0.],
       [ 0., 32., ...,  0.,  1.],
       ...,
       [ 0., 27., ...,  0.,  0.],
       [ 0., 27., ...,  0.,  0.]]), array([0, 1, ..., 0, 0]))],
       gamma=0.01, learning_rate=0.01, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=60, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [105]:
confusion_matrix(grid_xgb.predict(x_test), y_test)

array([[109,  30],
       [  8,   4]])

## hyperopt

In [61]:
from hyperopt import fmin, tpe, hp,partial

In [163]:
space_xgb = {
    'max_depth':hp.choice('max_depth', [2,3,4]),
    'learning_rate':hp.loguniform('learning_rate',np.log(0.01),np.log(0.1)),
    'n_estimators':hp.choice('n_estimators',range(20,161,20)),
    'gamma':hp.loguniform('gamma',np.log(0.01),np.log(1000)),
    'min_child_weight':hp.choice('min_child_weight',range(1,5)),
    'max_delta_step':hp.choice('max_delta_step',range(5))
}

def hp_xgb(args):
    xgb = XGBClassifier(early_stopping_rounds=10,eval_metric="auc",
                       max_depth = args['max_depth'],
                       learning_rate  = args['learning_rate'],
                       n_estimators  = args['n_estimators'],
                       gamma  = args['gamma'],
                       min_child_weight  = args['min_child_weight'],
                       max_delta_step  = args['max_delta_step']
                       )
    xgb.fit(x_train, y_train)
#     return -xgb.score(x_test, y_test)
#     return -accuracy_score(xgb.predict(x_test), y_test)
#     return -f1_score(xgb.predict(x_test), y_test)
    metric = cross_val_score(xgb,x_train,y_train,cv=3,scoring="roc_auc").mean()
    return -metric
best = fmin(
    fn=hp_xgb,
    space=space_xgb,
    algo=partial(tpe.suggest,n_startup_jobs=10),max_evals=100)

In [164]:
print(best)

{'gamma': 1.3860095586273222, 'learning_rate': 0.06324193068174706, 'max_delta_step': 1, 'max_depth': 1, 'min_child_weight': 3, 'n_estimators': 5}


In [165]:
-hp_xgb(best)

0.6073062558356676

In [166]:
x = XGBClassifier(early_stopping_rounds=10,eval_metric="auc",**best).fit(x_train, y_train)

In [167]:
confusion_matrix(x.predict(x_test), y_test)

array([[104,  27],
       [ 13,   7]])