# Random Forest/XGBoost Classification

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
!pip install scikit-optimize



## Importing the datasets

In [2]:
from pathlib import Path

# Folder that contains train.csv and test.csv; change this single line to
# run the notebook on another machine.
DATA_DIR = Path("/Users/nielsblom/Documents")

# Both files use ';' as the field separator and ',' as the decimal mark.
df = pd.read_csv(DATA_DIR / "train.csv", sep=";", decimal=",")
df_test = pd.read_csv(DATA_DIR / "test.csv", sep=";", decimal=",")

In [3]:
# Preview the first five training rows (head() defaults to n=5).
df.head()

Unnamed: 0,target,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4h1,r4h2,...,computer,television,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,rural,age
0,0,0.0,0,4,0,1,1,0,0,1,...,0,0,3,1,0,0,0,0,1,43
1,0,0.0,0,8,0,1,1,1,0,1,...,0,0,3,0,0,0,0,0,1,18
2,0,0.0,0,5,0,1,1,0,0,2,...,0,1,2,0,0,1,0,0,0,62
3,1,0.0,0,8,0,1,1,2,0,2,...,1,1,4,1,0,0,0,0,0,20
4,0,350000.0,0,5,0,1,1,3,1,1,...,1,0,3,1,0,0,0,0,0,3


In [4]:
# Preview the first five rows of the test file (head() defaults to n=5).
df_test.head()

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4h1,r4h2,r4m1,...,computer,television,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,rural,age
0,0.0,0,8,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,92
1,90000.0,1,2,0,1,1,0,0,2,0,...,0,0,1,1,0,0,0,0,0,50
2,215000.0,0,4,0,1,1,0,0,1,0,...,0,0,2,1,0,0,0,0,0,22
3,150000.0,0,3,0,1,1,0,0,1,0,...,0,0,1,1,0,0,0,0,0,66
4,100000.0,1,1,1,0,1,0,0,2,0,...,0,0,3,1,0,0,0,0,0,41


## Define train and test sets

In [5]:
# The first column of df supplies the labels; every remaining column is a feature.
label_column = df.iloc[:, 0]
feature_columns = df.iloc[:, 1:]
X = feature_columns.values
y = label_column.values

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Define timer function 

In [8]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime`` so the caller can keep it as the start mark.  Called with
    that mark it prints the elapsed hours, minutes and seconds and
    returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No start mark supplied: hand back "now" to be used as one.
    return datetime.now()

## Code for muting deprecation warnings of XGBoost

In [9]:
import warnings
# Ignore every warning of category UserWarning from this point on.
# The filter matches on category only (no message or module pattern), so it
# is not limited to warnings raised by XGBoost.
warnings.filterwarnings("ignore", category=UserWarning)

# Train and fit default model

In [10]:
!pip install xgboost
from xgboost import XGBClassifier
classifier = XGBClassifier(random_state=0, eval_metric = "logloss")
classifier.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

## Predicting the Test set results

In [11]:
y_pred = classifier.predict(X_test)
# Show predictions next to the true labels: column 0 = predicted, column 1 = actual.
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

[[0 0]
 [1 1]
 [1 1]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the baseline model on the hold-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Bare last expression so the notebook displays the hold-out accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[927  26]
 [ 84 491]]


0.9280104712041884

# Apply k-Fold Cross Validation

In [14]:
from sklearn.model_selection import cross_val_score

# Number of folds; the hyperparameter searches further down reuse this value.
cv = 10
accuracies = cross_val_score(classifier, X_train, y_train, cv=cv)

# Report mean and spread of the per-fold accuracies as percentages.
mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 91.57 %
Standard Deviation: 1.26 %


## Apply repeated stratified K-fold cross validation

In [15]:
from sklearn.model_selection import RepeatedStratifiedKFold

# 10 stratified folds, repeated 3 times with a fixed seed.
repcv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): this run scores on the full, unscaled X / y, whereas the plain
# 10-fold run uses X_train / y_train - confirm the difference is intended.
accuracies = cross_val_score(classifier, X, y, cv=repcv)

mean_accuracy_pct = accuracies.mean() * 100
std_accuracy_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 92.93 %
Standard Deviation: 0.84 %


## Apply a grid-search to tune the hyperparameters

## Bayesian search XGBoost

In [30]:
from skopt import BayesSearchCV

# Number of parameter settings the Bayesian optimiser evaluates.
n_iter = 70

# Candidate values per XGBoost parameter (each list is a discrete set of options).
parameters = [{
                'n_estimators': [100],
                'max_depth': [12, 16, 20, 24],
                'lambda': [0.0001, 0.0005, 0.001, 0.1],
                'alpha': [0.0001, 0.0005, 0.001, 0.1],
                'eta': [0.001, 0.1, 0.2, 0.3, 0.4],
                # Only built-in XGBoost metrics are valid here; 'accuracy_score'
                # is a scikit-learn function name, not an XGBoost metric.
                'eval_metric': ['logloss', 'error'],
                'gamma': [0.0005, 0.001, 0.002],
                'min_child_weight': [1, 3, 5],
                'max_delta_step': [1, 3, 5, 10],
                'subsample': [0.75, 1.0],
                'colsample_bytree': [0.75, 1.0],
                'colsample_bylevel': [0.75, 1.0],
                'colsample_bynode': [0.75, 1.0],
              }]


# Candidates are ranked by cross-validated accuracy over `cv` folds.
bayes_search = BayesSearchCV(estimator = classifier,
                             search_spaces = parameters,
                             n_iter = n_iter,
                             scoring = 'accuracy',
                             cv = cv,
                             n_jobs = -1,
                             random_state = 0,
                             verbose = 10)

start_time = timer(None) # timing starts from this point for "start_time" variable
bayes_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

best_accuracy = bayes_search.best_score_
best_parameters = bayes_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Grid search:
Time taken: 13 hours 31 minutes and 16.91 seconds.
Best Accuracy: 95.24 %
Best Parameters: {'colsample_bytree': 1.0, 'eta': 0.2, 'eval_metric': 'logloss', 'gamma': 0.0025, 'max_delta_step': 10, 'max_depth': 12, 'min_child_weight': 0, 'n_estimators': 100, 'reg_alpha': 0.0001, 'reg_lambda': 0.2, 'subsample': 1.0}
With repcv

 Time taken: 1 hours 49 minutes and 45.04 seconds.
Best Accuracy: 94.99 %
Best Parameters: OrderedDict([('alpha', 0.0001), ('colsample_bytree', 1.0), ('eta', 0.0001), ('gamma', 0.001), ('lambda', 0.001), ('max_delta_step', 5), ('max_depth', 16), ('min_child_weight', 0), ('subsample', 1.0)])



## Grid search XGBoost

In [84]:
from sklearn.model_selection import GridSearchCV

# Settings held at a single value during this search; only the number of
# trees is varied.
fixed_settings = {
    'eval_metric': "logloss",
    'reg_alpha': 0.0001,
    'colsample_bytree': 1.0,
    'eta': 0.2,
    'gamma': 0.0025,
    'reg_lambda': 0.2,
    'max_delta_step': 5,
    'max_depth': 13,
    'min_child_weight': 0,
    'subsample': 1.0,
}
parameters = [{
    'n_estimators': [100, 500, 1000, 1500, 2000],
    # GridSearchCV expects a list of candidates per parameter.
    **{name: [value] for name, value in fixed_settings.items()},
}]
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=cv, n_jobs=-1)

# Time the search: timer() returns the start mark, timer(mark) prints the elapsed time.
start_time = timer()
grid_search.fit(X_train, y_train)
timer(start_time)

best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy * 100))
print("Best Parameters:", best_parameters)


 Time taken: 0 hours 38 minutes and 48.31 seconds.
Best Accuracy: 94.94 %
Best Parameters: {'colsample_bytree': 1.0, 'eta': 0.2, 'eval_metric': 'logloss', 'gamma': 0.0025, 'max_delta_step': 5, 'max_depth': 13, 'min_child_weight': 0, 'n_estimators': 500, 'reg_alpha': 0.0001, 'reg_lambda': 0.2, 'subsample': 1.0}


## Best model

In [16]:
# Hyperparameters of the tuned model, collected in one place.
best_model_settings = dict(
    random_state=0,
    n_estimators=1000,
    eval_metric="logloss",
    reg_alpha=0.0001,
    colsample_bytree=1.0,
    eta=0.2,
    gamma=0.0025,
    reg_lambda=0.2,
    max_delta_step=5,
    max_depth=13,
    min_child_weight=0,
    subsample=1.0,
)
best_classifier = XGBClassifier(**best_model_settings)
# Fit on the training split so the model can be used for prediction afterwards.
best_classifier.fit(X_train, y_train)
# Estimate accuracy with the repeated stratified folds defined earlier.
accuracies = cross_val_score(best_classifier, X_train, y_train, cv=repcv, n_jobs=-1)
mean_accuracy_pct = accuracies.mean() * 100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))

Accuracy: 94.80 %


### Best accuracy and parameters

Accuracy:
Accuracy: 95.04 %

Parameters:
{'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'enable_categorical': False, 'gamma': 0.0025, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.200000003, 'max_delta_step': 5, 'max_depth': 13, 'min_child_weight': 0, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 1000, 'n_jobs': 8, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0.0001, 'reg_lambda': 0.2, 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'eval_metric': 'logloss', 'eta': 0.2}

In [17]:
# Record the full parameter set of the tuned model alongside its score.
best_model_params = best_classifier.get_params()
print(best_model_params)

{'objective': 'binary:logistic', 'use_label_encoder': False, 'base_score': 0.5, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'gamma': 0.0025, 'gpu_id': -1, 'grow_policy': 'depthwise', 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.200000003, 'max_bin': 256, 'max_cat_to_onehot': 4, 'max_delta_step': 5, 'max_depth': 13, 'max_leaves': 0, 'min_child_weight': 0, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 1000, 'n_jobs': 0, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0.0001, 'reg_lambda': 0.2, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'eta': 0.2}


In [18]:
y_pred_best = best_classifier.predict(X_test)
# Show the tuned model's predictions (not the baseline's y_pred) next to the
# true labels: column 0 = predicted, column 1 = actual.
print(np.concatenate((y_pred_best.reshape(len(y_pred_best),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [1 1]
 [1 1]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the tuned model on the hold-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_best)
print(cm)
# Bare last expression so the notebook displays the hold-out accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred_best)

[[937  16]
 [ 53 522]]


0.9548429319371727

# Make final predictions

In [20]:
# Every column of the test file is used as a feature.
X_test_final = df_test.values

In [21]:
# Fresh estimator with the tuned hyperparameters, trained on every labelled
# row (X, y) before predicting the rows loaded from the test file.
final_classifier = XGBClassifier(random_state=0,
                                n_estimators = 1000,
                                eval_metric = "logloss",
                                reg_alpha = 0.0001, 
                                colsample_bytree = 1.0, 
                                eta = 0.2, 
                                gamma = 0.0025, 
                                reg_lambda = 0.2, 
                                max_delta_step = 5, 
                                max_depth = 13, 
                                min_child_weight = 0, 
                                subsample = 1.0
                               )
# Fit and predict with final_classifier itself, so best_classifier (already
# evaluated on the hold-out split) is not silently refitted.
final_classifier.fit(X,y)
# The name y_pred_best is kept because the export cell reads it.
y_pred_best = final_classifier.predict(X_test_final)

print(y_pred_best)

[0 0 0 ... 0 0 1]


In [29]:
# Convert each predicted label to its string form.
y_pred_best_string = list(map(str, y_pred_best))
print(y_pred_best_string)

# NOTE(review): the labels are written back-to-back with no separator or
# newline - confirm this is the file format the consumer expects.
with open('predictions.txt', 'w') as txt_file:
    txt_file.writelines(y_pred_best_string)


['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0',