Training the model using boosting
=============

In [5]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import gc
random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project')

# load the cleanData
data = pd.read_pickle(r'.\data\output\cleanData.pkl')
data = data.drop(['10 Digit Card Number', 'show'], axis=1)

#### Scale the numerical variables and Split data into train and test

In [6]:
def preprocess_data(data):
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']

    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 0)

    gather_data = pd.DataFrame(columns = ['feature', 'mean', 'std'])
    for i in cols:
        ave = X_train[i].mean()
        std = X_train[i].std()
        appended = pd.DataFrame({'feature': i, 'mean': ave, 'std': std}, index = [0])
        gather_data = gather_data.append(appended, ignore_index=True)
        x_test[i] = (x_test[i]-ave)/std
        X_train[i] = (X_train[i]-ave)/std

    # First, scale the Data - only those numerical/non-categorical
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test, gather_data

X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)
#gather_data.to_csv(r'.\data\output\mean_std_scaler.csv', index=False)
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14

#### Upsample to cure the imbalance bias

In [7]:
'''def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)'''

'def upsample_data(train_data, label):\n    # apply oversampling (SMOTE) since the data is very imbalanced\n    smote = SMOTE(random_state=1, ratio=1.0)\n    X_train, Y_train = smote.fit_resample(train_data, label)\n    return X_train, Y_train\n\nX_train, Y_train = upsample_data(X_train, Y_train)'

In [18]:
def grid_search(X_train, Y_train):
    model = XGBClassifier(boosting_type='gbdt',  objective='binary:logistic', metric='auc')

    param_dist = {
                    'learning_rate': [0.1],
                    'max_depth': [8],
                    'subsample': [0.6],
                    'colsample_bytree': [0.7, 0.5],
                    'n_estimators': [300, 200],
                    'lambda': [0.2, 0.3],
                    'alpha': [0, 0.2]
    }

    n_iter_search = 50
    grid = RandomizedSearchCV(model, cv=5, param_distributions=param_dist, n_jobs=12, n_iter=n_iter_search) ## 10-fold cross-validation
    grid.fit(X_train, Y_train)
    print ('\nThe best parameters are ', grid.best_params_)
    print('\nThe best score is ', grid.best_score_)
    return grid

grid = grid_search(X_train, Y_train)




The best parameters are  {'subsample': 0.6, 'learning_rate': 0.1, 'n_estimators': 300, 'lambda': 0.2, 'max_depth': 8, 'colsample_bytree': 0.7, 'alpha': 0}

The best score is  0.7376449054301403
