Training the model using Support Vector Machine
=============

In [1]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn import svm
import matplotlib.pyplot as plt
import gc
# Seed both Python's and NumPy's global RNGs; the scientific stack used below
# draws from NumPy, so seeding only `random` does not cover it.
SEED = 23
random.seed(SEED)
np.random.seed(SEED)

## set the directory
# The project root defaults to the original author's location. Set the
# FOOD_EXPO_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
PROJECT_DIR = os.environ.get(
    'FOOD_EXPO_PROJECT_DIR',
    r'C:\Users\User\Documents\Data_Science_Projects\food-expo-attendee-prediction-project',
)
os.chdir(PROJECT_DIR)

# load the cleanData
# os.path.join keeps the relative path valid on non-Windows systems as well.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle(os.path.join('data', 'output', 'cleanData.pkl'))
# These two columns are removed before any modelling step below.
data = data.drop(['10 Digit Card Number', 'show'], axis=1)

##### Split data into train and test

In [2]:
def _scale_columns(frame, scaler, cols):
    """Return ``frame`` with ``cols`` standardised by ``scaler`` and placed first.

    The scaled values are rebuilt with ``frame``'s own index so that the
    column-wise concat lines rows up correctly even when the index is not a
    default RangeIndex.
    """
    scaled = pd.DataFrame(scaler.transform(frame[cols]), columns=cols, index=frame.index)
    return pd.concat([scaled, frame.drop(cols, axis=1)], axis=1)


def preprocess_data(data):
    """Split the data into train/test sets and standardise the numerical features.

    The ``attended`` column is renamed to ``labels`` and used as the target.
    Columns whose maximum (as reported by ``DataFrame.describe``) exceeds 1 are
    treated as numerical/non-categorical and standardised; every other feature
    column is passed through unchanged. The scaler is fitted on the training
    rows only, so no statistics from the test rows leak into training.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned data holding the feature columns and an ``attended`` column.

    Returns
    -------
    X_train : pandas.DataFrame
        Training features (scaled columns first, then the untouched columns).
    x_test : numpy.ndarray
        Test features in the same column order as ``X_train``.
    Y_train : pandas.Series
        Training labels.
    y_test : numpy.ndarray
        Test labels.
    """
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()

    # Columns with max > 1 are considered numerical/non-categorical.
    # NOTE(review): the first such column is skipped, exactly as the original
    # code did; the reason is not visible in this notebook - confirm.
    cols = list(summary[summary['max'] > 1].index[1:])

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis=1)
    dataY = data['labels']

    # Create train and test dataset BEFORE scaling, so the scaler's mean and
    # variance are learned from the training rows only.
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state=0)

    # StandardScaler rejects an input with zero features, so skip scaling when
    # no column qualifies.
    if cols:
        scaler = preprocessing.StandardScaler()
        scaler.fit(X_train[cols])
        X_train = _scale_columns(X_train, scaler, cols)
        x_test = _scale_columns(x_test, scaler, cols)

    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test

# Build the train/test split from the loaded data; the test parts come back as
# NumPy arrays, the training parts as pandas objects.
X_train, x_test, Y_train, y_test = preprocess_data(data)
# Run a garbage-collection pass now that preprocessing has finished.
gc.collect()


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [3]:
def upsample_data(train_data, label):
    """Balance the training set by oversampling with SMOTE.

    Parameters
    ----------
    train_data : array-like
        Training features.
    label : array-like
        Training labels matching ``train_data`` row for row.

    Returns
    -------
    X_train, Y_train
        The resampled features and labels as returned by ``SMOTE.fit_resample``.
    """
    # apply oversampling (SMOTE) since the data is very imbalanced.
    # `sampling_strategy` replaces the `ratio` keyword, which was deprecated in
    # imbalanced-learn 0.4 and later removed; 1.0 asks for a 1:1
    # minority-to-majority ratio after resampling.
    smote = SMOTE(random_state=1, sampling_strategy=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

# Oversample the training split only; x_test / y_test are not passed in and so
# keep their original class balance.
X_train, Y_train = upsample_data(X_train, Y_train)

In [4]:
# Show how many training labels belong to each class after oversampling
# (class 1 first, then class 0).
for class_value in (1, 0):
    print(len(Y_train[Y_train == class_value]))

37543
37543


##### Tune the model by finding the best combination of parameters
This training uses 8-fold cross-validation for evaluation.

In [None]:
# Estimator whose hyper-parameters are tuned below.
model = svm.SVC()

# Only a reduced grid is active for this run: C values 0.001, 0.01 and 0.1 and
# gamma values 0.01, 0.1 and 1 are left out.
parametersGrid = dict(C=[1, 10], gamma=[0.001])

# NOTE(review): X_train was oversampled with SMOTE before this search, so the
# validation folds contain synthetic samples and the cross-validated score is
# likely optimistic - confirm against the untouched test split.
grid = GridSearchCV(
    estimator=model,
    param_grid=parametersGrid,
    cv=8,       # 8-fold cross-validation
    n_jobs=12,  # number of parallel workers
)
grid.fit(X_train, Y_train)

In [None]:
# Report the best parameter combination found by the search, then its
# cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

#### The full parameter grid searched for the best parameters:
    parametersGrid = {
                'C': [0.001, 0.01, 0.1, 1, 10],
                'gamma': [0.001, 0.01, 0.1, 1]
                 }

In [None]:
0.7854327038329383