### Notebook for classsifying using AdaBoost.

This notebook is excpecting the data to be numeric. A method of ensuring this to first use Pre_Processing_USE-4.ipynb.

 Cells are executed in the order in which they appear in the notebook.

In [1]:
# Import libraries

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

import sigopt # comment out if not using SigOpt

In [8]:
# Sigopt stuff, not really needed. If you don't have a Sigopt account, you should get one.
from sigopt import Connection

# put your token here 
api_token = "Put your token here"
 
conn = Connection(client_token=api_token)

Import numerical feature and label data.

In [12]:
# Load dataset
X = pd.read_csv('X_numeric.csv', sep=',', header=None)
y = pd.read_csv('y_numeric.csv', sep=',', header=None)
y = np.ravel(y)

In [13]:
# Check the data in case the first row is the index 
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,14806240000.0,14806240000.0,14806240000.0,16.17853,12.164207,1.753315,-5.300097,1.016841,8.11867,6.819417,...,11.372132,7.036551,-2.55268,-54.15332,32.803783,16.710754,-24.062065,6.590991,-0.72985,6.6551
1,447547000000.0,447547000000.0,447547000000.0,20.004223,11.334753,-2.820019,-7.355494,-1.181901,6.805092,0.625156,...,21.825056,13.261285,-17.120144,4.034127,-10.328978,-12.635922,-22.477568,3.983698,-0.710531,4.778454
2,4536947000.0,4536947000.0,4536947000.0,19.789394,11.128526,-5.51075,-8.971125,4.214148,-21.717592,6.829502,...,-4.366057,6.523605,-5.956306,8.049953,-8.431773,-1.328702,19.821604,2.230384,-2.804008,4.660541
3,38976250000.0,38976250000.0,38976250000.0,19.559141,11.15343,-7.129503,14.20982,4.214219,-21.717434,-10.894494,...,12.437857,30.452587,15.085895,-17.211315,-9.99823,-9.034165,21.270443,-3.827162,-2.322451,4.214811
4,14252740000.0,14252740000.0,14252740000.0,19.156839,11.09323,-4.028421,3.910608,0.271392,7.674269,3.828856,...,12.387394,15.096603,4.221771,-12.505415,6.967276,-8.013271,13.346799,3.252636,1.021878,7.317034


In [14]:
# Check the shape of the labels in case the first row is the index
y.shape

(657,)

In [None]:
# If the index has been added to the first column, run this cell 
X=X.iloc[:,1:]
y=y.iloc[:,1:]

Split to train and test sets. Save them as np.array and make sure they are float. Perform scaling.

In [15]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Save to array and make sure dtype is float or else the scaling might not work.
X_train=np.asarray(X_train).astype(np.float_)
X_test=np.asarray(X_test).astype(np.float_)

y_train=np.asarray(y_train).astype(np.float_)
y_test=np.asarray(y_test).astype(np.float_)

In [17]:
# Scale the features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Perform AdaBoost Classificaiton. In case you want to perform it without SigOpt, skip the next two cells.

In [9]:
# This is sigopt part where you define the parameters and their ranges
def create_model(assignments):
    model = AdaBoostClassifier(
                                n_estimators         = assignments['n_estimators'],
                                learning_rate        = assignments['learning_rate']
                            )
    return model

def evaluate_model(assignments):
    model = create_model(assignments)
    model.fit(X_train, y_train)
    
    return cross_val_score(model, X_train, y_train, cv=5).mean()

experiment = conn.experiments().create(
    
    name="AdaBoost Classifier FB Statuses",
 
    parameters=[
        dict(name="n_estimators", bounds=dict(min=1,max=350), type="int"),
        dict(name="learning_rate", bounds=dict(min=0.0001,max=0.1), type="double")
        ],
 
    metrics=[
        dict(name="accuracy", objective="maximize", strategy="optimize")
        ],
 
    observation_budget = 20,
)
 
print("Explore your experiment: https://app.sigopt.com/experiment/" + experiment.id + "/analysis")

Explore your experiment: https://app.sigopt.com/experiment/515103/analysis


In [None]:
#Optimization Loop, still SigOpt stuff
for _ in range(experiment.observation_budget):
    suggestion = conn.experiments(experiment.id).suggestions().create()
    assignments = suggestion.assignments
    value = evaluate_model(assignments)
    conn.experiments(experiment.id).observations().create(
        suggestion=suggestion.id,
        value=value
    )
    #update experiment object
    experiment = conn.experiments(experiment.id).fetch()
assignments = conn.experiments(experiment.id).best_assignments().fetch().data[0].assignments  
print("BEST ASSIGNMENTS \n", assignments)

In [18]:
# So you haven't signed up for SigOpt yet, but you can still do this.
# Create a classifier with your choice of parameters and train it on the training set.
clas = AdaBoostClassifier(n_estimators=157, learning_rate=0.01, random_state=42)
clas.fit(X_train, y_train)

AdaBoostClassifier(learning_rate=0.01, n_estimators=157, random_state=42)

In [19]:
# Predict the test set and calculate the accuracy
scores = cross_val_score(clas, X_test, y_test)
print(f'AdaBoost got the following accuracy on the test set: {np.mean(scores)}') 

AdaBoost got the following accuracy on the test set: 0.7575498575498576
