# Setup

In [1]:
# -*- coding: utf-8 -*-

In [13]:
import os
import sys
import numpy as np
import pandas as pd
import sklearn as sk
import pickle as pkl

---
# Data

### About the dataset

Abalone
+ Sex / nominal / -- / M, F, and I (infant) 
+ Length / continuous / mm / Longest shell measurement 
+ Diameter	/ continuous / mm / perpendicular to length 
+ Height / continuous / mm / with meat in shell 
+ Whole weight / continuous / grams / whole abalone 
+ Shucked weight / continuous	/ grams / weight of meat 
+ Viscera weight / continuous / grams / gut weight (after bleeding) 
+ Shell weight / continuous / grams / after being dried 
+ Rings / integer / -- / +1.5 gives the age in years 

In [7]:
# variable names
names = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
    'rings']

# reading dataset
df = pd.read_csv('data/abalone.data', header=None, names=names)

# building prediction target
df['target'] = df['rings'] >= 10
df = df.drop('rings', axis=1)

In [8]:
# !!!! TEMP
df = df.drop('sex', axis=1)

In [23]:
df.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,target
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,True
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,False
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,True
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,False


---

# Feature Preparation

In [11]:
# seperating target from features
y = np.array(df['target'])
X = df.drop('target', axis=1)

In [14]:
# shuffling and splitting data into training and test sets
SPLIT = 0.33
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=SPLIT, random_state=42)

In [None]:
# one-hot encoding categorical features



In [None]:
# rescaling numerical features



---
# Training

In [19]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100, # number of trees
    n_jobs=-1, # parallelization
    random_state=1337, # random seed
    max_depth=10, # maximum tree depth
    min_samples_leaf=10
)

In [34]:
%time model = clf.fit(X_train, y_train)

CPU times: user 372 ms, sys: 0 ns, total: 372 ms
Wall time: 281 ms


---
# Evaluation

In [29]:
# computing ROC AUC over training set
train_auc = sk.metrics.roc_auc_score(y_train, model.predict(X_train))
print('Training ROC AUC:\t', round(train_auc, 3))

Training ROC AUC:	 0.849


In [31]:
# computing ROC AUC over test set
test_auc = sk.metrics.roc_auc_score(y_test, model.predict(X_test))
print('Test ROC AUC:\t\t', round(test_auc, 3))

Test ROC AUC:		 0.796


---
# Storing Model

In [36]:
pkl.dump(model, open('pickles/model.pkl','wb'))

---
# Loading Model

In [38]:
m = pkl.load(open('pickles/model.pkl','rb'))
m

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False)

All good. Let's build this bad boy into an API now!