In [1]:
import numpy as np
import pandas as np
from time import time
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [28]:
data_path = "dataset/datasets_clean.pickle"
with open(data_path, "rb") as f:
    X_train = pickle.load(f)
    X_test = pickle.load(f)
    y_train = pickle.load(f)
    y_test = pickle.load(f)

# Initial model evaluation

In [3]:
X, X_val, y, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=42)

In [25]:
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    start = time() # Get start time
    X_train_subset = X_train.loc[:sample_size, :].values
    y_train_subset = y_train.ix[:sample_size, 'Opioid.Prescriber']
    learner.fit(X_train_subset, y_train_subset)
    end = time() # Get end time
    
    results['train_time'] = end - start
        
    
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the 
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
        
    # TODO: Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta = .5)
        
    # TODO: Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = .5)
       
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [26]:
X.tail()

Unnamed: 0,Gender_F,Gender_M,State_AA,State_AE,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,...,VENTOLIN.HFA,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE
1054,1,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.990453
12646,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.992725,0.999999,0.999981,0.0,0.0
12425,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.981065
19534,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5983,0,1,0,0,0,0,0,0,0,0,...,0.999995,0.0,0.0,0.0,0.0,0.981709,0.0,0.0,0.0,0.993699


In [27]:
clf_A = AdaBoostClassifier(random_state=0)
clf_B = SVC(random_state=0)
clf_C = RandomForestClassifier(random_state=0)
clf_D = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)


samples_1 = int(.01 * X.shape[0])
samples_10 = int(.1 * X.shape[0])
samples_100 = X.shape[0] - 1


# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C, clf_D]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
        train_predict(clf, samples, X, y.ix[:, 0], X_val, y_val.ix[:, 0] )


IndexingError: Too many indexers

In [8]:
sample_size = 10000
X_train_subset = X.ix[:sample_size, :]
y_train_subset = y.ix[:sample_size, 0]
learner = AdaBoostClassifier(random_state=0)
learner.fit(X_train_subset, y_train_subset)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=0)

In [17]:
y_train_subset = y_train.loc[:sample_size, :]

In [22]:
y_train_subset.shape

(13452, 14)