In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, Lasso, Ridge, RidgeClassifier, SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, precision_recall_fscore_support, f1_score, r2_score 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint as sp_randint, gamma as sp_gamma, expon as sp_expon, uniform as sp_uniform
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
import cPickle as pickle

In [3]:
def open_prepper(file_path):
    """Open the DataPrepper from pickled file.

    Parameters
    ----------
    file_path : str
        Path to the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the pickle.

    Notes
    -----
    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file. Only call this on pickle files you created or fully trust.
    """
    # Pickles are binary data: open in 'rb' so the bytes reach the unpickler
    # untouched (text mode can translate newlines / fail to decode).
    with open(file_path, 'rb') as f:
        prepper = pickle.load(f)
    return prepper

In [4]:
# Path of the pickled DataPrepper, relative to the notebook's working directory.
file_path = '../data/store/data_prepper_ALL-CATEGORIES.pkl'
# Unpickle it once; the cells below pull the train/test splits from this object.
prepper = open_prepper(file_path)

In [5]:
# Fetch the training and testing splits from the prepper. Each call returns a
# (features, targets) pair; the targets are indexed by column name further
# down (e.g. y_train['image_views_quantized']).
X_train, y_train = prepper.return_training_data()
X_test, y_test = prepper.return_testing_data()




### IMAGE VIEW QUANTILES

In [6]:
# Search spaces for RandomizedSearchCV, keyed by a short model name.
# Lists are sampled uniformly; scipy frozen distributions are sampled via rvs().
#   sp_expon(loc=a, scale=s)   -> shifted exponential, values >= a, mean a + s
#   sp_uniform(loc=a, scale=w) -> uniform on [a, a + w]
#   sp_randint(lo, hi)         -> integers lo .. hi - 1 (upper bound exclusive)
param_distributions = {'Logistic': {"C": sp_expon(loc=0.001, scale=1),
                                    "fit_intercept": [True, False],
                                    "intercept_scaling": sp_randint(1, 5),
                                    "warm_start": [False, True]
                                    },
                       'RandomForest': {"max_depth": [None],
                                        "max_features": ['auto', None],
                                        # A split needs at least 2 samples: scikit-learn >= 0.18
                                        # rejects an integer min_samples_split of 1 (and older
                                        # releases effectively treated 1 as 2), so start at 2.
                                        "min_samples_split": sp_randint(2, 201),
                                        "min_samples_leaf": sp_randint(1, 201),
                                        "criterion": ["gini", "entropy"],
                                        "oob_score": [True],
                                        "warm_start": [False, True]
                                        },
                       'AdaBoost_DT': {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                                       "algorithm" : ['SAMME.R', 'SAMME']
                                       },
                       'GBC': {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                               # Row subsampling fraction per stage, uniform on [0.2, 1.0].
                               "subsample": sp_uniform(loc=0.2, scale=0.8),
                               "max_features": [None, 'auto'],
                               "warm_start": [True, False],
                               "max_depth": [3, 4, 5],
                               },
                       'SVC': {"C": sp_expon(loc=0.001, scale=2),
                               "kernel": ['rbf', 'poly'],
                               # Polynomial degree 2..9.
                               "degree": sp_randint(2, 10),
                               "coef0": [0, 1, 2],
                               "shrinking": [True, False]
                               }
                       }

In [7]:
# Base estimators for the image-view-quantile models. Each one is a template
# whose remaining hyper-parameters are meant to be tuned by a search.

# Shallow tree used as the weak learner inside AdaBoost.
DT = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=4,
    max_leaf_nodes=20,
    max_features=300,
    min_samples_split=10,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    class_weight=None,
    presort=False,
    random_state=30,
)

# L2-regularised one-vs-rest logistic regression (liblinear solver).
model_ivq_LogitClassifier = LogisticRegression(
    penalty='l2',
    dual=False,
    solver='liblinear',
    multi_class='ovr',
    tol=0.0001,
    max_iter=1000,
    class_weight=None,
    random_state=25,
    verbose=2,
    n_jobs=36,
)

# Bootstrapped forest of 1000 trees.
model_ivq_RandomForest = RandomForestClassifier(
    n_estimators=1000,
    bootstrap=True,
    min_weight_fraction_leaf=0.0,
    class_weight=None,
    random_state=42,
    verbose=2,
    n_jobs=36,
)

# AdaBoost over 300 copies of the shallow tree defined above.
model_ivq_AdaBoost_DT = AdaBoostClassifier(
    base_estimator=DT,
    n_estimators=300,
    random_state=12,
)

# Gradient boosting with 100 stages; trees are capped by leaf count.
model_ivq_GBC = GradientBoostingClassifier(
    loss='deviance',
    n_estimators=100,
    max_leaf_nodes=12,
    min_samples_split=10,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    presort='auto',
    random_state=21,
    verbose=0,
)

# Support-vector classifier with probability estimates enabled.
model_ivq_SVC = SVC(
    gamma='auto',
    probability=True,
    decision_function_shape='ovr',
    tol=0.001,
    max_iter=-1,
    cache_size=1000,
    class_weight=None,
    random_state=1,
    verbose=True,
)


In [None]:
# Randomized hyper-parameter search for the gradient-boosting model:
# 20 sampled candidates, each evaluated with 4-fold cross-validation.
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by
# the estimator's default score rather than the weighted F1 reported later in
# this notebook -- confirm this is intended.
n_iter_search = 20
random_search_GBC = RandomizedSearchCV(
    estimator=model_ivq_GBC,
    param_distributions=param_distributions['GBC'],
    n_iter=n_iter_search,
    cv=4,
    n_jobs=36,
    random_state=30,
    verbose=1,
    error_score='raise',  # abort the whole search if any single fit fails
)
random_search_GBC.fit(X_train, y_train['image_views_quantized'])

Fitting 4 folds for each of 20 candidates, totalling 80 fits


In [None]:
# Display the best cross-validation score found by the randomized search
# (presumably the estimator's default score, since no `scoring` was given -- confirm).
random_search_GBC.best_score_

In [None]:
# Evaluate the search winner on the held-out test set using weighted F1.
best_GBC = random_search_GBC.best_estimator_
y_pred = best_GBC.predict(X_test)
# The label names the model actually being scored (GBC, not AdaBoost).
# Single-argument print(...) parses identically on Python 2 and Python 3.
print("Best GBC F1 Score: %s" % f1_score(y_test['image_views_quantized'], y_pred, labels=None,
                                          pos_label=None, average='weighted', sample_weight=None))

In [None]:
# Re-train the winning GBC configuration with a much larger ensemble
# (1000 stages) and per-stage progress logging.
new_params = dict(best_GBC.get_params(), n_estimators=1000, verbose=2)
new_model_ivq_GBC = GradientBoostingClassifier(**new_params)
new_model_ivq_GBC.fit(X_train, y_train['image_views_quantized'])

In [None]:
# Display the hyper-parameters of the enlarged GBC model as a sanity check.
new_model_ivq_GBC.get_params()

In [None]:
# Weighted F1 on the test set after each boosting stage, to see how the score
# evolves as estimators are added.
num_estimators = new_model_ivq_GBC.get_params()['n_estimators']
# staged_predict yields one prediction array per stage; a dedicated loop name
# is used so the earlier cell's `y_pred` (best-model predictions) is not
# overwritten.
f1_scores = [
    f1_score(y_test['image_views_quantized'], staged_pred, labels=None,
             pos_label=None, average='weighted', sample_weight=None)
    for staged_pred in new_model_ivq_GBC.staged_predict(X_test)
]

# Plot against 1-based stage numbers so x reads as "number of estimators used".
stages = range(1, len(f1_scores) + 1)
plt.plot(stages, f1_scores, 'b')
plt.xlabel('Number of boosting stages')
plt.ylabel('Weighted F1 (test set)')
plt.title('GBC staged test F1 (n_estimators=%d)' % num_estimators)
plt.show()