In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 10
%autosave 15

import os
import pickle
import sklearn as sk
import scipy as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

Autosaving every 15 seconds


In [2]:
def save_model(model, path):
    """Pickle the best result of a fitted hyper-parameter search.

    The list ``[best_estimator, best_params, best_mean, best_std]`` is written
    to a file inside ``path`` named after the best estimator's class.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_estimator_``, ``best_params_``, ``best_score_`` and
        ``grid_scores_`` (an iterable of ``(params, mean, fold_scores)``).
    path : str
        Directory that receives the pickle file.
    """
    best_model = model.best_estimator_
    best_params = model.best_params_
    best_mean = model.best_score_
    best_std = 0

    # Find the fold scores of the candidate whose mean equals the best score
    # (within a float tolerance); best_std stays 0 if nothing matches.
    for _, mean, cv_means in model.grid_scores_:
        if abs(mean - best_mean) <= 1e-10:
            best_std = np.std(cv_means)

    out_path = os.path.join(path, best_model.__class__.__name__)
    # Context manager guarantees the file is flushed and closed, even when
    # pickling raises.
    with open(out_path, 'wb') as out_file:
        pickle.dump([best_model, best_params, best_mean, best_std], out_file)

def load_model(path):
    """Load and return the object pickled in the file at ``path``.

    Parameters
    ----------
    path : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

In [3]:
# Input file: space-separated values without a header row.
DATA = "./gist.csv"
# Column that is split off and used as the prediction target.
TARGET_COLUMN = 960

gist = pd.read_csv(DATA, sep=' ', header=None)

# Target values as a NumPy array.
target = np.array(gist[TARGET_COLUMN])

# Every remaining column is rescaled by MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split happens further down - confirm this is acceptable for the evaluation.
features = gist.drop(TARGET_COLUMN, axis=1)
data = MinMaxScaler().fit_transform(features)

In [4]:
# Hold out 10% of the rows as a test set; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=23)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Quick baselines: fit each model, then print its train score followed by its
# test score. The forest is seeded so the printed scores are reproducible
# across kernel restarts. `model` ends up bound to the last fitted estimator.
for model in (LogisticRegression(C=40),
              RandomForestClassifier(n_estimators=250, random_state=23)):
    model.fit(X_train, y_train)

    print(model.score(X_train, y_train))
    print(model.score(X_test, y_test))

(897, 960) (100, 960) (897,) (100,)
0.688963210702
0.77
1.0
0.9


In [5]:
# Models and their Parameters
#
# Each dict maps a hyper-parameter name to either a list of choices or a
# scipy.stats distribution to sample from. Note that randint(low, high)
# excludes `high`.

lr_params = {'C': randint(1, 100),
             'penalty': ['l1', 'l2']}

knn_params = {"n_neighbors": randint(3, 16),
              "p": randint(1, 3),
              "algorithm": ['kd_tree', 'ball_tree']}

# min_samples_split starts at 2: an integer value of 1 cannot split a node and
# is rejected by scikit-learn (0.18 and later).
dt_params = {"max_depth": randint(1, 11),
             "min_samples_split": randint(2, 11),
             "min_samples_leaf": randint(1, 11),
             "max_features": [None, 'log2', 'sqrt'],
             "criterion": ["gini", "entropy"]}

svm_params = {"C": randint(1, 101),
              "kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
              "degree": randint(1, 6),
              "shrinking": [True, False]}

rf_params = {"n_estimators": randint(50, 501),
             "max_depth": randint(1, 11),
             "min_samples_split": randint(2, 11),
             "min_samples_leaf": randint(1, 11),
             "max_features": [None, 'log2', 'sqrt'],
             "bootstrap": [True, False],
             "criterion": ["gini", "entropy"]}

# No "criterion" entry here: "gini"/"entropy" are classification-tree criteria
# and are not valid values for GradientBoostingClassifier, so the estimator's
# default is kept.
gbm_params = {"n_estimators": randint(50, 501),
              "max_depth": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "min_samples_leaf": randint(1, 11),
              "max_features": [None, 'log2', 'sqrt'],
              "subsample": [0.7, 0.8, 0.9, 1.0]}


# Each default-constructed estimator is paired with its search space; `models`
# and `params` are derived from the same pairs so they stay index-aligned.
search_plan = [
    (LogisticRegression(), lr_params),
    (KNeighborsClassifier(), knn_params),
    (DecisionTreeClassifier(), dt_params),
    (SVC(), svm_params),
    (RandomForestClassifier(), rf_params),
    (GradientBoostingClassifier(), gbm_params),
]

models = [estimator for estimator, _ in search_plan]

params = [space for _, space in search_plan]

In [None]:
# Run a 500-candidate, 10-fold randomized search for every (model, space) pair
# and pickle the best result of each search into the current directory.
for model, parameters in zip(models, params):
    # random_state seeds the sampling of candidates from the distributions so
    # the same candidates are drawn on every run. Estimators with their own
    # internal randomness are not seeded by this argument.
    clf = RandomizedSearchCV(model, param_distributions=parameters, n_iter=500, cv=10,
                             verbose=2, random_state=23)
    # `target` is already a NumPy array, so no extra copy is made here.
    clf.fit(data, target)

    save_model(clf, ".")

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
[CV] penalty=l2, C=32 ................................................
[CV] ....................................... penalty=l2, C=32 -   2.1s
[CV] penalty=l2, C=32 ................................................
[CV] ....................................... penalty=l2, C=32 -   1.5s
[CV] penalty=l2, C=32 ................................................
[CV] ....................................... penalty=l2, C=32 -   1.5s

In [195]:
# model, _, mean, std = load_model('./RandomForestClassifier')