In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, Lasso, Ridge, RidgeClassifier, SGDClassifier, SGDRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, precision_recall_fscore_support, f1_score, r2_score 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint as sp_randint, gamma as sp_gamma, expon as sp_expon, uniform as sp_uniform
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
import cPickle as pickle

In [None]:
def open_prepper(file_path):
    """Load and return the pickled DataPrepper stored at ``file_path``.

    The file is opened in binary mode because pickle streams are bytes:
    a text-mode handle fails on Python 3 and can corrupt binary pickle
    protocols on platforms that translate line endings.

    Parameters
    ----------
    file_path : str
        Path to the pickle file.

    Returns
    -------
    object
        The object that was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code; only load files from a trusted
    source.
    """
    with open(file_path, 'rb') as f:
        prepper = pickle.load(f)
    return prepper

# Pickled DataPrepper to load; the path is relative to the notebook's working directory.
file_path = '../data/store/data_prepper_ALL-CATEGORIES.pkl'
prepper = open_prepper(file_path)

# Feature/target pairs for the train and test splits, as returned by the prepper.
# Later cells index the targets by column name (e.g. 'image_views').
X_train, y_train = prepper.return_training_data()
X_test, y_test = prepper.return_testing_data()

In [None]:
def _add_log_image_views(targets):
    """Add a 'log_image_views' column to ``targets``, derived from 'image_views'.

    The new column is the natural log of the view count, floored at 0:
    a count of 0 (log = -inf) or any value whose log is negative maps to 0,
    and NaN stays NaN. ``targets`` is modified in place.
    """
    # log(0) = -inf is expected here and is clamped to 0 on the next line,
    # so only the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        logged = np.log(targets['image_views'])
    targets['log_image_views'] = logged.clip(lower=0)


# Apply the same transform to both splits so train and test targets stay comparable.
_add_log_image_views(y_train)
_add_log_image_views(y_test)

In [None]:
# Distribution of the log-transformed training target
# (use the 'image_views' column instead to inspect the raw counts).
fig, ax = plt.subplots()
ax.hist(y_train['log_image_views'], bins=100)
plt.show()

In [None]:
# Show the first rows of the training targets to sanity-check the 'log_image_views' column.
y_train.head()

### RANDOM FOREST REGRESSION

In [None]:
# Random-forest regressor for the log view counts.
# NOTE: random_state is None, so results are not reproducible between runs.
model_RF_regression = RandomForestRegressor(
    # Ensemble size and split criterion.
    n_estimators=500,
    criterion='mse',
    # Tree growth: unrestricted depth, smallest possible splits and leaves.
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    # Bootstrap sampling with an out-of-bag score computed during fit.
    bootstrap=True,
    oob_score=True,
    # Runtime settings (n_jobs=30 is a fixed worker count for this machine).
    n_jobs=30,
    random_state=None,
    verbose=1,
    warm_start=False,
)

In [None]:
# Train the forest on the log-transformed view counts.
model_RF_regression.fit(X_train, y_train['log_image_views'])

In [None]:
# Evaluate on the test split; for scikit-learn regressors `score` is the R^2 of the predictions.
model_RF_regression.score(X_test, y_test['log_image_views'])

In [None]:
# Importance scores the fitted forest assigns to the input features.
model_RF_regression.feature_importances_

In [None]:
# Persist the fitted forest. The file is opened in binary mode ('wb') because
# pickle writes bytes; text mode fails on Python 3 and can corrupt binary protocols.
with open('./regression_model_RF_500.pkl', 'wb') as f:
    pickle.dump(model_RF_regression, f)

Score (R² on the test set):  
0.53 with 100 trees  
0.54 with 500 trees

### GBR

In [None]:
# Gradient-boosted regressor for the log view counts: 300 shallow trees with a
# small learning rate, each fitted on a 60% subsample of the training rows.
# NOTE: random_state is None, so the subsampling differs between runs.
model_GB_regression = GradientBoostingRegressor(
    # Loss and boosting schedule.
    loss='ls',
    learning_rate=0.05,
    n_estimators=300,
    subsample=0.6,
    # Shape of each individual tree.
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    # Initial estimator, seeding and feature sampling.
    init=None,
    random_state=None,
    max_features='auto',
    alpha=0.9,
    # Logging and remaining tree/runtime options.
    verbose=2,
    max_leaf_nodes=None,
    warm_start=False,
    presort='auto',
)

In [None]:
# Train the boosted ensemble on the log-transformed view counts.
model_GB_regression.fit(X_train, y_train['log_image_views'])

In [None]:
# Evaluate on the test split; for scikit-learn regressors `score` is the R^2 of the predictions.
model_GB_regression.score(X_test, y_test['log_image_views'])

In [None]:
# Test-set R^2 after each boosting stage, to see how the fit evolves as trees
# are added. staged_predict yields one prediction array per stage, so no
# explicit stage counter is needed.
r2_scores = [
    r2_score(y_test['log_image_views'], staged_pred)
    for staged_pred in model_GB_regression.staged_predict(X_test)
]
num_estimators = len(r2_scores)

# Plot against the 1-based stage number and label the axes so the figure stands alone.
plt.plot(range(1, num_estimators + 1), r2_scores, 'b')
plt.ylim((0, 1.0))
plt.xlabel('Number of boosting stages')
plt.ylabel('R^2 on test set')
plt.show()

In [None]:
# Persist the fitted gradient-boosting model. The file is opened in binary mode
# ('wb') because pickle writes bytes; text mode fails on Python 3 and can
# corrupt binary protocols.
with open('./regression_model_GBR.pkl', 'wb') as f:
    pickle.dump(model_GB_regression, f)

Score: was ~0.3

### ADABOOST

In [None]:
# AdaBoost regressor for the log view counts, using the library's default
# base estimator (base_estimator=None) and 100 boosting rounds.
# NOTE: random_state is None, so results are not reproducible between runs.
model_ada_regression = AdaBoostRegressor(
    base_estimator=None,
    n_estimators=100,
    learning_rate=1.0,
    loss='linear',
    random_state=None,
)

In [None]:
# Train the AdaBoost ensemble on the log-transformed view counts.
model_ada_regression.fit(X_train, y_train['log_image_views'])

In [None]:
# Evaluate on the test split; for scikit-learn regressors `score` is the R^2 of the predictions.
model_ada_regression.score(X_test, y_test['log_image_views'])

In [None]:
# Persist the fitted AdaBoost model. This cell previously pickled
# model_GB_regression into the AdaBoost file, so the saved artifact was the
# wrong estimator. Binary mode ('wb') is used because pickle writes bytes.
with open('./regression_model_adaboost_100.pkl', 'wb') as f:
    pickle.dump(model_ada_regression, f)

### SGD REGRESSION

In [None]:
# Linear regressor trained by stochastic gradient descent with squared loss
# and an L1 penalty, for the log view counts.
# NOTE(review): SGD is sensitive to feature scaling - presumably X_train is
# already standardised by the prepper; confirm.
# NOTE: random_state is None, so the shuffling differs between runs.
model_sgd_regression = SGDRegressor(
    # Objective: loss plus regularisation.
    loss='squared_loss',
    penalty='l1',
    alpha=0.0001,
    l1_ratio=0.05,
    fit_intercept=True,
    # Passes over the data and per-epoch shuffling.
    n_iter=20,
    shuffle=True,
    verbose=1,
    epsilon=0.1,
    random_state=None,
    # Learning-rate schedule.
    learning_rate='invscaling',
    eta0=0.01,
    power_t=0.25,
    warm_start=False,
    average=False,
)

In [None]:
# Train the SGD regressor on the log-transformed view counts.
model_sgd_regression.fit(X_train, y_train['log_image_views'])

In [None]:
# Score on the held-out test split, as is done for every other model in this
# notebook; scoring on the training data would overstate performance and make
# the number incomparable with the other models' R^2 values.
model_sgd_regression.score(X_test, y_test['log_image_views'])

### SVR

In [None]:
# Support-vector regressor with an RBF kernel for the log view counts.
model_svr_regression = SVR(
    # Kernel choice and kernel parameters.
    kernel='rbf',
    degree=3,
    gamma='auto',
    coef0=0.0,
    # Optimisation tolerance, regularisation strength and epsilon-tube width.
    tol=0.001,
    C=1.0,
    epsilon=0.1,
    # Solver/runtime options (max_iter=-1 places no limit on iterations).
    shrinking=True,
    cache_size=200,
    verbose=True,
    max_iter=-1,
)

In [None]:
# Train the SVR on the log-transformed view counts.
model_svr_regression.fit(X_train, y_train['log_image_views'])

In [None]:
# Evaluate on the test split; for scikit-learn regressors `score` is the R^2 of the predictions.
model_svr_regression.score(X_test, y_test['log_image_views'])