In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, Lasso, Ridge, RidgeClassifier, SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, precision_recall_fscore_support, f1_score, r2_score 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint as sp_randint, gamma as sp_gamma, expon as sp_expon, uniform as sp_uniform
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

In [2]:
df_features = pd.read_csv('../data/modeling/SPORTS/feature_data_SPORTS_21205.csv', sep='|')
df_targets = pd.read_csv('../data/modeling/SPORTS/target_data_SPORTS_21205.csv', sep='|')

df_features = df_features.set_index('owner').set_index("id", append=True)

df_targets = df_targets.set_index('owner').set_index("id", append=True)

df_targets = df_targets.drop('image_tags', axis=1)

target_columns = list(df_targets.columns)
target_columns.remove('image_ntags')
target_columns

df = df_features.join(df_targets, how='inner')

df.dropna(axis=0, inplace=True)

In [3]:
def name_quantile(x, limits):
    quantile_cats = range(1, len(limits)+1, 1)
    for cat, limit in zip(quantile_cats, limits):
        if x <= limit:
            return cat

def create_quantile_target_col(df_train, df_test, target_columns, col_name, n_quantiles=5):
    """
    Create a new column in both DataFrames that bins a target column into categories.
    
    PARAMETERS
    ----------
    df_train : pandas.DataFrame
        The training set data.
        
    df_test : pandas.DataFrame
        The testint set data.
    
    n_quantiles : int
        The number of bins. For 4 bins (0 to 0.25, 0.25 to 0.5, etc...), n_quantiles=4.
        
    col_name : str
    
    target_columns : list
    
    RETURNS
    -------
    df : DataFrame
    
    target_columns : list        
    """
    min_value = df_train[col_name].min()
    max_value = df_train[col_name].max()
    limits = []
    for i in range(1, n_quantiles+1):
        limits.append(df_train[col_name].quantile(i/float(n_quantiles)))
    
    new_col_name = col_name+"_quantile"
    target_columns.append(new_col_name)
    
    df_train.loc[:, new_col_name] = df_train[col_name].apply(lambda x: name_quantile(x, limits))
    df_test.loc[:, new_col_name] = df_test[col_name].apply(lambda x: name_quantile(x, limits))
    return df_train, df_test, target_columns

def pop_columns(df, col_names):
    for i, name in enumerate(list(col_names)):
        if i == 0:
            df_dropped_cols = df.pop(name)
        else:
            df_dropped_cols = pd.concat((df_dropped_cols, df.pop(name)), axis=1)
    return df, df_dropped_cols

In [4]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [5]:
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_views', 4)
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_ncomments', 4)
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_nfavs', 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
X_train, y_train = pop_columns(df_train, target_columns)
X_test, y_test = pop_columns(df_test, target_columns)

In [7]:
X_columns = X_train.columns
y_columns = y_train.columns

In [8]:
scaler_mean_std = StandardScaler()
X_train = scaler_mean_std.fit_transform(X_train)
X_test = scaler_mean_std.transform(X_test)

In [9]:
X_train = pd.DataFrame(data=X_train, columns=X_columns)
X_test = pd.DataFrame(data=X_test, columns=X_columns)

In [10]:
y_train.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_is_pro,user_can_buy_pro,user_total_views,image_ncomments,image_nfavs,image_nsets,image_npools,image_views,image_views_quantile,image_ncomments_quantile,image_nfavs_quantile
owner,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12385792@N00,11378305226,1,0,7760,0,0,1,0,6,1,1,1


### GRID SEARCH - Image Views Quantile (IVQ)

#### Set Search Parameters

In [40]:
param_distributions = {'Logistic': {"C": sp_expon(loc=0.001, scale=1),
                                    "fit_intercept": [True, False],
                                    "intercept_scaling": sp_randint(1, 5),
                                    "warm_start": [False, True]
                                    },
                       'RandomForest': {"max_depth": sp_randint(2, 7),
                                        "max_features": ['auto', None],
                                        "min_samples_split": sp_randint(1, 201),
                                        "min_samples_leaf": sp_randint(1, 201),
                                        "criterion": ["gini", "entropy"],
                                        "oob_score": True,
                                        "warm_start": [False, True] 
                                        },
                       'AdaBoost_DT': {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                                       "algorithm" : ['SAMME.R', 'SAMME']
                                       },
                       'GBC': {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                               "subsample": sp_uniform(loc=0.2, scale=1.0),
                               "max_features": [None, 'auto'],
                               "warm_start": [True, False]
                               },
                       'SVC': {"C": sp_expon(loc=0.001, scale=2),
                               "kernel": ['rbf', 'poly'],
                               "degree": sp_randint(2, 10),
                               "coef0": [0, 1, 2],
                               "shrinking": [True, False]
                               }
                       }

#### Build Models

In [41]:
DT = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=4, min_samples_split=10, min_samples_leaf=10,
      min_weight_fraction_leaf=0.0, max_features=300, random_state=30, max_leaf_nodes=20, class_weight=None,
      presort=False)

model_ivq_LogitClassifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                               class_weight=None, random_state=25,
                                               solver='liblinear', max_iter=10, multi_class='ovr', verbose=1,
                                               n_jobs=36)

model_ivq_RandomForest = RandomForestClassifier(n_estimators=300, min_weight_fraction_leaf=0.0, n_jobs=36,
                                                random_state=42, verbose=1, class_weight=None, bootstrap=True)

model_ivq_AdaBoost_DT = AdaBoostClassifier(base_estimator=DT, n_estimators=50, random_state=12)

model_ivq_GBC = GradientBoostingClassifier(loss='deviance', n_estimators=50,
                                           min_samples_split=10, min_samples_leaf=10, min_weight_fraction_leaf=0.0,
                                           max_depth=3, random_state=21, verbose=2,
                                           max_leaf_nodes=12, presort='auto')

model_ivq_SVC = SVC(gamma='auto', probability=True,
                    tol=0.001, cache_size=1000, class_weight=None, verbose=True, max_iter=20,
                    decision_function_shape='ovr', random_state=1)


#### Logistic Random Search CV

In [14]:
n_iter_search = 2
random_search_LogitClassifier = RandomizedSearchCV(estimator=model_ivq_LogitClassifier,
                                                   param_distributions=param_distributions['Logistic'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=2, verbose=2, random_state=30, error_score='raise')
random_search_LogitClassifier.fit(X_train, y_train['image_views_quantile'])

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False 
[CV] warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False 
[CV] warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False 
[CV] warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False 
[CV] warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False 
[CV] warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False 
[CV] warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False 
[CV] warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False 
[CV] warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False 
[CV] warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False 




[LibLinear][CV]  warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False - 1.0min




[LibLinear][CV]  warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False - 1.1min




[LibLinear][CV]  warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False - 1.2min




[LibLinear][CV]  warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False - 1.2min




[LibLinear][CV]  warm_start=True, C=3.21002324315, intercept_scaling=2, fit_intercept=False - 1.2min




[LibLinear][CV]  warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False - 1.3min




[LibLinear][CV]  warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False - 1.3min




[LibLinear][CV]  warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False - 1.3min




[LibLinear][CV]  warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False - 1.4min




[LibLinear][CV]  warm_start=True, C=0.909077971371, intercept_scaling=3, fit_intercept=False - 1.5min


[Parallel(n_jobs=36)]: Done  10 out of  10 | elapsed:  1.5min finished


[LibLinear]



RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=36,
          penalty='l2', random_state=25, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False),
          fit_params={}, iid=True, n_iter=2, n_jobs=36,
          param_distributions={'warm_start': [False, True], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff47109da10>, 'intercept_scaling': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff47109dc10>, 'fit_intercept': [True, False]},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          scoring=None, verbose=2)

In [15]:
random_search_LogitClassifier.grid_scores_

[mean: 0.51515, std: 0.00740, params: {'warm_start': True, 'C': 0.9090779713708561, 'intercept_scaling': 3, 'fit_intercept': False},
 mean: 0.51368, std: 0.00800, params: {'warm_start': True, 'C': 3.210023243149706, 'intercept_scaling': 2, 'fit_intercept': False}]

In [16]:
random_search_LogitClassifier.best_score_

0.51514972883753829

In [17]:
best_LogitClassifier = random_search_LogitClassifier.best_estimator_
y_pred = best_LogitClassifier.predict(X_test)
print "Best Logit Classifier F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

Best Logit Classifier F1 Score:  0.514739874928


#### Random Forest Random Search CV

In [23]:
n_iter_search = 2
random_search_RandomForest = RandomizedSearchCV(estimator=model_ivq_RandomForest,
                                                param_distributions=param_distributions['RandomForest'],
                                                n_iter=n_iter_search,
                                                n_jobs=36, cv=2, verbose=2, random_state=30, error_score='raise')
random_search_RandomForest.fit(X_train, y_train['image_views_quantile'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] warm_start=True, oob_score=True, min_samples_leaf=131, max_features=None, criterion=entropy, min_samples_split=192, max_depth=3 
[CV] warm_start=True, oob_score=True, min_samples_leaf=131, max_features=None, criterion=entropy, min_samples_split=192, max_depth=3 
[CV] warm_start=False, oob_score=True, min_samples_leaf=120, max_features=auto, criterion=gini, min_samples_split=190, max_depth=5 
[CV] warm_start=False, oob_score=True, min_samples_leaf=120, max_features=auto, criterion=gini, min_samples_split=190, max_depth=5 


[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    1.9s
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    2.2s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    5.4s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    5.6s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:   10.2s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:   10.5s
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:   13.8s finished
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:   14.0s finished
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.2s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.5s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    1.0s
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.2s


[CV]  warm_start=False, oob_score=True, min_samples_leaf=120, max_features=auto, criterion=gini, min_samples_split=190, max_depth=5 -  48.6s


[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.5s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.9s
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:    1.2s finished


[CV]  warm_start=False, oob_score=True, min_samples_leaf=120, max_features=auto, criterion=gini, min_samples_split=190, max_depth=5 -  49.4s


[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:  1.2min
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:  1.3min
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:  3.2min
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:  3.2min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:  5.9min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:  6.0min
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:  7.9min finished
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:  7.9min finished
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.1s
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.1s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.3s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.3s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.5s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.5s
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:    0.7s finished


[CV]  warm_start=True, oob_score=True, min_samples_leaf=131, max_features=None, criterion=entropy, min_samples_split=192, max_depth=3 - 8.2min


[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:    0.7s finished


[CV]  warm_start=True, oob_score=True, min_samples_leaf=131, max_features=None, criterion=entropy, min_samples_split=192, max_depth=3 - 8.2min


[Parallel(n_jobs=36)]: Done   4 out of   4 | elapsed:  8.2min finished
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:  1.3min
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:  3.5min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:  6.7min
[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:  9.0min finished


RandomizedSearchCV(cv=2, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=36,
            oob_score=False, random_state=42, verbose=1, warm_start=False),
          fit_params={}, iid=True, n_iter=2, n_jobs=36,
          param_distributions={'warm_start': [False, True], 'oob_score': [False, True], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff4599082d0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff459908310>, 'criterion': ['gini', 'entropy'], 'max_features': ['auto', None], 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff4598fd3d0>},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          scoring=None, verbose=2)

In [24]:
random_search_RandomForest.best_score_

0.52699834944588542

In [25]:
best_RandomForest = random_search_RandomForest.best_estimator_
y_pred = best_RandomForest.predict(X_test)
print "Best Random Forest Classifier F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.1s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.3s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.5s


Best Random Forest Classifier F1 Score:  0.498109252463


[Parallel(n_jobs=36)]: Done 1000 out of 1000 | elapsed:    0.8s finished


#### AdaBoost DT Random Search CV

In [31]:
n_iter_search = 2
random_search_AdaBoost_DT = RandomizedSearchCV(estimator=model_ivq_AdaBoost_DT,
                                                   param_distributions=param_distributions['AdaBoost_DT'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=3, verbose=2, random_state=30, error_score='raise')
random_search_AdaBoost_DT.fit(X_train, y_train['image_views_quantile'])

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=36)]: Done   6 out of   6 | elapsed:  1.6min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=300, max_leaf_nodes=20, min_samples_leaf=10,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=30, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=12),
          fit_params={}, iid=True, n_iter=2, n_jobs=36,
          param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff4a92abc10>, 'algorithm': ['SAMME.R', 'SAMME']},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          scoring=None, verbose=1)

In [32]:
random_search_AdaBoost_DT.best_score_

0.59643951898137226

In [33]:
best_AdaBoost_DT = random_search_AdaBoost_DT.best_estimator_
y_pred = best_AdaBoost_DT.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

Best AdaBoost DT F1 Score:  0.607472223011


#### Gradient Boost Classifier Random Search CV

In [34]:
n_iter_search = 2
random_search_GBC = RandomizedSearchCV(estimator=model_ivq_GBC,
                                                   param_distributions=param_distributions['GBC'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=3, verbose=2, random_state=30, error_score='raise')
random_search_GBC.fit(X_train, y_train['image_views_quantile'])

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_features=auto, subsample=0.352022639922, learning_rate=0.223350669556, warm_start=False 
[CV] max_features=auto, subsample=0.352022639922, learning_rate=0.223350669556, warm_start=False 
[CV] max_features=auto, subsample=0.352022639922, learning_rate=0.223350669556, warm_start=False 
[CV] max_features=auto, subsample=0.652295356862, learning_rate=0.534991644241, warm_start=False 
[CV] max_features=auto, subsample=0.652295356862, learning_rate=0.534991644241, warm_start=False 
      Iter       Train Loss      OOB Improve   Remaining Time 
[CV] max_features=auto, subsample=0.652295356862, learning_rate=0.534991644241, warm_start=False 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB I

[Parallel(n_jobs=36)]: Done   6 out of   6 | elapsed:  3.9min finished


      Iter       Train Loss      OOB Improve   Remaining Time 
         1       11424.1431        2044.3780            7.08m
         2       10101.0188         632.5692            6.87m
         3        9521.5385         287.6866            6.53m
         4        9010.2908         188.1259            6.33m
         5        8708.5143         103.8759            6.08m
         6        8370.6366          97.6784            5.90m
         7        8122.2043          78.9813            5.71m
         8        7957.9672          40.5434            5.50m
         9        7742.5850          50.2229            5.33m
        10        7588.2933          28.1474            5.17m
        11        7484.5944           4.2658            5.03m
        12        7317.2439          11.6396            4.89m
        13        7172.2904           2.6672            4.73m
        14        7079.1946          -0.1389            4.58m
        15        6994.2927           7.2261            4.41m
       

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=12,
              min_samples_leaf=10, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=21, subsample=1.0, verbose=2,
              warm_start=False),
          fit_params={}, iid=True, n_iter=2, n_jobs=36,
          param_distributions={'max_features': [None, 'auto'], 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff47109a990>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff4598cdfd0>, 'warm_start': [True, False]},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          scoring=None, verbose=2)

In [37]:
random_search_GBC.best_estimator_

GradientBoostingClassifier(init=None, learning_rate=0.534991644241,
              loss='deviance', max_depth=3, max_features='auto',
              max_leaf_nodes=12, min_samples_leaf=10, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=21, subsample=0.652295356862,
              verbose=2, warm_start=False)

In [35]:
random_search_GBC.best_score_

0.69594435274699362

In [36]:
best_GBC = random_search_GBC.best_estimator_
y_pred = best_GBC.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

Best AdaBoost DT F1 Score:  0.715118653295


#### SVC Random Search CV

In [42]:
n_iter_search = 2
random_search_SVC = RandomizedSearchCV(estimator=model_ivq_SVC,
                                                   param_distributions=param_distributions['SVC'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=3, verbose=2, random_state=30, error_score='raise')
random_search_SVC.fit(X_train, y_train['image_views_quantile'])

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 
[CV] kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 
[CV] kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 
[CV] kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 ..
[CV] kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 ..
[CV] kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 ..




[LibSVM][CV]  kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 -  11.8s
[LibSVM][CV]  kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 -  12.0s
[LibSVM][CV]  kernel=poly, C=0.287845399018, shrinking=False, degree=5, coef0=1 -  12.3s




[LibSVM][CV]  kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 -  13.7s
[LibSVM][CV]  kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 -  13.8s
[LibSVM][CV]  kernel=rbf, C=1.83013321348, shrinking=True, degree=9, coef0=1 -  14.0s


[Parallel(n_jobs=36)]: Done   6 out of   6 | elapsed:   15.8s finished


[LibSVM]



RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=1000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=20, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=True),
          fit_params={}, iid=True, n_iter=2, n_jobs=36,
          param_distributions={'kernel': ['rbf', 'poly'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff4598d93d0>, 'shrinking': [True, False], 'coef0': [0, 1, 2], 'degree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff459845290>},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          scoring=None, verbose=2)

In [43]:
random_search_SVC.best_score_

0.271457203489743

In [44]:
best_SVC = random_search_SVC.best_estimator_
y_pred = best_SVC.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

Best AdaBoost DT F1 Score:  0.227904674255


### GRID SEARCH - Image nComments Quantile (InCQ)

### GRID SEARCH - Image nFavs Quantile (InFQ)

In [None]:
rv = sp_expon(loc=0.001, scale=1.5)

In [None]:
plt.hist(rv.rvs(1000), bins=40)
plt.plot()

In [75]:
nums = rv.rvs(1000)