In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, Lasso, Ridge, RidgeClassifier, SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, precision_recall_fscore_support, f1_score, r2_score 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint as sp_randint, gamma as sp_gamma, expon as sp_expon
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

In [15]:
df_features = pd.read_csv('../data/modeling/SPORTS/feature_data_SPORTS_21205.csv', sep='|')
df_targets = pd.read_csv('../data/modeling/SPORTS/target_data_SPORTS_21205.csv', sep='|')

df_features = df_features.set_index('owner').set_index("id", append=True)

df_targets = df_targets.set_index('owner').set_index("id", append=True)

df_targets = df_targets.drop('image_tags', axis=1)

target_columns = list(df_targets.columns)
target_columns.remove('image_ntags')
target_columns

df = df_features.join(df_targets, how='inner')

df.dropna(axis=0, inplace=True)

In [16]:
def name_quantile(x, limits):
    quantile_cats = range(1, len(limits)+1, 1)
    for cat, limit in zip(quantile_cats, limits):
        if x <= limit:
            return cat

def create_quantile_target_col(df_train, df_test, target_columns, col_name, n_quantiles=5):
    """
    Create a new column in both DataFrames that bins a target column into categories.
    
    PARAMETERS
    ----------
    df_train : pandas.DataFrame
        The training set data.
        
    df_test : pandas.DataFrame
        The testint set data.
    
    n_quantiles : int
        The number of bins. For 4 bins (0 to 0.25, 0.25 to 0.5, etc...), n_quantiles=4.
        
    col_name : str
    
    target_columns : list
    
    RETURNS
    -------
    df : DataFrame
    
    target_columns : list        
    """
    min_value = df_train[col_name].min()
    max_value = df_train[col_name].max()
    limits = []
    for i in range(1, n_quantiles+1):
        limits.append(df_train[col_name].quantile(i/float(n_quantiles)))
    
    new_col_name = col_name+"_quantile"
    target_columns.append(new_col_name)
    
    df_train.loc[:, new_col_name] = df_train[col_name].apply(lambda x: name_quantile(x, limits))
    df_test.loc[:, new_col_name] = df_test[col_name].apply(lambda x: name_quantile(x, limits))
    return df_train, df_test, target_columns

def pop_columns(df, col_names):
    for i, name in enumerate(list(col_names)):
        if i == 0:
            df_dropped_cols = df.pop(name)
        else:
            df_dropped_cols = pd.concat((df_dropped_cols, df.pop(name)), axis=1)
    return df, df_dropped_cols

In [17]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [18]:
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_views', 4)
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_ncomments', 4)
df_train, df_test, target_columns = create_quantile_target_col(df_train, df_test, target_columns, 'image_nfavs', 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
X_train, y_train = pop_columns(df_train, target_columns)
X_test, y_test = pop_columns(df_test, target_columns)

In [22]:
X_columns = X_train.columns
y_columns = y_train.columns

In [23]:
scaler_mean_std = StandardScaler()
X_train = scaler_mean_std.fit_transform(X_train)
X_test = scaler_mean_std.transform(X_test)

In [24]:
X_train = pd.DataFrame(data=X_train, columns=X_columns)
X_test = pd.DataFrame(data=X_test, columns=X_columns)

In [27]:
y_train.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_is_pro,user_can_buy_pro,user_total_views,image_ncomments,image_nfavs,image_nsets,image_npools,image_views,image_views_quantile,image_ncomments_quantile,image_nfavs_quantile
owner,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12385792@N00,11378305226,1,0,7760,0,0,1,0,6,1,1,1


### GRID SEARCH - Image Views Quantile (IVQ)

#### Set Search Parameters

In [79]:
param_distributions = {'Logistic': {"C": sp_expon(loc=0.001, scale=1),
                                    "fit_intercept": [True, False],
                                    "intercept_scaling": sp_randint(1, 5),
                                    "warm_start": [False, True]
                                    },
                       'RandomForest': {"max_depth": sp_randint(2, 7),
                                        "max_features": ['auto', None, sp_randint(5, 501)],
                                        "min_samples_split": sp_randint(1, 201),
                                        "min_samples_leaf": sp_randint(1, 201),
                                        "bootstrap": [True, False],
                                        "criterion": ["gini", "entropy"],
                                        "oob_score": [False, True],
                                        "warm_start": [False, True] 
                                        },
                       'AdaBoost_DT': {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                                       "algorithm" : ['SAMME.R', 'SAMME']
                                       },
                       'GradientBoost': {},
                       'SVC': {}
                       }
                       
                       

#### Build Models

In [80]:
DT = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
      min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None,
      presort=False)

model_ivq_LogitClassifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                               class_weight=None, random_state=25,
                                               solver='liblinear', max_iter=1000, multi_class='ovr', verbose=1,
                                               n_jobs=1)

model_ivq_RandomForest = RandomForestClassifier(n_estimators=1000, min_weight_fraction_leaf=0.0, n_jobs=36,
                                                random_state=42, verbose=1, class_weight=None)

model_ivq_AdaBoost_DT = AdaBoostClassifier(base_estimator=DT, n_estimators=300, random_state=12)

model_ivq_GBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                           min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                           max_depth=3, init=None, random_state=None, max_features=None, verbose=0,
                                           max_leaf_nodes=None, warm_start=False, presort='auto')

model_ivq_SVC = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
                    tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                    decision_function_shape=None, random_state=None)


#### Logistic Random Search CV

In [None]:
n_iter_search = 30
random_search_LogitClassifier = RandomizedSearchCV(estimator=model_ivq_LogitClassifier,
                                                   param_distributions=param_distributions['Logistic'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=5, verbose=2, random_state=30, error_score='raise')
random_search_LogitClassifier.fit(X_train, y_train['image_views_quantile'])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
random_search_LogitClassifier.grid_scores_

In [None]:
random_search_LogitClassifier.best_score_

In [None]:
best_LogitClassifier = random_search_LogitClassifier.best_estimator_
y_pred = best_LogitClassifier.predict(X_test)
print "Best Logit Classifier F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

#### Random Forest Random Search CV

In [None]:
n_iter_search = 30
random_search_RandomForest = RandomizedSearchCV(estimator=model_ivq_RandomForest,
                                                   param_distributions=param_distributions['RandomForest'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=5, verbose=1, random_state=30, error_score='raise')
random_search_RandomForest.fit(X_train, y_train['image_views_quantile'])

In [None]:
random_search_RandomForest.best_score_

In [None]:
best_RandomForest = random_search_RandomForest.best_estimator_
y_pred = best_RandomForest.predict(X_test)
print "Best Random Forest Classifier F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

#### AdaBoost DT Random Search CV

In [None]:
n_iter_search = 30
random_search_AdaBoost_DT = RandomizedSearchCV(estimator=model_ivq_AdaBoost_DT,
                                                   param_distributions=param_distributions['RandomForest'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=5, verbose=1, random_state=30, error_score='raise')
random_search_AdaBoost_DT.fit(X_train, y_train['image_views_quantile'])

In [None]:
random_search_AdaBoost_DT.best_score_

In [None]:
best_AdaBoost_DT = random_search_AdaBoost_DT.best_estimator_
y_pred = best_AdaBoost_DT.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

#### Gradient Boost Classifier Random Search CV

In [None]:
n_iter_search = 30
random_search_GBC = RandomizedSearchCV(estimator=model_ivq_GBC,
                                                   param_distributions=param_distributions['RandomForest'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=5, verbose=1, random_state=30, error_score='raise')
random_search_GBC.fit(X_train, y_train['image_views_quantile'])

In [None]:
random_search_GBC.best_score_

In [None]:
best_GBC = random_search_GBC.best_estimator_
y_pred = best_GBC.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

#### SVC Random Search CV

In [None]:
n_iter_search = 30
random_search_SVC = RandomizedSearchCV(estimator=model_ivq_SVC,
                                                   param_distributions=param_distributions['RandomForest'],
                                                   n_iter=n_iter_search,
                                                   n_jobs=36, cv=5, verbose=1, random_state=30, error_score='raise')
random_search_SVC.fit(X_train, y_train['image_views_quantile'])

In [None]:
random_search_SVC.best_score_

In [None]:
best_SVC = random_search_SVC.best_estimator_
y_pred = best_SVC.predict(X_test)
print "Best AdaBoost DT F1 Score: ", f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

### GRID SEARCH - Image nComments Quantile (InCQ)

### GRID SEARCH - Image nFavs Quantile (InFQ)

In [None]:
rv = sp_expon(loc=0.001, scale=1.5)

In [None]:
plt.hist(rv.rvs(1000), bins=40)
plt.plot()

In [75]:
nums = rv.rvs(1000)