In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, Lasso, Ridge, RidgeClassifier, SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, precision_recall_fscore_support, f1_score, r2_score 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import cPickle as pickle

### MODEL SPORTS DATA

In [None]:
# Load the pipe-delimited SPORTS feature and target tables.
df_features = pd.read_csv('../data/modeling/SPORTS/feature_data_SPORTS_21205.csv', sep='|')
df_targets = pd.read_csv('../data/modeling/SPORTS/target_data_SPORTS_21205.csv', sep='|')

# Key both tables on the (owner, id) pair so they can be joined row-for-row.
df_features = df_features.set_index(['owner', 'id'])
df_targets = df_targets.set_index(['owner', 'id'])

# 'image_ntags' is left out of the target list, so it stays on the feature
# side when the target columns are popped off later in the notebook.
target_columns = df_targets.columns.tolist()
target_columns.remove('image_ntags')

# Keep only rows present in both tables, then discard rows with missing values.
df = df_features.join(df_targets, how='inner').dropna(axis=0)

In [None]:
# Show the first joined row as a quick check of the available columns.
df.head(1)

### Exploratory Data Analysis (EDA)

In [None]:
# Histogram (50 bins) of the raw 'image_views' values over all rows.
plt.hist(df['image_views'], bins=50)
plt.show()

In [None]:
# Same 50-bin histogram, restricted to rows with at most 1000 views.
plt.hist(df.loc[df['image_views'] <= 1000, 'image_views'], bins=50)
plt.show()

### Create New Quartile Targets

In [None]:
def name_quantile(x, limits):
    """
    Return the 1-based position of the first limit that `x` does not exceed.

    PARAMETERS
    ----------
    x : number
        The value to classify.

    limits : sequence of numbers
        Upper bounds of the bins, scanned in the order given.

    RETURNS
    -------
    int or None
        k such that limits[k-1] is the first limit with x <= limit,
        or None when x is greater than every limit (or limits is empty).
    """
    first_match = (rank for rank, upper in enumerate(limits, 1) if x <= upper)
    return next(first_match, None)

def create_quantile_target_col(df, target_columns, col_name, n_quantiles=5):
    """
    Create a new column in the DataFrame that indicates which quantile the target column value falls into.

    The new column is named ``col_name + "_quantile"`` and holds the 1-based
    bin number returned by `name_quantile` for each row. Both `df` and
    `target_columns` are modified in place and also returned. The computed
    bin limits are printed.

    Calling the function again with the same arguments overwrites the
    quantile column and does NOT add a duplicate entry to `target_columns`,
    so the calling notebook cell can safely be re-run.

    PARAMETERS
    ----------
    df : DataFrame
        Must contain the column `col_name`.

    target_columns : list
        List of target column names; the new column name is appended to it
        if not already present.

    col_name : str
        Name of the column whose values are binned.

    n_quantiles : int
        The number of bins. For 4 bins (0 to 0.25, 0.25 to 0.5, etc...), n_quantiles=4.

    RETURNS
    -------
    df : DataFrame
        The same object that was passed in, with the new column added.

    target_columns : list
        The same list that was passed in, including the new column name.
    """
    n_bins = float(n_quantiles)
    # Upper bound of each bin: the 1/n, 2/n, ..., n/n quantiles of the column.
    limits = [df[col_name].quantile(i / n_bins) for i in range(1, n_quantiles + 1)]

    new_col_name = col_name + "_quantile"
    # Guard against duplicates: a repeated name would make a later
    # column-by-column pop of the targets fail on the second occurrence.
    if new_col_name not in target_columns:
        target_columns.append(new_col_name)

    df[new_col_name] = df[col_name].apply(lambda x: name_quantile(x, limits))
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(limits)
    return df, target_columns

In [None]:
# Bin 'image_views' into 4 quantile classes; this adds the column
# 'image_views_quantile' to df and appends its name to target_columns.
df, target_columns = create_quantile_target_col(df, target_columns, 'image_views', 4)

In [None]:
# 80/20 train/test split of the joined frame, with a fixed seed for repeatability.
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [None]:
def pop_columns(df, col_names):
    """
    Remove the named columns from `df` (in place) and return them separately.

    PARAMETERS
    ----------
    df : DataFrame
        Modified in place: every column listed in `col_names` is popped off.

    col_names : iterable of str
        Names of the columns to remove, in the order they should appear
        in the returned object.

    RETURNS
    -------
    df : DataFrame
        The same object that was passed in, without the popped columns.

    df_dropped_cols : DataFrame or Series
        The popped columns. A DataFrame when two or more names are given,
        a Series when exactly one name is given (kept for backward
        compatibility), and an empty DataFrame sharing df's index when
        `col_names` is empty.

    RAISES
    ------
    KeyError
        If a name is not a column of `df` (columns popped before the
        failing name stay removed).
    """
    names = list(col_names)
    if not names:
        # Nothing to pop; previously this raised UnboundLocalError.
        return df, pd.DataFrame(index=df.index)

    popped = [df.pop(name) for name in names]
    if len(popped) == 1:
        return df, popped[0]
    # Concatenate once instead of growing the frame one column at a time.
    return df, pd.concat(popped, axis=1)

In [None]:
# Separate targets from features. pop_columns removes the target columns
# from df_train / df_test themselves and returns that same frame as X.
X_train, y_train = pop_columns(df_train, target_columns)
X_test, y_test = pop_columns(df_test, target_columns)

In [None]:
# Show the first row of the test-set targets.
y_test.head(1)

In [None]:
# Remember the column labels before scaling; X_columns is used below to
# rebuild the scaled feature DataFrames.
X_columns = X_train.columns
y_columns = y_train.columns

In [None]:
# Standardise the features: the scaler is fitted on the training set only
# and the same fitted transform is then applied to the test set.
scaler_mean_std = StandardScaler()
X_train = scaler_mean_std.fit_transform(X_train)
X_test = scaler_mean_std.transform(X_test)

In [None]:
# Wrap the scaled values back into DataFrames with the original column labels.
# NOTE(review): no index is passed, so these frames presumably get a default
# integer index while y_train / y_test keep the (owner, id) index - confirm
# before aligning X and y by label.
X_train = pd.DataFrame(data=X_train, columns=X_columns)
X_test = pd.DataFrame(data=X_test, columns=X_columns)

### Predicting Image View Quantile (IVQ)

#### LOGISTIC MODEL

In [None]:
# Multinomial logistic regression (lbfgs solver, L2 penalty, C=1.0) with
# balanced class weights, used to predict the image-view quantile class.
model_ivq_logitclassifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1,
                                               class_weight='balanced', max_iter=10000, verbose=1, warm_start=True,
                                               n_jobs=30, penalty='l2', C=1.0)

In [None]:
# Fit on the scaled training features against the quantile class label.
model_ivq_logitclassifier.fit(X_train, y_train['image_views_quantile'])

In [None]:
# Predicted class labels and per-class probabilities for the test set.
y_pred = model_ivq_logitclassifier.predict(X_test)
y_probs = model_ivq_logitclassifier.predict_proba(X_test)

In [None]:
# Micro-averaged F1 of the logistic model's test-set predictions.
f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='micro', sample_weight=None)

In [None]:
def _c_sweep_micro_f1(c_values, warm_start, fit_intercept):
    """
    Fit the multinomial logistic model once per C value and score each fit.

    A single estimator is reused for the whole sweep (only C is changed
    between fits). This matters for warm_start=True: scikit-learn only reuses
    the previous solution when `fit` is called again on the SAME estimator, so
    building a fresh estimator per C value (as this cell used to do) made the
    warm-start and cold-start sweeps identical. With warm_start=False every
    `fit` starts from scratch, exactly as before.

    Reads X_train, y_train, X_test and y_test from the notebook namespace.

    PARAMETERS
    ----------
    c_values : iterable of float
        Inverse regularisation strengths to try, in sweep order.

    warm_start : bool
        Passed to LogisticRegression.

    fit_intercept : bool
        Passed to LogisticRegression.

    RETURNS
    -------
    list of float
        Micro-averaged test-set F1 score for each entry of `c_values`.
    """
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1,
                             class_weight='balanced', max_iter=10000, verbose=1,
                             warm_start=warm_start, n_jobs=30, penalty='l2',
                             fit_intercept=fit_intercept)
    scores = []
    for c in c_values:
        clf.set_params(C=c)
        clf.fit(X_train, y_train['image_views_quantile'])
        scores.append(f1_score(y_test['image_views_quantile'], clf.predict(X_test),
                               labels=None, pos_label=None, average='micro', sample_weight=None))
    return scores


C_values = [0.01, 0.02, 0.05, 0.1, 0.3, 0.6, 1.0, 2.0, 5.0, 10]

# Four sweeps: {warm start, cold start} x {with intercept, without intercept}.
# Unlike the earlier copy-pasted loops, the sweeps no longer overwrite the
# notebook-level `model_ivq_logitclassifier` and `y_pred`.
f1_scores_int = _c_sweep_micro_f1(C_values, warm_start=True, fit_intercept=True)
f1_scores_no_int = _c_sweep_micro_f1(C_values, warm_start=True, fit_intercept=False)
f1_scores_cold_int = _c_sweep_micro_f1(C_values, warm_start=False, fit_intercept=True)
f1_scores_cold_no_int = _c_sweep_micro_f1(C_values, warm_start=False, fit_intercept=False)

In [None]:
# Test-set micro F1 as a function of C for the four logistic-regression sweeps.
# Each curve is labelled so the figure can be read on its own.
plt.plot(C_values, f1_scores_int, 'r', label='warm_start=True, intercept')
plt.plot(C_values, f1_scores_no_int, 'm', label='warm_start=True, no intercept')
plt.plot(C_values, f1_scores_cold_int, 'b', label='warm_start=False, intercept')
plt.plot(C_values, f1_scores_cold_no_int, 'g', label='warm_start=False, no intercept')
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('Micro-averaged F1 (test set)')
plt.title('Logistic Regression: F1 Score vs. C')
plt.legend(loc='best')
plt.show()

#### Random Forest Classifier

In [None]:
# Random forest (1000 trees, Gini split criterion) on the quantile class;
# the result is the micro-averaged F1 on the test set.
f1_scores_gini = None  # placeholder until the score is computed below
model_ivq_randomforest = RandomForestClassifier(n_estimators=1000, criterion='gini', max_depth=None,
                                                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                                max_features='auto', max_leaf_nodes=None, bootstrap=True,
                                                oob_score=False, n_jobs=30, random_state=5, verbose=1,
                                                warm_start=False, class_weight=None)
model_ivq_randomforest.fit(X_train, y_train['image_views_quantile'])
y_pred = model_ivq_randomforest.predict(X_test)
f1_scores_gini = f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='micro', sample_weight=None)

In [None]:
# Same random forest configuration as the Gini cell, but splitting on
# entropy; this rebinds model_ivq_randomforest and y_pred.
f1_scores_entropy = None  # placeholder until the score is computed below
model_ivq_randomforest = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=None,
                                                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                                max_features='auto', max_leaf_nodes=None, bootstrap=True,
                                                oob_score=False, n_jobs=30, random_state=5, verbose=1,
                                                warm_start=False, class_weight=None)
model_ivq_randomforest.fit(X_train, y_train['image_views_quantile'])
y_pred = model_ivq_randomforest.predict(X_test)
f1_scores_entropy = f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='micro', sample_weight=None)

In [None]:
# Side-by-side micro F1 of the two random forests. A single pre-formatted
# string is printed so the output is the same on Python 2 and Python 3.
print("GINI: {}".format(f1_scores_gini))
print("Entropy: {}".format(f1_scores_entropy))

#### AdaBoostClassifier

In [None]:
# Base learner for boosting: a shallow (depth-2) Gini decision tree.
tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=2, min_samples_split=50,
                              min_samples_leaf=20, min_weight_fraction_leaf=0.0, max_features=None,
                              random_state=30, max_leaf_nodes=None, class_weight=None, presort=False)
num_estimators = 300

# AdaBoost (SAMME.R) with up to num_estimators boosting rounds over that tree.
model_ivq_adaboost = AdaBoostClassifier(base_estimator=tree, n_estimators=num_estimators, learning_rate=1,
                                        algorithm='SAMME.R', random_state=10)
model_ivq_adaboost.fit(X_train, y_train['image_views_quantile'])

In [None]:
# Macro-averaged test F1 after each boosting stage. Zipping with the range
# caps the evaluation at num_estimators stages; `i` itself is not used.
# NOTE(review): this uses average='macro', while the logistic-regression and
# random-forest cells use 'micro', so the scores are not directly comparable.
f1_scores = []
for i, y_pred in zip(range(1, num_estimators+1), model_ivq_adaboost.staged_predict(X_test)):
    f1_scores.append(f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None))

plt.plot(f1_scores, 'b')

#### Gradient Boosted Decision Trees

In [None]:
# Gradient-boosted trees (depth 3, learning rate 0.1) on the quantile class.
# num_estimators is the single source of truth for the number of boosting
# stages: it sizes the model here and bounds the staged evaluation later.
num_estimators = 300
model_ivq_GBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=num_estimators,
                                           subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                                           min_weight_fraction_leaf=0.0, max_depth=3, init=None,
                                           random_state=35, max_features=None, verbose=1, max_leaf_nodes=None,
                                           warm_start=False, presort='auto')
model_ivq_GBC.fit(X_train, y_train['image_views_quantile'])

In [None]:
# Persist the fitted gradient-boosting model. Pickle data is a byte stream,
# so the file must be opened in binary mode ('wb'); text mode can corrupt the
# file on some platforms and fails outright on Python 3.
with open('../models/model_GBC_IVQ_300.pkl', 'wb') as f:
    pickle.dump(model_ivq_GBC, f)

In [None]:
# Macro-averaged test F1 after each boosting stage (at most num_estimators
# stages because of the zip with the range; `i` itself is not used).
f1_scores = []
# After the loop these hold the class predictions and class probabilities
# from the last stage that was evaluated.
y_predictions = None
y_pred_probabilities = None
for i, y_pred, y_pred_proba in zip(range(1, num_estimators+1), model_ivq_GBC.staged_predict(X_test), model_ivq_GBC.staged_predict_proba(X_test)):
    f1_scores.append(f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None))
    y_predictions = y_pred
    y_pred_probabilities = y_pred_proba

In [None]:
# Class probabilities predicted for the first test row.
y_pred_probabilities[0]

In [None]:
# [20.0, 43.0, 99.0, 20134.0]
lefts = np.array([0, 20.0, 43.0, 99.0]).reshape((-1, 1))
rights = np.array([20.0, 43.0, 99.0, 20134.0]).reshape((-1, 1))
bin_names = np.array(['0-20', '21-43', '44-99', '100-20134']).reshape((-1, 1))
widths = rights - lefts
data = pd.DataFrame(data=lefts, columns=['lefts'])
data['Image Views'] = bin_names
data['Probability'] = y_pred_probabilities[0].reshape((-1, 1))

In [None]:
# Summary (column names, dtypes, non-null counts) of the plotting table.
data.info()

In [None]:
# seaborn has not been imported by any earlier cell, so import it here;
# without this the cell raises NameError on a fresh Restart & Run All.
import seaborn as sns

# Four-colour cubehelix palette, one colour per view bin.
palette = sns.cubehelix_palette(4, start=.5, rot=-.75)

In [None]:
import seaborn as sns
# Bar chart of the predicted probability per view bin, drawn inside
# seaborn's 'poster' plotting context on an 8x8 inch figure.
with sns.plotting_context(context='poster', font_scale=1):
    plt.figure(figsize=(8, 8))
    ax = sns.barplot(x='Image Views', y='Probability', data=data, palette=palette, )
# The axis label and title are set after the context block has exited.
plt.ylabel("Probability")
plt.title('PhotoPro Predicted Image Views', fontsize=20)
plt.show()

In [None]:
# Macro-averaged test F1 of the gradient-boosted model after each stage.
plt.figure(figsize=(8, 8))
# Plot against 1-based stage numbers so the x axis really is the number of
# estimators used (a bare plt.plot(f1_scores) would start at 0).
plt.plot(range(1, len(f1_scores) + 1), f1_scores, 'b')
# The model is gradient boosting (GradientBoostingClassifier), not gradient descent.
plt.title('Gradient Boosted Model:\nF1 Score', fontsize=30)
plt.xlabel('Num Estimators', fontsize=30)
plt.ylabel('F1 Score', fontsize=30)
plt.show()

In [None]:
# (Removed a stray, incomplete expression `y_`: no such name is defined in
# this notebook, so the cell raised NameError and broke Restart & Run All.)

In [None]:
# Larger, stochastic gradient-boosting run: 3000 stages, each fitted on a
# 40% subsample, with minimum split/leaf sizes of 10.
# num_estimators must equal the model's n_estimators. It used to stay at 300
# while the model trained 3000 stages, so the staged F1 evaluation that is
# bounded by num_estimators only covered the first 10% of the stages.
num_estimators = 3000
model_ivq_GBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=num_estimators,
                                           subsample=0.4, min_samples_split=10, min_samples_leaf=10,
                                           min_weight_fraction_leaf=0.0, max_depth=3, init=None,
                                           random_state=35, max_features='auto', verbose=1, max_leaf_nodes=None,
                                           warm_start=False, presort='auto')
model_ivq_GBC.fit(X_train, y_train['image_views_quantile'])

In [None]:
# Macro-averaged test F1 after each boosting stage of the refitted model;
# the zip with the range stops after num_estimators stages.
f1_scores = []
for i, y_pred in zip(range(1, num_estimators+1), model_ivq_GBC.staged_predict(X_test)):
    f1_scores.append(f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None))

plt.plot(f1_scores, 'b')

#### SVM

In [None]:
# RBF-kernel support vector classifier (C=1.0) on the quantile class,
# scored with macro-averaged F1 on the test set.
model_ivq_svc = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
                    tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                    decision_function_shape=None, random_state=50)
model_ivq_svc.fit(X_train, y_train['image_views_quantile'])
y_pred = model_ivq_svc.predict(X_test)
f1_score(y_test['image_views_quantile'], y_pred, labels=None, pos_label=None, average='macro', sample_weight=None)

### Plotly: Predicted View-Bin Probability Chart

In [None]:
# Plotly and its graph-object classes; init_notebook_mode prepares the
# notebook for offline (in-notebook) rendering.
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

In [None]:
# Vertical bar trace: one bar per view bin, third bar blue and the rest red.
# NOTE(review): the y values are hard-coded and do not sum to 100; they look
# like placeholder data rather than model output - confirm.
x = ['0-20', '21-43', '44-99', '100-20k+']
y = [10, 17, 85, 15]
params = {'orientation':'v',
          "opacity":0.5, "marker": {"color": ["red", "red", "blue", "red"]}}
graph = go.Bar(x=x, y=y, **params)

In [None]:
# One text annotation per bar: the bar's value, centred horizontally and
# anchored by its bottom edge at the bar's height, with no arrow.
annotations=[{"x": xi,
              "y": yi,
              "text": str(yi),
              "xanchor": 'center',
              "yanchor": 'bottom',
              "showarrow": False} for xi, yi in zip(x, y)]

In [None]:
# Interactive lookup of plotly's help for the layout 'hovermode' attribute.
# TODO(review): exploratory leftover; consider removing it from the final notebook.
go.Layout().help('hovermode', return_help=False)

In [None]:
# Axis styling: the y axis is fixed to 0-101 and every tick gets a '%' suffix.
xaxis = {"title": "Range of Predicted Views", "titlefont":{"color":"#444", "size":20}}
yaxis = {"title": "Predicted Probabilities", "titlefont":{"color":"#444", "size":20}, "range":[0, 101],
         "showticksuffix":"all", "hoverformat":"", "ticksuffix":"%", "autorange":False}

# hovermode / dragmode are switched off with the boolean False; the string
# "false" is not one of the values plotly accepts for these attributes.
layout_params = {"titlefont":{"color":"#444", "family":"Open Sans, verdana, arial, sans-serif", "size":26}, "hovermode":False,
                 "font":{"color":"#444", "family":"Open Sans, verdana, arial, sans-serif"}, "autosize":False, "dragmode":False,
                 "margin":{"b":100, "l":100, "r":80, "pad":12, "t":100, "autoexpand":False}}

# Colours must be valid CSS colours: 'lightblue' (no space) for the plot area
# and a fully transparent rgba value in place of the invalid 'none'.
layout = go.Layout(title='PhotoPro Predicted View Bins Probabilities', annotations=annotations,
                   height=600, width=800, paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='lightblue', xaxis=xaxis, yaxis=yaxis, **layout_params)
# The trace list is passed directly instead of being bound to `data`, which
# would overwrite the `data` DataFrame used by the seaborn bar plot.
fig = go.Figure(layout=layout, data=[graph])
plotly.offline.iplot(figure_or_data=fig)

In [None]:
# Send the figure to the Plotly online service as a new, publicly shared
# file named "Test Plot 2", without opening a browser tab.
# NOTE(review): presumably requires Plotly account credentials to be
# configured on this machine - confirm.
url = plotly.plotly.plot(fig, filename="Test Plot 2", sharing='public', fileopt='new', auto_open=False)

In [None]:
# Display the value returned by the upload call.
url

In [None]:
# Save the figure as a PNG (800x600, scale factor 3) under the web app's
# static plots directory.
plotly.plotly.image.save_as(fig, '../web_app/static/plots/test_plotly', 'png', width=800, height=600, scale=3)

In [None]:
# Show the current working directory. This is a bare IPython magic (not
# Python); the relative '../' paths used above resolve against this directory.
pwd