In [8]:
import pandas as pd
import numpy as np
import pickle

### Cross Validation and Model Selection metrics
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss 
from sklearn.metrics import make_scorer
from sklearn.metrics import jaccard_similarity_score as jaccard_score
from sklearn.metrics import classification_report

# Preprocessing
import sklearn.preprocessing as Preprocessing
from sklearn.preprocessing import StandardScaler as Standardize
from sklearn.preprocessing import MultiLabelBinarizer

### Classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn import tree
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression as Log_Reg




In [2]:
### Load Dataset
# The pickle holds a 3-element list:
#   X_data:     unprocessed features
#   X_data_std: features standardized by the preprocessor
#   y_data:     multi-label binarized targets
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted data.
# Binary mode is what pickle expects; this assumes the file was written in binary mode
# ('wb', as the dumps in this notebook are) -- TODO confirm against the cleaning notebook.
with open('data/cleaned_data_for_traditional_models.p', 'rb') as data_file:
    [X_data, X_data_std, y_data] = pickle.load(data_file)

# Single-argument print calls produce the same text under Python 2 and Python 3
print('X_data shape: {}'.format(X_data.shape))
print('X_data_std shape: {}'.format(X_data_std.shape))
print('y shape {}'.format(y_data.shape))

X_data shape: (5996L, 124L)
X_data_std shape: (5996L, 124L)
y shape (5996L, 20L)


## Fitting traditional models on the data

The following traditional models were fitted on the data using **One-vs-Rest classifier** for all the genres. The relevant parameters were tuned using 3-fold cross validation. 

 - Weighted Logistic regression: Tuned **Regularization Parameter C** using Cross-Validation.  
 - Decision Tree: Tuned **max_depth** using cross validation
 - Random Forest: Tuned **max_depth** using cross validation
 - AdaBoost: Tuned **n_estimators** using cross validation
 
**The models were run on separate computers. The results for each model were saved in a pickle file and are analyzed below.**

In [None]:
# Scorer used for model tuning: Hamming loss, flagged as lower-is-better so the
# grid searches select the parameters that minimize it
hamming_scorer = make_scorer(score_func = hamming_loss, greater_is_better = False)

### Logistic Regression

In [None]:
### Weighted Logistic Regression
# One-vs-rest logistic regression with L2 regularization and balanced class weights
LogReg_Model = OneVsRestClassifier(Log_Reg(penalty = 'l2', class_weight = 'balanced'))

# Grid search over the regularization parameter C
# (20 log-spaced values from 1e-5 to 1e15), scored with the Hamming-loss scorer
LogReg_grid = GridSearchCV(LogReg_Model,
                           param_grid = {'estimator__C': np.logspace(-5, 15, 20)},
                           scoring = hamming_scorer,
                           n_jobs = 5)
LogReg_grid.fit(X_data_std, y_data)

# Cross-validated predictions from the best estimator for every sample.
# There is no held-out test set in this cell: cross_val_predict runs over the same
# X_data_std / y_data that were used for the grid search.
y_pred_LogReg = cross_val_predict(LogReg_grid.best_estimator_, X_data_std, y_data)

# Dump CV results and predictions from best model; the with-block guarantees the
# file is flushed and closed even if pickling fails
with open('results/traditional/LogReg_grid_results.p', 'wb') as results_file:
    pickle.dump([LogReg_grid.cv_results_, y_pred_LogReg], results_file)

### Decision Tree

In [None]:
### Single Decision Tree
# One-vs-rest decision tree using the Gini impurity criterion
DecisionTree_Model = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini'))

# Grid search on max_depth (depths 1 through 9), scored with the Hamming-loss scorer
DT_grid = GridSearchCV(DecisionTree_Model,
                       param_grid = {'estimator__max_depth': range(1,10)},
                       scoring = hamming_scorer)
DT_grid.fit(X_data_std, y_data)

# Cross-validated predictions from the best estimator for every sample.
# There is no held-out test set in this cell: cross_val_predict runs over the same
# X_data_std / y_data that were used for the grid search.
y_pred_Decision_Tree = cross_val_predict(DT_grid.best_estimator_, X_data_std, y_data)

# Dump CV results and predictions from best model; the with-block guarantees the
# file is flushed and closed even if pickling fails
with open('results/traditional/DecisionTree_grid_results.p', 'wb') as results_file:
    pickle.dump([DT_grid.cv_results_, y_pred_Decision_Tree], results_file)

### Random Forest

In [8]:
### Random Forest
# One-vs-rest random forest with default forest settings.
# NOTE(review): no random_state is set, so repeated runs can give different forests.
RandomForest_Model = OneVsRestClassifier(RandomForest())

# Grid search on max_depth over 10, 20, ..., 70.
# max_depth is an integer parameter, so the grid uses ints rather than the floats
# that 10*np.linspace(1, 7, 7) would produce.
rf_grid = GridSearchCV(RandomForest_Model,
                       param_grid = {'estimator__max_depth': list(range(10, 80, 10))},
                       scoring = hamming_scorer)

rf_grid.fit(X_data_std, y_data)

# Cross-validated predictions from the best estimator for every sample.
# There is no held-out test set in this cell: cross_val_predict runs over the same
# X_data_std / y_data that were used for the grid search.
y_pred_RF = cross_val_predict(rf_grid.best_estimator_, X_data_std, y_data)

# Dump CV results and predictions from best model; the with-block guarantees the
# file is flushed and closed even if pickling fails
with open('results/traditional/RandomForest_grid_results.p', 'wb') as results_file:
    pickle.dump([rf_grid.cv_results_, y_pred_RF], results_file)

### AdaBoost

In [None]:
### Adaboost
# One-vs-rest AdaBoost with the default base estimator
Ada = OneVsRestClassifier(AdaBoostClassifier())

# Grid search on the number of estimators
# (6 log-spaced values between 10 and 1000, truncated to integers)
ada_grid = GridSearchCV(Ada,
                        param_grid = {'estimator__n_estimators': np.logspace(1,3,6).astype(int) },
                        scoring = hamming_scorer)
ada_grid.fit(X_data_std, y_data)

# Cross-validated predictions from the best estimator for every sample.
# There is no held-out test set in this cell: cross_val_predict runs over the same
# X_data_std / y_data that were used for the grid search.
y_pred_ada = cross_val_predict(ada_grid.best_estimator_, X_data_std, y_data)

# Dump CV results and predictions from best model; the with-block guarantees the
# file is flushed and closed even if pickling fails
with open('results/traditional/Adaboost_grid_results.p', 'wb') as results_file:
    pickle.dump([ada_grid.cv_results_, y_pred_ada], results_file)

### Assessing Model Performance for Multilabel Classification

To assess the One-vs-Rest multilabel classifiers, we used six performance metrics: 0-1 loss based accuracy, Hamming loss, precision, recall, F1-score, and Jaccard Similarity. In addition to reporting these metrics for the overall dataset under each model, we evaluated these metrics for each genre label, to detect whether any labels were being excluded from prediction due to class imbalance.


#### 1. Accuracy

Accuracy is calculated on a zero-one loss basis: each match between a predicted entry and the corresponding true value counts as one. The mean of these indicators gives the fraction of binary entries in the prediction matrix that match the true values.

#### 2. Hamming Loss

Simply put, the Hamming loss is the fraction of labels that are incorrectly predicted, optionally weighted by sample weights. Since no sample weights are used here, the Hamming loss equals $1 - \text{accuracy}$.

#### 3. Precision

$$Precision=\frac{tp}{tp+fp}$$
where tp = True Positives and fp = False Positives

#### 4. Recall

$$Recall=\frac{tp}{tp+fn}$$ 
where tp = True Positives and fn = False Negatives

#### 5. F-1 Score

F-1 score is the harmonic mean of Precision and Recall. It is a measurement that considers both Precision and Recall. 

$$F_{1}=2\cdot {\frac {1}{{\tfrac {1}{\mathrm {recall} }}+{\tfrac {1}{\mathrm {precision} }}}}=2\cdot {\frac {\mathrm {precision} \cdot \mathrm {recall} }{\mathrm {precision} +\mathrm {recall} }}$$

#### 6. Jaccard Similarities

The Jaccard Similarity between the predicted labels (y_pred) and ground truth labels (y_data) is defined as the size of the intersection divided by the size of the union of the two label sets for a given data point $X_i$. Therefore, the Jaccard Similarity penalizes both the inclusion of incorrect labels in the prediction and the exclusion of true labels from it.

$$ J(A,B) = {{|A \cap B|}\over{|A \cup B|}} = {{|A \cap B|}\over{|A| + |B| - |A \cap B|}}$$
where A = dataset labels, B = multilabel classification output

### Performance Evaluation and Visualizations

In [3]:
### Unpickle Model Results
def _load_results(path):
    """Load and return the object pickled at `path`, closing the file afterwards.

    The result files read below are written by this notebook in binary mode ('wb'),
    so they are read back in binary mode.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted result files.
    with open(path, 'rb') as results_file:
        return pickle.load(results_file)

# Each file holds [grid-search cv_results_, cross-validated predictions] for one model
[LogReg_cv, y_pred_LogReg] = _load_results('results/traditional/LogReg_grid_results.p')

[DT_grid_cv, y_pred_Decision_Tree] = _load_results('results/traditional/DecisionTree_grid_results.p')

[rf_grid_cv, y_pred_RF] = _load_results('results/traditional/RandomForest_grid_results.p')

[ada_grid_cv, y_pred_ada] = _load_results('results/traditional/Adaboost_grid_results.p')

# Create list with all model predictions
# (order: logistic regression, decision tree, random forest, AdaBoost)
prediction_list = [y_pred_LogReg, y_pred_Decision_Tree, y_pred_RF, y_pred_ada]

In [4]:
# Read column names to get genre labels for tables below.
# Binary mode is what pickle expects; this assumes the file was written in binary
# mode ('wb') -- TODO confirm against the notebook that created it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('data/binarizer_genre_list.p', 'rb') as genre_file:
    genre_ids = pickle.load(genre_file)

# read the genre list (genre ids and names)
genre_list = pd.read_csv('data/genre_list.csv')

# add foreign movies (a genre that is not included in the downloaded genre list)
foreign = pd.DataFrame({'id': [10769], 'GenreName': ['Foreign']})
genre_list = pd.concat([genre_list, foreign], axis = 0)

# order the genre names according to the binarizer classes
genre_labels = []
for genre_id in genre_ids:
    # single .loc lookup instead of chained indexing; the first matching row wins
    matching_names = genre_list.loc[genre_list['id'] == genre_id, 'GenreName'].values
    if len(matching_names) == 0:
        # fail with the offending id instead of an opaque IndexError from .values[0]
        raise KeyError('genre id {} not found in genre_list'.format(genre_id))
    genre_labels.append(matching_names[0])

In [5]:
# Display the genre names to check that their order follows the binarizer classes
genre_labels

['Adventure',
 'Fantasy',
 'Animation',
 'Drama',
 'Horror',
 'Action',
 'Comedy',
 'History',
 'Western',
 'Thriller',
 'Crime',
 'Documentary',
 'Science Fiction',
 'Mystery',
 'Music',
 'Romance',
 'Family',
 'War',
 'Foreign',
 'TV Movie']

In [6]:
### SUMMARIZE MODEL ACCURACY
def summarize_model_accuracy(y_prediction, y_data, names):
    """Print and return summary metrics for multilabel predictions.

    Prints element-wise accuracy, Hamming loss, weighted F1 score, Jaccard
    similarity and the per-class classification report (precision, recall, f1).

    INPUTS:
        y_prediction: predicted y (compared element-wise against y_data)
        y_data: ground truth y
        names: class names, passed to classification_report as target_names
    OUTPUTS:
        prints the metrics listed above
        returns a dict with keys 'accuracy', 'hamming_loss', 'f1_weighted' and
        'jaccard_similarity', so callers that bind the result get the numbers
        (previously the function returned None)
    """
    summary = {
        # Basic accuracy: what proportion of label entries are correct
        'accuracy': np.mean(y_prediction == y_data),
        'hamming_loss': hamming_loss(y_data, y_prediction),
        'f1_weighted': f1_score(y_data, y_prediction, average = 'weighted'),
        'jaccard_similarity': jaccard_score(y_data, y_prediction),
    }

    # Single-argument print calls produce the same text under Python 2 and Python 3
    print('Accuracy: {}'.format(summary['accuracy']))
    print('Hamming Loss: {}'.format(summary['hamming_loss']))
    print('F1 Score: {}'.format(summary['f1_weighted']))
    print('Jaccard Similarity: {}'.format(summary['jaccard_similarity']))

    # Classification report: recall, precision, f1 ON EACH CLASS (can be used for multilabel case)
    print(classification_report(y_data, y_prediction, target_names = names))

    return summary

### Logistic Regression Model Performance


In [9]:
# Print the overall metrics and per-genre classification report for logistic regression
LogRegSummary = summarize_model_accuracy(y_prediction = y_pred_LogReg,
                                         y_data = y_data,
                                         names = genre_labels)

Accuracy: 0.595488659106
Hamming Loss: 0.404511340894
F1 Score: 0.363745775457
Jaccard Similarity: 0.151680275982
                 precision    recall  f1-score   support

      Adventure       0.10      0.61      0.18       367
        Fantasy       0.07      0.66      0.13       268
      Animation       0.10      0.80      0.18       339
          Drama       0.52      0.67      0.58      2179
         Horror       0.22      0.74      0.35       856
         Action       0.20      0.74      0.32       774
         Comedy       0.36      0.68      0.47      1496
        History       0.04      0.80      0.07       125
        Western       0.02      0.73      0.03        55
       Thriller       0.29      0.76      0.42      1157
          Crime       0.09      0.71      0.17       396
    Documentary       0.30      0.91      0.45       909
Science Fiction       0.11      0.77      0.20       422
        Mystery       0.06      0.67      0.11       269
          Music       0.08    

### Decision Tree Model Performance

In [10]:
# Print the overall metrics and per-genre classification report for the decision tree
Decision_Tree = summarize_model_accuracy(y_prediction = y_pred_Decision_Tree,
                                         y_data = y_data,
                                         names = genre_labels)

Accuracy: 0.908522348232
Hamming Loss: 0.0914776517678
F1 Score: 0.168331087028
Jaccard Similarity: 0.13245377871
                 precision    recall  f1-score   support

      Adventure       0.59      0.04      0.08       367
        Fantasy       0.14      0.01      0.01       268
      Animation       0.45      0.05      0.09       339
          Drama       0.56      0.27      0.37      2179
         Horror       0.00      0.00      0.00       856
         Action       0.49      0.03      0.06       774
         Comedy       0.86      0.12      0.22      1496
        History       0.08      0.01      0.01       125
        Western       0.00      0.00      0.00        55
       Thriller       0.25      0.00      0.00      1157
          Crime       0.00      0.00      0.00       396
    Documentary       0.82      0.30      0.44       909
Science Fiction       0.30      0.03      0.06       422
        Mystery       0.00      0.00      0.00       269
          Music       0.65    

### Random Forest Model Performance

In [12]:
# Print the overall metrics and per-genre classification report for the random forest
RF_Model_Summary = summarize_model_accuracy(y_prediction = y_pred_RF,
                                            y_data = y_data,
                                            names = genre_labels)

Accuracy: 0.909372915277
Hamming Loss: 0.0906270847231
F1 Score: 0.215170777762
Jaccard Similarity: 0.169916055148
                 precision    recall  f1-score   support

      Adventure       0.52      0.06      0.11       367
        Fantasy       0.27      0.01      0.02       268
      Animation       0.52      0.04      0.07       339
          Drama       0.60      0.34      0.43      2179
         Horror       0.62      0.09      0.16       856
         Action       0.44      0.05      0.09       774
         Comedy       0.64      0.17      0.27      1496
        History       0.00      0.00      0.00       125
        Western       0.00      0.00      0.00        55
       Thriller       0.40      0.10      0.16      1157
          Crime       0.14      0.00      0.00       396
    Documentary       0.76      0.35      0.48       909
Science Fiction       0.47      0.02      0.04       422
        Mystery       0.00      0.00      0.00       269
          Music       0.63   

### Ada Boost Model Performance

In [13]:
# Print the overall metrics and per-genre classification report for AdaBoost
Ada_Boost = summarize_model_accuracy(y_prediction = y_pred_ada,
                                     y_data = y_data,
                                     names = genre_labels)

Accuracy: 0.909856571047
Hamming Loss: 0.0901434289526
F1 Score: 0.265702827649
Jaccard Similarity: 0.21204890562
                 precision    recall  f1-score   support

      Adventure       0.49      0.10      0.17       367
        Fantasy       0.00      0.00      0.00       268
      Animation       0.43      0.09      0.15       339
          Drama       0.58      0.39      0.47      2179
         Horror       0.62      0.19      0.29       856
         Action       0.47      0.09      0.15       774
         Comedy       0.70      0.18      0.29      1496
        History       0.26      0.05      0.08       125
        Western       0.00      0.00      0.00        55
       Thriller       0.46      0.15      0.23      1157
          Crime       0.00      0.00      0.00       396
    Documentary       0.71      0.45      0.55       909
Science Fiction       0.43      0.02      0.04       422
        Mystery       0.00      0.00      0.00       269
          Music       0.62    