##### Silver Speech and Golden Silence: Spoiler Detection Project

### Balanced Random Forest (review-wise)

In [1]:
#Imports
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, 
from sklearn.utils import class_weight
from imblearn.ensemble import BalancedRandomForestClassifier

#### Load Data

In [2]:
#Load train data
train = pd.read_json('data/train_reduced.json')
train.drop(['user_id', 'title'], inplace = True, axis = 1)
train.reset_index(drop = True, inplace = True)

In [3]:
#Load validation data
val = pd.read_json('data/validation_reduced.json')
val.drop(['user_id', 'title'], inplace = True, axis = 1)
val.reset_index(drop = True, inplace = True)

In [4]:
#Load reviews as a whole for train and validation
X_tr = pd.read_csv('data/X_train.csv')
X_v = pd.read_csv('data/X_val.csv')
#Rename columns
X_tr.rename(columns = {"0": 'review_whole'}, inplace = True)
X_v.rename(columns = {"0": 'review_whole'}, inplace = True)
#Drop unnamed columns
X_tr.drop('Unnamed: 0', inplace = True, axis = 1)
X_v.drop('Unnamed: 0', inplace = True, axis = 1)
#Reset indices
X_tr.reset_index(drop = True, inplace = True)
X_v.reset_index(drop = True, inplace = True)

In [5]:
#Combine dataframes and reviews 
train_df = pd.concat([train, X_tr], axis = 1)
val_df = pd.concat([val, X_v], axis = 1)

In [6]:
#Save data for later use
train_df.to_json('data/train_only_needed_columns.json')
val_df.to_json('data/val_only_needed_columns.json')

In [7]:
#Since the data is imbalanced, we downsample the non-spoilers. This further reduces the sample to about 25,000.
def downsample_nonspoilers_reviewwise(df):
    df_majority = df[df['spoiler_dum'] == 0] #nonspoilers
    df_minority = df[df['spoiler_dum'] == 1] #spoilers
    N = 2*len(df_minority)
    
    # Downsample majority labels equals three times the number of samples in the minority class while keeping the original genre proportion
    df_majority_ = df_majority.groupby('genre', group_keys = False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(df))))).sample(frac = 1)

    # Concatenate the majority and minority dataframes
    sample = pd.concat([df_majority_, df_minority], axis = 0)
    
    sample.reset_index(inplace = True, drop = True)
    return sample

In [8]:
train_df = downsample_nonspoilers_reviewwise(train_df)
train_df.shape

(180169, 6)

In [9]:
X_train = train_df.drop('spoiler_dum', axis = 1)
y_train = train_df.spoiler_dum

In [10]:
X_val = val_df.drop('spoiler_dum', axis = 1)
y_val = val_df.spoiler_dum

In [11]:
X_train_rev = X_train.review_whole
X_val_rev = X_val.review_whole

#### Balanced Random Forest Classifier

In [12]:
#Function for model training and prediction
def run_model(pipeline, X_train, y_train, X_test, y_test):
    #Fit the model
    model = pipeline.fit(X_train, y_train)
    
    #Predict labels of test data
    y_pred = model.predict(X_test)
    
    #Print classification report and confusion matrix
    return print(classification_report(y_test, y_pred)), print(confusion_matrix(y_test, y_pred, normalize = 'true'))
    return y_pred

In [15]:
#Build a pipeline for feature extraction with TF IDF and SGD
#TFIDF
tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), min_df = 100)

#Feature Selection: K-Best
feat = SelectKBest(score_func = chi2, k = 10000)

#Balanced Random Forest classifier
brf = BalancedRandomForestClassifier(n_estimators = 100, random_state = 42, verbose = 1, n_jobs = -1, class_weight = 'balanced', oob_score = True)

pipe = Pipeline([('tfidf', tfidf),('select_kbest', feat), ('BRF', brf)])

In [16]:
#run the model review-wise (validation)
run_model(pipe, X_train_rev.astype('str'), y_train, X_val_rev.astype('str'), y_val)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 16.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   14.8s finished


              precision    recall  f1-score   support

           0       0.99      0.61      0.76    235341
           1       0.16      0.95      0.27     18062

    accuracy                           0.63    253403
   macro avg       0.58      0.78      0.51    253403
weighted avg       0.93      0.63      0.72    253403

[[0.60983424 0.39016576]
 [0.04955154 0.95044846]]


(None, None)