##### Silver Speech and Golden Silence: Spoiler Detection Project

### Baseline Stochastic Gradient Descent Classifier

In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [2]:
#Disable scientific notation for floats: show them in fixed-point notation with
#thousands separators. A bare '{:,}' format still falls back to scientific notation
#for very small or very large values (1e-10 is rendered as '1e-10'), so the
#precision is fixed explicitly.
pd.options.display.float_format = '{:,.4f}'.format

#Enable viewing more features of a dataset (up to 500 columns are displayed)
pd.set_option('display.max_columns', 500)

#Ignore warnings
#NOTE(review): this filter silences every warning category, including any raised
#by scikit-learn while fitting or scoring the models below.
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Load datafiles
#Training data, read from a JSON file (path is relative to the working directory).
#The cells below read its 'tokenized', 'sentence_labels' and 'spoiler_dum' columns.
train = pd.read_json('data/train_preprocessed.json')

In [4]:
#Validation data, read from a JSON file; used below to evaluate the fitted models.
val = pd.read_json('data/validation_preprocessed.json')

### Model: SGD-Classifier

Stochastic Gradient Descent (SGD) is a simple and very efficient approach to fitting linear classifiers under convex loss functions such as (linear) Support Vector Machines and Logistic Regression. 

SGD has been successfully applied to large-scale machine learning problems often encountered in text classification and natural language processing. 

Therefore, we use this approach as a baseline model for spoiler detection.

We train two kinds of models: in the first, the reviews are fed to the classifier sentence by sentence; in the second, each whole review is fed in as a single text.

#### First model: Feed the reviews sentence-wise

In [5]:
#Function to transfer the review sentences to a list and then to a numpy array for training.
def get_X_sen(df):

    '''Collect all review sentences of a dataframe in one flat numpy array.

    Every entry of df['tokenized'] is iterated as a sequence of sentences;
    the sentences of all reviews are concatenated in their original order
    and returned as a numpy array.'''

    sentences = [sentence for review in df['tokenized'] for sentence in review]
    return np.array(sentences)

In [6]:
#Function to transfer the review labels for each review sentence to a list and then to a numpy array for training.
def get_y_sen(df):

    '''Collect the sentence labels of a dataframe in one flat numpy array.

    Every entry of df['sentence_labels'] is iterated as a sequence of labels;
    the labels of all reviews are concatenated in their original order
    and returned as a numpy array.'''

    labels = [label for labellist in df['sentence_labels'] for label in labellist]
    return np.array(labels)

In [7]:
#Get X and y (train) with sentence-wise review texts
#X holds one entry per sentence, y one label per sentence
X_train_sen = get_X_sen(train)
y_train_sen = get_y_sen(train)

#Sanity check: both shapes should agree (one label per sentence)
print(y_train_sen.shape, X_train_sen.shape)

(12372436,) (12372436,)


In [8]:
#Get X and y (validation) with sentence-wise review texts
#X holds one entry per sentence, y one label per sentence
X_val_sen = get_X_sen(val)
y_val_sen = get_y_sen(val)

#Sanity check: both shapes should agree (one label per sentence)
print(y_val_sen.shape, X_val_sen.shape)

(3510496,) (3510496,)


In [10]:
#Pipeline: TF-IDF feature extraction followed by an SGD classifier
#TFIDF: English stop words removed, unigrams only, min_df = 100, at most 5000 features
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=100,
    max_features=5000,
)
#SGD: hinge loss with L2 penalty; both classes carry the same weight
sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=.0001,
    class_weight={0: 0.5, 1: .5},
    max_iter=1000,
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)
pipe = Pipeline([('tfidf', tfidf), ('sgd', sgd)])

In [9]:
#Function to run a model and print the classification report
def run_sgd(pipeline, X_train, y_train, X_test, y_test):
    '''Fit a pipeline and print the classification report for its predictions.

    pipeline: object providing fit(X, y) and predict(X); fit is called on the
              object passed in and the predictions are taken from that same object
    X_train, y_train: data the pipeline is fitted on
    X_test, y_test: data the fitted pipeline is evaluated on

    The report is printed; the function returns None.'''
    #Fit the model (the return value of fit is not needed)
    pipeline.fit(X_train, y_train)

    #Predict labels of test data
    y_pred = pipeline.predict(X_test)

    #Print the classification report of true vs. predicted labels
    print(classification_report(y_test, y_pred))

In [12]:
#Fit the current pipeline on the sentence-wise training data and report on the validation sentences
run_sgd(pipe, X_train_sen, y_train_sen, X_val_sen, y_val_sen)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98   3396530
           1       0.00      0.00      0.00    113966

    accuracy                           0.97   3510496
   macro avg       0.48      0.50      0.49   3510496
weighted avg       0.94      0.97      0.95   3510496



The basic model completely fails to detect spoilers: precision, recall and F1-score for class 1 are all 0.00.
We tune the hyperparameters.

In [18]:
#Pipeline: TF-IDF feature extraction followed by an SGD classifier
#TFIDF: English stop words removed, unigrams and bigrams, min_df = 1
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
)
#SGD: perceptron loss, elastic-net penalty with l1_ratio = 0, class 1 weighted higher than class 0
sgd = SGDClassifier(
    loss='perceptron',
    penalty='elasticnet',
    l1_ratio=0,
    alpha=.001,
    class_weight={0: 0.3, 1: 0.7},
    max_iter=1000,
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)
pipe = Pipeline([('tfidf', tfidf), ('sgd', sgd)])

In [14]:
#Fit the current pipeline on the sentence-wise training data and report on the validation sentences
run_sgd(pipe, X_train_sen, y_train_sen, X_val_sen, y_val_sen)

              precision    recall  f1-score   support

           0       0.98      0.97      0.98   3396530
           1       0.39      0.53      0.45    113966

    accuracy                           0.96   3510496
   macro avg       0.69      0.75      0.71   3510496
weighted avg       0.96      0.96      0.96   3510496



This is better. Let's try another one.

In [15]:
#Pipeline: TF-IDF feature extraction followed by an SGD classifier
#TFIDF: English stop words removed, unigrams and bigrams, min_df = 100
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=100,
)
#SGD: hinge loss with L2 penalty, class 1 weighted higher than class 0
sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=.0001,
    class_weight={0: 0.35, 1: 0.65},
    max_iter=1000,
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)
pipe = Pipeline([('tfidf', tfidf), ('sgd', sgd)])

In [16]:
#Fit the current pipeline on the sentence-wise training data and report on the validation sentences
run_sgd(pipe, X_train_sen, y_train_sen, X_val_sen, y_val_sen)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98   3396530
           1       0.00      0.00      0.00    113966

    accuracy                           0.97   3510496
   macro avg       0.48      0.50      0.49   3510496
weighted avg       0.94      0.97      0.95   3510496



#### Second model: review-wise modelling

Now we use the whole review as predictor. 

In [8]:
#For review-wise model training: Transfer sentences to np.array:
def reviewwise_X(df):
    '''Build one text per review for review-wise training.

    The entries of each review in df['tokenized'] are joined with single
    spaces; the resulting texts are returned as a numpy array.'''
    joined = [' '.join(review) for review in df['tokenized']]
    return np.array(joined)

In [9]:
#Review-wise training data: one joined text per review, labels taken from the 'spoiler_dum' column
X_train_rev = reviewwise_X(train)
y_train_rev = train.spoiler_dum
#Sanity check: both shapes should agree (one label per review)
print(X_train_rev.shape, y_train_rev.shape)

(962875,) (962875,)


In [10]:
#Review-wise validation data: one joined text per review, labels taken from the 'spoiler_dum' column
X_val_rev = reviewwise_X(val)
y_val_rev = val.spoiler_dum
#Sanity check: both shapes should agree (one label per review)
print(X_val_rev.shape, y_val_rev.shape)

(253403,) (253403,)


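We begin with the classifier settings of the best model from sentence-wise modelling. Note that the TF-IDF step below uses min_df = 100, whereas the best sentence-wise run used min_df = 1.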

In [13]:
#Pipeline: TF-IDF feature extraction followed by an SGD classifier
#TFIDF: English stop words removed, unigrams and bigrams, min_df = 100
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=100,
)
#SGD: perceptron loss, elastic-net penalty with l1_ratio = 0, class 1 weighted higher than class 0
sgd = SGDClassifier(
    loss='perceptron',
    penalty='elasticnet',
    l1_ratio=0,
    alpha=.001,
    class_weight={0: 0.3, 1: 0.7},
    max_iter=1000,
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)
pipe = Pipeline([('tfidf', tfidf), ('sgd', sgd)])

In [15]:
#Fit the current pipeline on the review-wise training data and report on the validation reviews
run_sgd(pipe, X_train_rev, y_train_rev, X_val_rev, y_val_rev)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95    235341
           1       0.36      0.40      0.38     18062

    accuracy                           0.91    253403
   macro avg       0.66      0.67      0.66    253403
weighted avg       0.91      0.91      0.91    253403



Hyperparameter-Tuning...

In [16]:
#Pipeline: TF-IDF feature extraction followed by an SGD classifier
#TFIDF: English stop words removed, unigrams only, min_df = 1
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=1,
)
#SGD: hinge loss with L2 penalty; both classes carry the same weight
sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=.0001,
    class_weight={0: 0.5, 1: .5},
    max_iter=1000,
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)

pipe = Pipeline([('tfidf', tfidf), ('sgd', sgd)])

In [17]:
#Fit the current pipeline on the review-wise training data and report on the validation reviews
run_sgd(pipe, X_train_rev, y_train_rev, X_val_rev, y_val_rev)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96    235341
           1       0.00      0.00      0.00     18062

    accuracy                           0.93    253403
   macro avg       0.46      0.50      0.48    253403
weighted avg       0.86      0.93      0.89    253403



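The best baseline model is the one with sentence-wise training (F1-score of 0.45 for class 1 on the validation data, compared with 0.38 for the best review-wise model) and the following hyperparameters: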

tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), min_df = 1)

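sgd = SGDClassifier(random_state = 42, penalty = 'l2', alpha = .001, class_weight = {0: 0.3, 1: 0.7}, 
                    max_iter = 1000, loss = 'perceptron', shuffle = True, n_jobs = -1)

(In the run above the penalty was specified as penalty = 'elasticnet' with l1_ratio = 0, which amounts to a pure L2 penalty.)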