##### Silver Speech and Golden Silence: Spoiler Detection Project

### Review-wise SGD with Metadata (combined in 1 feature)

In [18]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier

#### Load Data

In [3]:
#Load train data
train = pd.read_json('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/train_reduced.json')
train.drop(['user_id', 'title'], inplace = True, axis = 1)
train.reset_index(drop = True, inplace = True)

In [4]:
#Load validation data
val = pd.read_json('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/validation_reduced.json')
val.drop(['user_id', 'title'], inplace = True, axis = 1)
val.reset_index(drop = True, inplace = True)

In [6]:
#Load reviews as a whole for train and validation
X_tr = pd.read_csv('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/X_train.csv')
X_v = pd.read_csv('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/X_val.csv')
#Rename columns
X_tr.rename(columns = {"0": 'review_whole'}, inplace = True)
X_v.rename(columns = {"0": 'review_whole'}, inplace = True)
#Drop unnamed columns
X_tr.drop('Unnamed: 0', inplace = True, axis = 1)
X_v.drop('Unnamed: 0', inplace = True, axis = 1)
#Reset indices
X_tr.reset_index(drop = True, inplace = True)
X_v.reset_index(drop = True, inplace = True)

In [7]:
#Combine dataframes and reviews 
train_df = pd.concat([train, X_tr], axis = 1)
val_df = pd.concat([val, X_v], axis = 1)

In [9]:
#Save data for later use
train_df.to_json('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/train_only_needed_columns.json')
val_df.to_json('/Users/juliaschafer/NF_Capstone_Spoiler_Detection/data/val_only_needed_columns.json')

In [10]:
#Since the data is imbalanced, we downsample the non-spoilers. This further reduces the sample to about 25,000.
def downsample_nonspoilers_reviewwise(df):
    df_majority = df[df['spoiler_dum'] == 0] #nonspoilers
    df_minority = df[df['spoiler_dum'] == 1] #spoilers
    N = 2*len(df_minority)
    
    # Downsample majority labels equals three times the number of samples in the minority class while keeping the original genre proportion
    df_majority_ = df_majority.groupby('genre', group_keys = False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(df))))).sample(frac = 1)

    # Concatenate the majority and minority dataframes
    sample = pd.concat([df_majority_, df_minority], axis = 0)
    
    sample.reset_index(inplace = True, drop = True)
    return sample

In [11]:
train_df = downsample_nonspoilers_reviewwise(train_df)
train_df.shape

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


(180169, 6)

In [12]:
#New column with genre and review text
train_df[['text_and_genre']] = train_df.genre + ' ' + train_df.review_whole
val_df[['text_and_genre']] = val_df.genre + ' ' + val_df.review_whole

In [13]:
X_train = train_df.drop('spoiler_dum', axis = 1)
y_train = train_df.spoiler_dum

In [14]:
X_val = val_df.drop('spoiler_dum', axis = 1)
y_val = val_df.spoiler_dum

In [15]:
X_train_rev = X_train.text_and_genre
X_val_rev = X_val.text_and_genre

#### Pipeline for Balanced Random Forest with Review Texts and Genre Metadata

In [16]:
#Function for model training and prediction
def run_model(pipeline, X_train, y_train, X_test, y_test):
    #Fit the model
    model = pipeline.fit(X_train, y_train)
    
    #Predict labels of test data
    y_pred = model.predict(X_test)
    
    #Print classification report and confusion matrix
    return print(classification_report(y_test, y_pred)), print(confusion_matrix(y_test, y_pred, normalize = 'true'))
    return y_pred

In [19]:
#Build a pipeline for feature extraction with TF IDF and SGD
#TFIDF
tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), min_df = 100)

#Feature Selection: K-Best
feat = SelectKBest(score_func = chi2, k = 10000)

#SGD
sgd = SGDClassifier(random_state = 42, penalty = 'l2', shuffle = True, n_jobs = -1, max_iter = 1000, 
                                       loss = 'hinge', class_weight = {0: 0.4, 1: .6}, alpha = .0001)

pipe = Pipeline([('tfidf', tfidf),('select_kbest', feat), ('SGD', sgd)])

In [20]:
#run the model review-wise (validation)
run_model(pipe, X_train_rev.astype('str'), y_train, X_val_rev.astype('str'), y_val)

              precision    recall  f1-score   support

           0       0.97      0.77      0.86    235341
           1       0.18      0.68      0.29     18062

    accuracy                           0.76    253403
   macro avg       0.58      0.72      0.57    253403
weighted avg       0.91      0.76      0.82    253403

[[0.76803872 0.23196128]
 [0.3218359  0.6781641 ]]


(None, None)