##### Silver Speech and Golden Silence: Spoiler Detection Project

## Text Preprocessing and Baseline Model (SGD-Classifier)

In [25]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import spacy
from sklearn.utils import resample
from langdetect import detect
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report


In [2]:
#Show floats with thousands separators.
#NOTE(review): '{:,}' does not force fixed-point output, so very large or very small
#values can still be printed in scientific notation.
pd.options.display.float_format = '{:,}'.format

#Enable viewing more (in this case: up to 500) features of a dataset
pd.set_option('display.max_columns', 500)

#Ignore all warnings.
#NOTE(review): this is a blanket filter, so it also hides pandas warnings
#(e.g. about chained assignment) - consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Load datafile
#NOTE(review): absolute path to an external volume - this cell only runs on a machine
#where that volume is mounted; consider a configurable, relative data directory.
train = pd.read_json('/Volumes/My Passport OSX/NF_Capstone_Spoiler_Detection/data/train_data.json')

Since our dataset is very large, we first develop and test the code on a sample before executing it on all data: we draw 200,000 random reviews and then balance the classes, which leaves about 25,000 reviews.
Since the data is imbalanced with regard to the target feature, we take a balanced sample by downsampling the non-spoilers.

In [3]:
#Take a sample from the data 
def get_sample(df, n = 200000, random_state = 42):
    '''
    Draw a reproducible random sample of reviews.

    Only the columns needed for the later steps are kept; the index of the
    returned frame is renumbered from 0.

    df:           dataframe containing at least the columns listed below
    n:            number of rows to draw (default 200,000, as before);
                  must not exceed len(df), otherwise pandas raises a ValueError
    random_state: seed for the random draw (default 42, as before)
    '''
    columns = ['review', 'spoiler', 'genre', 'spoiler_dum', 'sentence_labels',
               'review_texts', 'review_len']
    sample = df[columns].sample(n = n, random_state = random_state)
    return sample.reset_index(drop = True)

In [None]:
#Draw the random sample from the full training data
train_sample = get_sample(train)

In [4]:
#Since the data is imbalanced, we downsample the non-spoilers. This further reduces the sample to about 25,000.
def downsample_nonspoilers(df):
    '''
    Balance the two classes of 'spoiler_dum' by downsampling the non-spoilers.

    All rows with spoiler_dum == 1 are kept; from the rows with spoiler_dum == 0
    as many rows as there are spoilers are drawn at random (random_state 42).
    Returns a new frame with the non-spoilers first, the spoilers second and a
    fresh index starting at 0.
    '''
    spoilers = df[df['spoiler_dum'] == 1]
    nonspoilers = df[df['spoiler_dum'] == 0]

    # draw exactly as many non-spoilers as there are spoilers
    nonspoilers_drawn = nonspoilers.sample(len(spoilers), random_state = 42)

    balanced = pd.concat([nonspoilers_drawn, spoilers])
    return balanced.reset_index(drop = True)

In [None]:
#Balance the classes of the random sample
sample = downsample_nonspoilers(train_sample)

In [None]:
#Count the reviews per value of 'spoiler' in the downsampled sample
sample.spoiler.value_counts()

In [None]:
#Save subsample (path is relative to the current working directory)
sample.to_json('data/train_sample.json')

### Text Preprocessing

Let's check if all reviews are really written in English by using langdetect.
The language code is written in a new column 'lang'.

In [5]:
#Language detection
#Let's check if all reviews are really written in English by using langdetect.
# The language code is written in a new column 'lang'. 
# Then, all non-english cases are dropped.
def lang_det(df):
    '''
    Detect the language of every review and keep only the English ones.

    The langdetect language code of df['review_texts'] is written to a new
    column 'lang'. All rows whose code is not 'en' are dropped and the index is
    renumbered from 0. The frame is modified in place and also returned.

    Texts for which langdetect cannot detect a language (it raises a
    LangDetectException, e.g. for texts without any letters) get the code None
    and are therefore dropped as well.
    '''
    # local import: the exception class is not imported at the top of the notebook
    from langdetect.lang_detect_exception import LangDetectException

    def _detect_or_none(text):
        # one undetectable review must not abort the whole run
        try:
            return detect(text)
        except LangDetectException:
            return None

    # build the whole column at once instead of assigning cell by cell
    # (chained assignment is unreliable and depends on a 0..n-1 index)
    df['lang'] = [_detect_or_none(text) for text in tqdm(df['review_texts'])]
    df.drop(df[df['lang'] != 'en'].index, inplace = True)
    df.reset_index(inplace = True, drop = True)
    return df

In [None]:
#Add the 'lang' column and keep only the English reviews
sample = lang_det(sample)

In [None]:
#Check which language codes remain after filtering
sample.lang.value_counts()

We need to adapt the reviews before we can feed them to a model:
1. Make all words lower case
2. Noise Removal
  * remove links, email addresses, etc.
  * numbers and special characters
  * remove stop words 
  * lemmatize 
3. Tokenize

In [6]:
#Function to replace short forms/ enlarge contractions
#https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
def decontracted(text):
    '''
    Expand English contractions and a few colloquial short forms in a string.

    The replacements are case-sensitive and expect lower-case input.
    Returns the expanded string.
    '''
    # irregular forms first: the general "'s" rule below would otherwise turn
    # "let's" into "let is" before the "let us" rule gets a chance
    irregular = [
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"\blet's\b", "let us"),
        (r"\bma'am\b", "madam"),
        (r"\by'all\b", "you all"),
        (r"'cause\b", "because"),
    ]

    # general suffix rules
    suffixes = [
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    ]

    # colloquial words; word boundaries keep longer words such as "wannabe" intact
    words = [
        (r"\bgimme\b", "give me"),
        (r"\bcuz\b", "because"),
        (r"\bfinna\b", "fixing to"),
        (r"\bwanna\b", "want to"),
        (r"\bgotta\b", "got to"),
        (r"\bhafta\b", "have to"),
        (r"\bwoulda\b", "would have"),
        (r"\bcoulda\b", "could have"),
        (r"\bshoulda\b", "should have"),
        (r"\bhowdy\b", "how do you"),
    ]

    for pattern, replacement in irregular + suffixes + words:
        text = re.sub(pattern, replacement, text)

    return text

In [7]:
#Function for Noise Removal
def preprocessing(text):
    '''
    Normalise a raw sentence for the later spellcheck/lemmatization steps.

    Lower-cases the text, removes urls and e-mail addresses, replaces hyphens by
    blanks, expands contractions, removes digits and punctuation and collapses
    whitespace. Returns the cleaned string (possibly empty).
    '''
    #change to lower case
    text = text.lower()
    #remove urls (http and https) if there are any; this is done before hyphens are
    #replaced so that a url containing '-' is removed as a whole
    text = re.sub(r'https?:\S+', '', text)
    text = re.sub(r'\bwww\.\S+', '', text)
    #remove emails and words containing @
    text = re.sub(r"\S*@\S*\s?", " ", text)
    # replace hyphens (including '--') with whitespace
    text = text.replace('-', ' ')
    # replace contractions
    text = decontracted(text)
    # remove digits (the remaining characters of a word are kept)
    text = re.sub(r"\d", "", text)
    #remove punctuation
    text = re.sub(r'[(,.;:@#?!&$)"*/-]+', ' ', text)
    text = re.sub(r"[']", '', text)
    # collapse whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text

In [8]:
#Function for Spellcheck (includes preprocessing function)
from spellchecker import SpellChecker
from symspellpy import SymSpell, Verbosity
import pkg_resources

# A new SymSpell instance has an empty dictionary and cannot correct anything until
# word frequencies are loaded, so load the English dictionaries shipped with symspellpy.
symspell1 = SymSpell()
symspell1.load_dictionary(
    pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt"),
    term_index=0, count_index=1)
symspell1.load_bigram_dictionary(
    pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt"),
    term_index=0, count_index=2)

def spellcheck(text):
    '''
    function takes string as input, preprocesses text and returns a spellchecked text

    The text is cleaned with preprocessing() and handed to SymSpell's compound
    lookup (max. edit distance 1). The best suggestion is returned; if there is
    nothing to correct or no suggestion, the preprocessed text is returned.
    '''
    #preprocess text
    text = preprocessing(text)
    #nothing left to correct (e.g. the input consisted of digits/punctuation only)
    if not text:
        return text
    #hand to spellchecking and use the first (best) suggestion
    suggestions = symspell1.lookup_compound(text, max_edit_distance=1)
    if not suggestions or suggestions[0].term is None:
        return text
    return suggestions[0].term

In [9]:
#Function for lemmatization
#Load the small English spaCy model once; it is reused by lemmatizer() below.
#NOTE(review): only lemmas and stop-word flags are used in this section - if nothing else
#needs the full pipeline, unused components could be disabled for speed (to be confirmed).
nlp_spacy = spacy.load("en_core_web_sm")

def lemmatizer(text):
    '''
    Lemmatize a string with spacy.

    The string is tokenized, english stopwords are removed and the lemmas of
    the remaining tokens are returned joined by single blanks.
    '''
    lemmas = []
    for token in nlp_spacy(text):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [None]:
#New sample feature with preprocessed reviews: one list of spellchecked sentences per review.
#Assumes every entry of sample.review is a sequence of sentence records whose
#element [1] is the sentence text - TODO confirm.
#The column is built in one go; assigning lists cell by cell through chained indexing
#is unreliable in pandas.
sample['prep'] = pd.Series(
    [[spellcheck(sentence[1]) for sentence in review] for review in tqdm(sample['review'])],
    index = sample.index,
)

In [None]:
#New sample feature with lemmatized data: one list of lemmatized sentences per review.
#The column is built in one go; assigning lists cell by cell through chained indexing
#is unreliable in pandas.
sample['tokenized'] = pd.Series(
    [[lemmatizer(sentence) for sentence in sentences] for sentences in tqdm(sample['prep'])],
    index = sample.index,
)

In [None]:
#Inspect the first rows including the new 'prep' and 'tokenized' columns
sample.head()

We drop all one-sentence reviews because they contain little information.

In [10]:
#Function to drop one-sentence-reviews
def drop_onesen(df):
    '''
    Drop all reviews that consist of exactly one sentence.

    'review_len' is recomputed for every row as the number of entries in
    'review', rows with a length of 1 are dropped and the index is renumbered
    from 0. The frame is modified in place and also returned.
    '''
    # recompute the length for all rows; the function must not depend on a
    # loop variable leaking in from another cell
    df['review_len'] = df['review'].apply(len)
    df.drop(df[df['review_len'] == 1].index, inplace = True)
    df.reset_index(drop = True, inplace = True)
    return df

In [None]:
#Drop the one-sentence reviews from the sample (modifies `sample` in place;
#the returned frame is shown as the cell output)
drop_onesen(sample)