In [1]:
# pyspellchecker helps with fixing typos
# !python3 -m pip install pyspellchecker
# !python3 -m spacy download en_core_web_lg


# Sentiment Analysis on Yelp dataset

## Loading libraries


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
from matplotlib import rcParams

# import modules to help with preprocessing
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
import spacy

# import modules to help with collecting & counting the tokens
from itertools import chain

# from collections import Counter
from nltk.probability import FreqDist
from typing import Union

# to lemmatise tokens in corpus
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

# to try to fix typos
# from spellchecker import SpellChecker

%matplotlib inline
rcParams["figure.figsize"] = (16, 9)
plt.style.use("ggplot")  # personal preference


## Loading the dataset


In [3]:
IO_TRAIN = "../input/yelp-review-dataset/yelp_review_polarity_csv/train.csv"

# The CSV has no header row, so it is read header-less and the numeric
# column labels are swapped for meaningful names afterwards.
ylp = pd.read_csv(IO_TRAIN, header=None)
ylp.columns = ["sentiment", "review"]

# Per readme.txt, newlines inside a review are stored as the two characters
# backslash + "n"; turn each of them back into a real newline.
ylp["review"] = ylp["review"].map(lambda text: re.sub(r"\\n", "\n", text))

# Preview the first rows.
ylp.head()


Unnamed: 0,sentiment,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


To make the classes easier to work with, they should be in a human-interpretable form. According to `readme.txt`, class 1 is negative and class 2 is positive, so they are relabelled `NEG` and `POS` respectively.


In [4]:
# Relabel the numeric classes (1 -> "NEG", 2 -> "POS") and store them as a
# categorical column. The replacement is applied to the `sentiment` column
# only, so the review text is never scanned and no other cell that happens to
# equal 1 or 2 can be rewritten by accident.
ylp["sentiment"] = (
    ylp["sentiment"].replace({1: "NEG", 2: "POS"}).astype("category")
)


### define globals


In [5]:
# hyperlinks: anything that starts with http://, https:// or www.
REGEX_URL = r"(?:https?://|www\.)\S+"

# spaCy pipeline used by the preprocessing code
nlp = spacy.load("en_core_web_lg")

# one combined stop word set built from gensim, nltk and spaCy
STOPWORDS = (
    STOPWORDS
    | frozenset(stopwords.words("english"))
    | frozenset(nlp.Defaults.stop_words)
)


## Preprocessing

Load the libraries that help with preprocessing, and define the preprocessing subroutine.


In [6]:
# `nlp` and `STOPWORDS` are already set up in the "define globals" cell, and
# `STOPWORDS` there already contains spaCy's stop words. Loading
# `en_core_web_lg` a second time here only cost time and memory, so the
# duplicated setup has been removed.

# spaCy pipeline components that are switched off when `nlp` is called
to_disable = ["tok2vec", "parser", "ner"]

# define preprocess subroutine
def preprocess(
    doc: str, stopwords: frozenset = STOPWORDS, **kwargs
) -> spacy.tokens.doc.Doc:
    """
    Clean a single raw review and run the result through the spaCy pipeline.

    The text goes through these steps, in order:

    1. hyperlinks matching ``REGEX_URL`` are removed;
    2. gensim's ``simple_preprocess`` tokenises, lowercases and strips
       punctuation;
    3. tokens found in ``stopwords`` are discarded;
    4. the remaining tokens are joined with single spaces and passed to
       ``nlp`` with the components in ``to_disable`` switched off.

    Parameters
    ----------
    doc : str
        Raw review text.
    stopwords : frozenset
        Tokens to drop; defaults to the module-level ``STOPWORDS``.
    **kwargs
        ``min_len`` (default 2) and ``max_len`` (default 15) are forwarded to
        ``simple_preprocess``. Any other keyword is ignored.

    Returns
    -------
    spacy.tokens.doc.Doc
        The document produced by ``nlp`` for the cleaned text.
    """
    shortest = kwargs.get("min_len", 2)
    longest = kwargs.get("max_len", 15)

    # strip hyperlinks before tokenising
    without_links = re.sub(REGEX_URL, "", doc)
    tokens = simple_preprocess(without_links, min_len=shortest, max_len=longest)

    # FIXME: typo correction is skipped for now because it takes a long time;
    # only stop words are filtered out here.
    cleaned_text = " ".join(tok for tok in tokens if tok not in stopwords)

    # the spaCy pipeline takes care of lemmatising the tokens
    return nlp(cleaned_text, disable=to_disable)


def preprocess_generator(
    docs,
    batch_size=10000,
    stopwords: frozenset = STOPWORDS,
    **kwargs
):
    """
    Lazily preprocess ``docs`` in consecutive batches.

    Parameters
    ----------
    docs : pandas.Series
        Raw documents; sliced positionally with ``.iloc``, so each yielded
        batch keeps the index labels of its rows.
    batch_size : int
        Number of documents per batch (must be at least 1). The last batch
        may be shorter.
    stopwords : frozenset
        Forwarded to ``preprocess`` for every document.
    **kwargs
        Forwarded to ``preprocess`` for every document (e.g. ``min_len``,
        ``max_len``).

    Yields
    ------
    pandas.Series
        One Series per batch holding the value ``preprocess`` returned for
        each document.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (raised on first iteration).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    def _apply_preprocess(text):
        # Forward the caller's options explicitly; previously `stopwords` and
        # `kwargs` were accepted but silently ignored. A closure is used rather
        # than `Series.apply(..., **kwargs)` so option names can never collide
        # with `apply`'s own keyword arguments.
        return preprocess(text, stopwords=stopwords, **kwargs)

    n = len(docs)
    for start in range(0, n, batch_size):
        yield docs.iloc[start : start + batch_size].apply(_apply_preprocess)


In [7]:
# work on an independent copy so the raw frame `ylp` stays untouched
ylp_processed = ylp.copy(deep=True)
reviews = ylp_processed.loc[:, "review"]


In [8]:
# Collect all processed batches first and concatenate them once.
# `Series.append` inside the loop copied the whole accumulated result on every
# iteration (quadratic work) and no longer exists in pandas >= 2.0.
batches = list(preprocess_generator(reviews))
# `pd.concat` refuses an empty list, so fall back to an empty object Series
result = pd.concat(batches) if batches else pd.Series(dtype="object")


In [9]:
ylp_processed["review"] = result
# keep only the reviews that are not empty (length > 0) after preprocessing
is_non_empty = ylp_processed["review"].map(len) > 0
ylp_processed = ylp_processed[is_non_empty]
ylp_processed.shape, ylp.shape


((559907, 2), (560000, 2))

Let's compare the processed reviews against the raw reviews.


In [10]:
# number of rows removed by the empty-review filter
len(ylp) - len(ylp_processed)


93

After preprocessing, $93$ records were dropped because their reviews had become empty.
