In [42]:
import re
import nltk
import functools
import operator
import pandas as pd

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from textblob import Word

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [20]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/janis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/janis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/janis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/janis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
STOPWORDS = stopwords.words('english')

In [2]:
file_path = "data/DisneylandReviews.csv"

In [3]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          1008 non-null   int64 
 1   Rating             1008 non-null   int64 
 2   Year_Month         1008 non-null   object
 3   Reviewer_Location  1008 non-null   object
 4   Review_Text        1008 non-null   object
 5   Branch             1008 non-null   object
dtypes: int64(2), object(4)
memory usage: 47.4+ KB


# Preprocessing

In [10]:
def get_language_code(text):
    try:
        return detect(text)
    except LangDetectException:
        return np.nan

In [13]:
# Bottleneck on large datasets!
df["language_code"] = df.apply(lambda row: get_language_code(row["Review_Text"]), axis=1)

In [18]:
# Get none english row count
df.shape[0] - df[df["language_code"] == "en"].shape[0]

0

In [23]:
def clean_text(text: str):
    # remove and replace all urls
    text = re.sub(r'http\S+', ' ', text)

    # remove and replace none alphanumerical letters
    text = re.sub(r'\W+', ' ', text.lower())

    words = []
    for word in text.split():
        if word in STOPWORDS:
            continue
        words.append(Word(word).lemmatize())
    return " ".join(words)

In [28]:
df['text_cleaned'] = df['Review_Text'].apply(clean_text)

In [34]:
df['tokens'] = df['text_cleaned'].apply(word_tokenize)

In [35]:
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,language_code,text_cleaned,tokens
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,en,ever disneyland anywhere find disneyland hong ...,"[ever, disneyland, anywhere, find, disneyland,..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,en,since last time visit hk disneyland yet time s...,"[since, last, time, visit, hk, disneyland, yet..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,en,thanks god hot humid visiting park otherwise w...,"[thanks, god, hot, humid, visiting, park, othe..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,en,hk disneyland great compact park unfortunately...,"[hk, disneyland, great, compact, park, unfortu..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,en,location city took around 1 hour kowlon kid li...,"[location, city, took, around, 1, hour, kowlon..."


## Get Bigrams

In [30]:
def is_valid_2_gram(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in STOPWORDS or word.isspace():
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in acceptable_types and tags[1][1] in second_type

In [39]:
# Type: List of List
tokens = df['tokens'].tolist()

In [43]:
# Type List
flatten_token = list(functools.reduce(operator.concat, tokens))

In [49]:
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(flatten_token)

In [64]:
bigram_freq = [
    bigram for bigram in list(bigram_finder.ngram_fd.items())
    if is_valid_2_gram(bigram[0])
]

In [65]:
bigram_df = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
bigram_df.head()

Unnamed: 0,bigram,freq
2,"(hong, kong)",292
435,"(kong, disneyland)",102
266,"(disney, park)",100
70,"(theme, park)",94
14,"(hk, disneyland)",88


## Get Trigrams

In [57]:
def is_valid_3_gram(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in STOPWORDS or word.isspace():
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    return tags[0][1] in first_type and tags[2][1] in third_type

In [59]:
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(flatten_token)

In [66]:
trigram_freq = [
    trigram for trigram in list(trigram_finder.ngram_fd.items())
    if is_valid_3_gram(trigram[0])
]

In [67]:
triram_df = pd.DataFrame(list(trigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
triram_df.head()

Unnamed: 0,bigram,freq
287,"(lion, king, show)",43
1,"(disneyland, hong, kong)",38
20,"(iron, man, experience)",32
62,"(happiest, place, earth)",28
650,"(toy, story, land)",27
