In [None]:
pip install pyLDAvis



In [None]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l[K     |▍                               | 10 kB 15.6 MB/s eta 0:00:01[K     |▊                               | 20 kB 19.1 MB/s eta 0:00:01[K     |█                               | 30 kB 22.5 MB/s eta 0:00:01[K     |█▍                              | 40 kB 24.0 MB/s eta 0:00:01[K     |█▊                              | 51 kB 11.2 MB/s eta 0:00:01[K     |██                              | 61 kB 8.9 MB/s eta 0:00:01[K     |██▍                             | 71 kB 8.8 MB/s eta 0:00:01[K     |██▊                             | 81 kB 9.5 MB/s eta 0:00:01[K     |███                             | 92 kB 8.5 MB/s eta 0:00:01[K     |███▍                            | 102 kB 9.2 MB/s eta 0:00:01[K     |███▊                            | 112 kB 9.2 MB/s eta 0:00:01[K     |████                            | 122 kB 9.2 MB/s eta 0:00:01[K     |████▍                           | 133 kB 9.2 MB/s eta 0:00:01[K  

In [1]:
import functools
import operator
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models

from gensim.corpora import Dictionary
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import Word

  from collections import Iterable


In [15]:
# Fetch every NLTK resource this notebook downloads, in the original order.
nltk_resources = ('wordnet', 'punkt', 'averaged_perceptron_tagger', 'stopwords')
download_status = [nltk.download(resource) for resource in nltk_resources]

# Display the status returned by the final download.
download_status[-1]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# English stop-word list from NLTK; clean_text() drops every token found in it.
STOPWORDS = stopwords.words('english')

In [3]:
# Location of the reviews CSV that is loaded into `df` below.
# NOTE(review): this looks like a mounted Google Drive path (Colab), but no
# mount cell is visible in this notebook. Confirm the drive is mounted before
# this runs, or point the path at a local copy of the file.
file_path = "/content/drive/MyDrive/Hackathon/NanoGiants3/datasets/DisneylandReviews.csv"

In [4]:
# Load the raw reviews from the CSV and preview the first rows.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [5]:
# Show column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          1008 non-null   int64 
 1   Rating             1008 non-null   int64 
 2   Year_Month         1008 non-null   object
 3   Reviewer_Location  1008 non-null   object
 4   Review_Text        1008 non-null   object
 5   Branch             1008 non-null   object
dtypes: int64(2), object(4)
memory usage: 47.4+ KB


# Preprocessing

In [6]:
def get_language_code(text):
    """Detect the language of ``text`` with langdetect.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str or float
        The language code reported by ``langdetect.detect`` (e.g. ``"en"``),
        or ``np.nan`` when the input is not a non-blank string or langdetect
        raises ``LangDetectException``.
    """
    # Missing values (None/NaN) and blank strings carry nothing to detect;
    # report them as missing instead of letting detect() fail on them.
    if not isinstance(text, str) or not text.strip():
        return np.nan
    try:
        return detect(text)
    except LangDetectException:
        return np.nan

In [7]:
# langdetect is non-deterministic by default; fix its seed so repeated runs
# assign the same language code to each review.
DetectorFactory.seed = 0

# Detect the language of every review. Still a bottleneck on large datasets
# (one langdetect call per review), but applying to the column avoids building
# a row object for every record as df.apply(..., axis=1) does.
df["language_code"] = df["Review_Text"].apply(get_language_code)

In [8]:
# Number of reviews whose detected language is not English
# (reviews with no detected language are counted as well).
int((df["language_code"] != "en").sum())

0

In [9]:
def clean_text(text: str) -> str:
    """Normalise a review into a space-separated string of lemmatised tokens.

    Processing steps, in order:
      1. every URL (``http`` followed by non-whitespace) is replaced by a space;
      2. the text is lower-cased and each run of non-word characters is
         collapsed into a single space;
      3. tokens found in ``STOPWORDS`` are dropped;
      4. each remaining token is lemmatised with textblob's ``Word.lemmatize()``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    # Set membership is constant-time, whereas testing each token against the
    # STOPWORDS list scans the whole list for every token.
    stop_words = set(STOPWORDS)

    # Replace every URL with a space.
    text = re.sub(r'http\S+', ' ', text)

    # Lower-case, then replace each run of non-word characters with one space.
    text = re.sub(r'\W+', ' ', text.lower())

    return " ".join(
        Word(token).lemmatize()
        for token in text.split()
        if token not in stop_words
    )

In [10]:
# Clean every review with clean_text() and keep the result in a new column.
df['text_cleaned'] = df['Review_Text'].apply(clean_text)

In [11]:
# Split each cleaned review into a list of tokens with NLTK's word_tokenize.
df['tokens'] = df['text_cleaned'].apply(word_tokenize)

In [12]:
# Preview the frame with the added language_code, text_cleaned and tokens columns.
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,language_code,text_cleaned,tokens
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,en,ever disneyland anywhere find disneyland hong ...,"[ever, disneyland, anywhere, find, disneyland,..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,en,since last time visit hk disneyland yet time s...,"[since, last, time, visit, hk, disneyland, yet..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,en,thanks god hot humid visiting park otherwise w...,"[thanks, god, hot, humid, visiting, park, othe..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,en,hk disneyland great compact park unfortunately...,"[hk, disneyland, great, compact, park, unfortu..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,en,location city took around 1 hour kowlon kid li...,"[location, city, took, around, 1, hour, kowlon..."


# LDA

In [13]:
# Token lists, one per review; they feed both the vocabulary and the corpus.
texts = df['tokens']

# Vocabulary built from all token lists.
id2word = Dictionary(texts)

# Bag-of-words representation of every review.
corpus = [id2word.doc2bow(tokens) for tokens in texts]

In [17]:
# Train a 4-topic LDA model on the bag-of-words corpus. The corpus must be
# passed to the constructor: without it gensim only initialises the model and
# leaves it untrained, so the topics would reflect nothing from the reviews.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=42,  # fixed seed so the topics are reproducible
    passes=10,        # number of passes over the corpus during training
    update_every=1,
    alpha='auto',     # learn the document-topic prior from the corpus
    per_word_topics=True
)

In [18]:
# Enable pyLDAvis's notebook mode before the visualisation is displayed below.
pyLDAvis.enable_notebook()

In [19]:
# Build the pyLDAvis view of the LDA model from the model, the bag-of-words
# corpus and the vocabulary, then display it as the cell output.
vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
