# Importing necessary things

In [1]:
import pandas as pd
import numpy as np
import sklearn

import sklearn.metrics as metrics
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, RegexpTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

from collections import Counter

import re
from datetime import datetime

import pickle

import string

==================================================================================================================

# Retrieving lists and making Dataframes

In [2]:
# Retrieving the lists.

import bz2

def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Parameters
    ----------
    file : str or path-like
        Location of the compressed pickle (e.g. a ``.pbz2`` file).

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading, so only use
    this on files produced by a trusted source.
    """
    # The context manager closes the handle even when unpickling raises;
    # the handle was previously never closed explicitly.
    with bz2.BZ2File(file, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# Seeing if everything transferred properly

# Load both pickled review lists, then report how many entries each holds.
reviewlistglobal = decompress_pickle('compressedlistreview.pbz2')
reviewlistsea = decompress_pickle('compressedlistreview2.pbz2')

for loaded_reviews in (reviewlistglobal, reviewlistsea):
    print(len(loaded_reviews))

6469
9950


In [6]:
# Build a DataFrame from the first loaded review list and preview its first rows.
dfglobal = pd.DataFrame(reviewlistglobal)
dfglobal.head()

Unnamed: 0,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,appId
0,pretty good,5,0,1.0.1,2019-01-15 22:34:05,,com.gravity.romNAg
1,oh how I have missed this game,5,0,1.0.1,2019-01-12 09:36:02,,com.gravity.romNAg
2,Just like the original Ragnarok Online on PC. ...,5,1,1.0.5,2019-08-12 02:33:52,,com.gravity.romNAg
3,Good so far,5,0,1.2.2,2020-08-12 19:28:59,,com.gravity.romNAg
4,love it,5,0,1.0.1,2019-03-14 17:33:28,,com.gravity.romNAg


In [7]:
# Build a DataFrame from the second loaded review list and preview its first rows.
dfsea = pd.DataFrame(reviewlistsea)
dfsea.head()

Unnamed: 0,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,appId
0,all in 1 good gaming experience ...,5,0,,2018-12-07 05:05:47,,com.gravity.romg
1,"Good game, bisa nostalgia maen Ragnarok dan ve...",5,0,1.0.5,2018-11-05 23:08:30,,com.gravity.romg
2,nostalgia. brings back the memories,5,0,1.0.5,2018-10-31 20:57:24,,com.gravity.romg
3,Great game,5,0,1.0.25,2020-01-17 23:09:46,,com.gravity.romg
4,need Bard & Sage class ...,5,0,1.0.8,2019-04-30 16:22:49,,com.gravity.romg


# Changing datatypes to correct datatypes.

In [8]:
#Changing datatype to correct datatype as they are all objects.

# Show the column dtypes of each frame, global first and then sea.
for review_frame in (dfglobal, dfsea):
    print(review_frame.dtypes)

content                 object
score                   object
thumbsUpCount           object
reviewCreatedVersion    object
at                      object
replyContent            object
appId                   object
dtype: object
content                 object
score                   object
thumbsUpCount           object
reviewCreatedVersion    object
at                      object
replyContent            object
appId                   object
dtype: object


In [9]:
# Target dtype for every column; the same mapping is applied to both frames.
column_dtypes = {
    'content': 'string',
    'score': 'int',
    'thumbsUpCount': 'int',
    'reviewCreatedVersion': 'string',
    'at': 'string',
    'replyContent': 'string',
    'appId': 'string',
}

# Cast column by column, global frame first and then the sea frame.
for review_frame in (dfglobal, dfsea):
    for column_name, target_dtype in column_dtypes.items():
        review_frame[column_name] = review_frame[column_name].astype(target_dtype)

In [14]:
# checking for null values.

# Per-column missing-value counts for both frames, separated by a rule.
print(
    dfglobal.isna().sum(),
    '-------------------------',
    dfsea.isna().sum(),
    sep='\n',
)

content                 0
score                   0
thumbsUpCount           0
reviewCreatedVersion    0
at                      0
replyContent            0
appId                   0
dtype: int64
-------------------------
content                 0
score                   0
thumbsUpCount           0
reviewCreatedVersion    0
at                      0
replyContent            0
appId                   0
dtype: int64


==================================================================================================================

# EDA

In [15]:
# Looking at distribution of ratings in both global and sea

# Count of reviews per score for each frame, separated by a rule.
print(
    dfglobal['score'].value_counts(),
    '---------------------',
    dfsea['score'].value_counts(),
    sep='\n',
)

5    3082
1    1766
4     649
3     565
2     407
Name: score, dtype: int64
---------------------
5    9950
Name: score, dtype: int64


In [None]:
# NOTE(review): `df` is not created by any cell shown above (only `dfglobal`
# and `dfsea` are) — confirm which frame this and the following cells should use.
# Tally how many reviews carry each score.
scorecount = Counter(df['score'])
# Up to five (score, count) pairs, most frequent first.
scorecount.most_common(5)

In [None]:
# Plotting distribution of ratings out of all the reviews
# Interestingly, it is very similar to overall rating distribution across all top mobile games.


# Split the (score, count) pairs into bar positions and bar heights.
top_scores = scorecount.most_common(5)
xaxis = [val[0] for val in top_scores]
yaxis = [val[1] for val in top_scores]

plt.figure(figsize=(10,10))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis, y=yaxis)

plt.title('% Rating for Ragnarok Online Mobile')
plt.ylabel('Count', fontsize = 20)
plt.xlabel('Rating', fontsize = 20)

# Annotate each bar twice: its percentage of all plotted reviews above the
# bar, and its raw height inside the bar. The total is computed once.
total_reviews = sum(yaxis)
for p in ax.patches:
    bar_top = (p.get_x() + p.get_width() / 2., p.get_height())
    ax.annotate("%.2f" % (p.get_height()*100/total_reviews)+'%', bar_top,
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')
    ax.annotate(str(p.get_height()), bar_top,
                ha='center', va='center', fontsize=12, color='black', xytext=(0, -60),
                textcoords='offset points')

In [None]:
# How often each distinct thumbs-up count occurs.
df['thumbsUpCount'].value_counts()

In [None]:
# How often each distinct reviewCreatedVersion value occurs.
df['reviewCreatedVersion'].value_counts()

### Datetime cleaning

In [None]:
# How often each distinct `at` value occurs.
df['at'].value_counts()

In [None]:
# Keep only the first ten characters of each `at` string, then recount.
df['at'] = df['at'].str.slice(0, 10)
df['at'].value_counts()

In [None]:
# Characters 5-6 of the `at` string become the new `month` column.
df['month'] = df['at'].str.slice(5, 7)
df['month']

In [None]:
# Parse `at` as dates in YYYY-MM-DD form; this overwrites the string column,
# so string accessors (.str) no longer apply to it afterwards.
df['at'] = pd.to_datetime(df['at'], format='%Y-%m-%d')
df['at']

In [None]:
# Number of reviews per distinct `at` value, as a two-column frame.
reviewtimecounts = df['at'].value_counts().reset_index(name='counts')
# Name the columns explicitly so later cells can rely on 'date' and 'counts'.
reviewtimecounts.columns = ['date', 'counts']
reviewtimecounts

In [None]:
# Order the per-date counts by date (ascending) for plotting over time.
reviewtimecounts = reviewtimecounts.sort_values(by='date')
reviewtimecounts

In [None]:
# Number of reviews per distinct `month` value, as a two-column frame.
reviewmonthcount = df['month'].value_counts().reset_index(name='counts')
# Name the columns explicitly so later cells can rely on 'month' and 'counts'.
reviewmonthcount.columns = ['month', 'counts']
reviewmonthcount

In [None]:
# Line chart of the per-date review counts.
fig, ax = plt.subplots(figsize=(16, 10))

ax.plot(reviewtimecounts['date'], reviewtimecounts['counts'], color='red')

ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Number of Reviews throughout history of ROM")

plt.show()

In [None]:
plt.figure(figsize=(18,13))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=reviewmonthcount['month'], y=reviewmonthcount['counts'])

plt.title('Distribution of Reviews in ROM by Month', fontsize = 20)
plt.ylabel('Count', fontsize = 20)
plt.xlabel('Month', fontsize = 20)

# Annotate each bar with its share of all reviews (above the bar) and its raw
# height (just below the top). The total is computed once, not once per bar.
total_month_reviews = sum(reviewmonthcount['counts'])
for p in ax.patches:
    bar_top = (p.get_x() + p.get_width() / 2., p.get_height())
    ax.annotate("%.2f" % (p.get_height()*100/total_month_reviews)+'%', bar_top,
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')
    ax.annotate(str(p.get_height()), bar_top,
                ha='center', va='center', fontsize=12, color='black', xytext=(0, -8),
                textcoords='offset points')

## Meta Features

In [None]:
# Display the review text column.
df['content']

In [None]:
# Text meta-features; each value is coerced with str() before measuring.

# Number of whitespace-separated words.
df["numwords"] = df["content"].apply(lambda x: len(str(x).split()))

# Number of distinct whitespace-separated words.
df["numuniquewords"] = df["content"].apply(lambda x: len(set(str(x).split())))

# Number of characters.
df["numchars"] = df["content"].apply(lambda x: len(str(x)))

# Number of characters that appear in string.punctuation.
df["numpuncts"] = df['content'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# Number of words for which str.isupper() is true.
df["numcapts"] = df["content"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

# Number of words for which str.istitle() is true.
df["numtitles"] = df["content"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))


def _mean_word_length(text):
    """Return the average word length of ``text``, or NaN when it has no words."""
    lengths = [len(w) for w in str(text).split()]
    # np.mean([]) also yields NaN but emits a RuntimeWarning for every empty
    # or whitespace-only review; return NaN explicitly instead.
    return np.mean(lengths) if lengths else np.nan


# Average length of the whitespace-separated words.
df["meanwordlength"] = df["content"].apply(_mean_word_length)

In [None]:
# Display the word-count feature.
df['numwords']

In [None]:
## Truncate some extreme values for better visuals ##
# NOTE: this caps the values stored in df itself, not only what is plotted.
# A single df.loc[rows, column] assignment replaces the chained
# df[column].loc[rows] = value form, which writes through an intermediate
# Series (SettingWithCopyWarning; no effect on df under pandas copy-on-write).
df.loc[df['numwords'] > 250, 'numwords'] = 250
df.loc[df['numchars'] > 1250, 'numchars'] = 1250
df.loc[df['numpuncts'] > 45, 'numpuncts'] = 45

f, axes = plt.subplots(3, 1, figsize=(10,20))

# (column, y-axis label, title) for each panel, top to bottom. Driving the
# panels from one table puts every y-label on its own axes; the second and
# third labels were previously all written onto axes[0].
length_panels = [
    ('numwords', 'Number of Words', "Number of words in each rating"),
    ('numchars', 'Number of Characters', "Number of characters in each rating"),
    ('numpuncts', 'Number of Punctuations', "Number of punctuations in each rating"),
]
for panel_ax, (column, ylabel, title) in zip(axes, length_panels):
    sns.boxplot(x='score', y=column, data=df, ax=panel_ax)
    panel_ax.set_xlabel('Rating', fontsize=10)
    panel_ax.set_ylabel(ylabel, fontsize=10)
    panel_ax.set_title(title, fontsize=12)
plt.show()

In [None]:
# Cap extreme values so the boxplots stay readable.
# NOTE: this caps the values stored in df itself, not only what is plotted.
# A single df.loc[rows, column] assignment replaces the chained
# df[column].loc[rows] = value form, which writes through an intermediate
# Series (SettingWithCopyWarning; no effect on df under pandas copy-on-write).
df.loc[df['numcapts'] > 36, 'numcapts'] = 36
df.loc[df['numtitles'] > 60, 'numtitles'] = 60
df.loc[df['meanwordlength'] > 18, 'meanwordlength'] = 18

f, axes = plt.subplots(3, 1, figsize=(10,20))

# (column, y-axis label, title) for each panel, top to bottom. Driving the
# panels from one table puts every y-label on its own axes; the second and
# third labels were previously all written onto axes[0]. The last panel plots
# `meanwordlength`, so its text now says word length rather than word count.
case_panels = [
    ('numcapts', 'Number of Capitalized Words', "Number of Capitalized words in each rating"),
    ('numtitles', 'Number of Title Characters', "Number of title in each rating"),
    ('meanwordlength', 'Mean Word Length', "Mean word length in each rating"),
]
for panel_ax, (column, ylabel, title) in zip(axes, case_panels):
    sns.boxplot(x='score', y=column, data=df, ax=panel_ax)
    panel_ax.set_xlabel('Rating', fontsize=10)
    panel_ax.set_ylabel(ylabel, fontsize=10)
    panel_ax.set_title(title, fontsize=12)
plt.show()

==================================================================================================================

# Tokenizing

In [None]:
# Separating reviews into their scores.

# One frame per score. Each is an explicit copy because later cells add
# columns ('wordlist', 'wordlistlem') to these frames; assigning into a bare
# slice of df raises SettingWithCopyWarning.
rating1 = df.loc[df['score'] == 1].copy()
rating2 = df.loc[df['score'] == 2].copy()
rating3 = df.loc[df['score'] == 3].copy()
rating4 = df.loc[df['score'] == 4].copy()
rating5 = df.loc[df['score'] == 5].copy()

In [None]:
# Start from NLTK's English stop words ...
stopwords_list = list(stopwords.words('english'))
# ... add every single punctuation character ...
stopwords_list.extend(string.punctuation)
# ... multi-character punctuation tokens ...
stopwords_list.extend(["''", '""', '...', '``'])
# ... single digits ...
stopwords_list.extend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
# ... and contraction fragments.
stopwords_list.extend(["'d", "'ve", "ca", "n't",  "'t", "'re", "'ll", "'s"])

In [None]:
def process_reviews(reviews):
    """Tokenize a review and drop stop words.

    The text is split with ``nltk.word_tokenize``; each token is lower-cased
    and kept only when its lower-cased form is not in ``stopwords_list``.
    Returns the kept tokens as a list, in their original order.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(reviews):
        lowered = raw_token.lower()
        if lowered not in stopwords_list:
            kept_tokens.append(lowered)
    return kept_tokens

In [None]:
# Tokenize the review text of every per-rating frame and time the whole pass.
starttime = datetime.now()

for rating_frame in (rating1, rating2, rating3, rating4, rating5):
    rating_frame['wordlist'] = rating_frame['content'].apply(process_reviews)

print(datetime.now() - starttime)

In [None]:
# One shared lemmatizer instance, reused for every call below.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list with ``lemmatizer.lemmatize`` applied to each item of ``text``."""
    lemmas = []
    for word in text:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
# Lemmatize the token lists of every per-rating frame and time the whole pass.
starttime = datetime.now()

for rating_frame in (rating1, rating2, rating3, rating4, rating5):
    rating_frame['wordlistlem'] = rating_frame['wordlist'].apply(lemmatize_text)

print(datetime.now() - starttime)

# Saving Dataframes

In [None]:
# Saving tokens into a sub-folder for tidiness.

from pathlib import Path

# All processed-review pickles live under ./tokens, one file per rating.
root = Path('.')
token_dir = root / "tokens"

my_path1 = token_dir / "processedreviewrating1.pbz2"
my_path2 = token_dir / "processedreviewrating2.pbz2"
my_path3 = token_dir / "processedreviewrating3.pbz2"
my_path4 = token_dir / "processedreviewrating4.pbz2"
my_path5 = token_dir / "processedreviewrating5.pbz2"

import bz2

def compressed_pickle(path, data):
    """Pickle ``data`` into a bz2-compressed file at ``path``.

    Parameters
    ----------
    path : str, path-like, or file object
        Destination handed to ``bz2.BZ2File``. When it is a filesystem path,
        any missing parent directories are created first.
    data : object
        Any picklable object.
    """
    import os

    # Writing into a sub-folder that does not exist yet raised
    # FileNotFoundError; create it up front. File objects are passed through
    # untouched.
    if isinstance(path, (str, bytes, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    with bz2.BZ2File(path, 'w') as f:
        pickle.dump(data, f)

In [None]:
# Write each per-rating frame to its matching path, rating 1 through 5.
rating_outputs = zip(
    (my_path1, my_path2, my_path3, my_path4, my_path5),
    (rating1, rating2, rating3, rating4, rating5),
)
for token_path, rating_frame in rating_outputs:
    compressed_pickle(token_path, rating_frame)

In [None]:
# Retrieve our pickled dataframes

# Read the five per-rating frames back, in rating order 1 through 5.
rating1, rating2, rating3, rating4, rating5 = [
    decompress_pickle(token_path)
    for token_path in (my_path1, my_path2, my_path3, my_path4, my_path5)
]

# Word Frequency in all reviews

In [None]:
starttime = datetime.now()


def _flatten_tokens(token_lists):
    """Concatenate an iterable of token lists into one flat list."""
    flat = []
    for tokens in token_lists:
        flat.extend(tokens)
    return flat


# One flat list of lemmatized tokens per rating.
processedreviewrating1 = _flatten_tokens(rating1['wordlistlem'])
processedreviewrating2 = _flatten_tokens(rating2['wordlistlem'])
processedreviewrating3 = _flatten_tokens(rating3['wordlistlem'])
processedreviewrating4 = _flatten_tokens(rating4['wordlistlem'])
processedreviewrating5 = _flatten_tokens(rating5['wordlistlem'])

# Elapsed time, a separator, then the token count of each rating.
print(datetime.now() - starttime)
print('------------')
for rating_tokens in (
    processedreviewrating1,
    processedreviewrating2,
    processedreviewrating3,
    processedreviewrating4,
    processedreviewrating5,
):
    print(len(rating_tokens))

In [None]:
# Pool the tokens of all five ratings into one new list (rating 1 first).
totalprocessedreviews = []
for rating_tokens in (
    processedreviewrating1,
    processedreviewrating2,
    processedreviewrating3,
    processedreviewrating4,
    processedreviewrating5,
):
    totalprocessedreviews.extend(rating_tokens)
len(totalprocessedreviews)

In [None]:
# Frequency distribution over the pooled tokens of all ratings.
review_freqdist = FreqDist(totalprocessedreviews)
# The 20 most frequent tokens with their counts.
review_freqdist.most_common(20)

In [None]:
mostcommonwords20 = review_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis20 = [val[1] for val in mostcommonwords20]
xaxis20 = [val[0] for val in mostcommonwords20]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis20, y=yaxis20)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total20 = sum(yaxis20)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total20)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

# Top 20 words per Rating [Lemmatized]

In [None]:
# Generic words that would otherwise dominate the per-rating rankings.
uselesswords = ['game', 'play', 'get', 'still', 'would']
# Drop them from the rating-1 tokens; slice assignment keeps the same list object.
processedreviewrating1[:] = [token for token in processedreviewrating1 if token not in uselesswords]

In [None]:
# Frequency distribution over the rating-1 tokens.
review1_freqdist = FreqDist(processedreviewrating1)
# The 20 most frequent tokens with their counts.
review1_freqdist.most_common(20)

In [None]:
mostcommonwords201 = review1_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis201 = [val[1] for val in mostcommonwords201]
xaxis201 = [val[0] for val in mostcommonwords201]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis201, y=yaxis201)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews for Rating 1', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total201 = sum(yaxis201)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total201)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Drop the words listed in `uselesswords` from the rating-2 tokens (same list object).
processedreviewrating2[:] = [token for token in processedreviewrating2 if token not in uselesswords]

In [None]:
# Frequency distribution over the rating-2 tokens.
review2_freqdist = FreqDist(processedreviewrating2)
# The 20 most frequent tokens with their counts.
review2_freqdist.most_common(20)

In [None]:
mostcommonwords202 = review2_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis202 = [val[1] for val in mostcommonwords202]
xaxis202 = [val[0] for val in mostcommonwords202]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis202, y=yaxis202)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews for Rating 2', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total202 = sum(yaxis202)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total202)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Drop the words listed in `uselesswords` from the rating-3 tokens (same list object).
processedreviewrating3[:] = [token for token in processedreviewrating3 if token not in uselesswords]

In [None]:
# Frequency distribution over the rating-3 tokens.
review3_freqdist = FreqDist(processedreviewrating3)
# The 20 most frequent tokens with their counts.
review3_freqdist.most_common(20)

In [None]:
mostcommonwords203 = review3_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis203 = [val[1] for val in mostcommonwords203]
xaxis203 = [val[0] for val in mostcommonwords203]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis203, y=yaxis203)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews for Rating 3', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total203 = sum(yaxis203)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total203)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Drop the words listed in `uselesswords` from the rating-4 tokens (same list object).
processedreviewrating4[:] = [token for token in processedreviewrating4 if token not in uselesswords]

In [None]:
# Frequency distribution over the rating-4 tokens.
review4_freqdist = FreqDist(processedreviewrating4)
# The 20 most frequent tokens with their counts.
review4_freqdist.most_common(20)

In [None]:
mostcommonwords204 = review4_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis204 = [val[1] for val in mostcommonwords204]
xaxis204 = [val[0] for val in mostcommonwords204]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis204, y=yaxis204)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews for Rating 4', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total204 = sum(yaxis204)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total204)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Drop the words listed in `uselesswords` from the rating-5 tokens (same list object).
processedreviewrating5[:] = [token for token in processedreviewrating5 if token not in uselesswords]

In [None]:
# Frequency distribution over the rating-5 tokens.
review5_freqdist = FreqDist(processedreviewrating5)
# The 20 most frequent tokens with their counts.
review5_freqdist.most_common(20)

In [None]:
mostcommonwords205 = review5_freqdist.most_common(20)

# Bar heights (counts) and bar labels (words).
yaxis205 = [val[1] for val in mostcommonwords205]
xaxis205 = [val[0] for val in mostcommonwords205]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis205, y=yaxis205)
plt.title('Top 20 Most Common Words in Ragnarok Mobile Reviews for Rating 5', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Label each bar with its share of the plotted top-20 total (computed once).
total205 = sum(yaxis205)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total205)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

# Bigrams/Trigrams overall

In [None]:
def n_gramall(df_col, n=None):
    """Count the n-grams of one fixed size across a column of text.

    Parameters
    ----------
    df_col : pandas.Series
        Column of documents; its values are cast to unicode strings before
        vectorizing.
    n : int
        N-gram size, used as both the lower and upper bound of
        ``ngram_range``. NOTE(review): the ``None`` default is passed straight
        into ``ngram_range``; every call in this notebook passes an int.

    Returns
    -------
    list of (str, int) tuples
        (n-gram, total count) pairs sorted by count, highest first.

    Notes
    -----
    Stop words come from the module-level ``gramstopwords_list``, and n-grams
    found in fewer than 10 documents are dropped (``min_df=10``).
    """
    # Build the vectorizer for the requested n-gram size, removing stop words.
    vect = CountVectorizer(ngram_range = (n,n), stop_words = gramstopwords_list, min_df = 10)
    # Fit/transform the vectorizer on the specified column.
    vect_fit = vect.fit_transform(df_col.values.astype('U'))
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vect, 'get_feature_names_out'):
        word_list = vect.get_feature_names_out()
    else:
        word_list = vect.get_feature_names()
    # Sum the sparse matrix directly rather than densifying the full
    # documents-by-vocabulary matrix just to add up its columns.
    count_list = np.asarray(vect_fit.sum(axis=0)).ravel()
    # Pair every n-gram with its total count.
    count_dict = dict(zip(word_list, count_list))
    # Highest counts first.
    sort_vocab = sorted(count_dict.items(), key=lambda x: x[1], reverse=True)
    return sort_vocab

In [None]:
# Stop words for the n-gram vectorizer: NLTK English stop words, every single
# punctuation character, and a few multi-character punctuation tokens.
gramstopwords_list = list(stopwords.words('english'))
gramstopwords_list.extend(string.punctuation)
gramstopwords_list.extend(["''", '""', '...', '``'])

In [None]:
# Bigram counts over the review text of `df`, highest count first.
bigramall = n_gramall(df['content'], n=2)
# Preview the top 20.
bigramall[:20]

In [None]:
mostcommonwords20bigram = bigramall[:20]

# Bar heights (counts) and bar labels (bigrams).
yaxis20bigram = [val[1] for val in mostcommonwords20bigram]
xaxis20biram = [val[0] for val in mostcommonwords20bigram]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis20biram, y=yaxis20bigram, palette ="rocket_r")
plt.title('Top 20 Most Bigrams in Ragnarok Mobile Reviews Overall', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Label each bar with its share of the plotted top-20 total (computed once).
total20bigram = sum(yaxis20bigram)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total20bigram)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Trigram counts over the review text of `df`, highest count first.
trigramall = n_gramall(df['content'], n=3)
# Preview the top 20.
trigramall[:20]

In [None]:
mostcommonwords20trigram = trigramall[:20]

# Bar heights (counts) and bar labels (trigrams).
yaxis20trigram = [val[1] for val in mostcommonwords20trigram]
xaxis20triram = [val[0] for val in mostcommonwords20trigram]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis20triram, y=yaxis20trigram, palette ="cubehelix")
plt.title('Top 20 Most Trigrams in Mobile Game Reviews Overall', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Label each bar with its share of the plotted top-20 total (computed once).
total20trigram = sum(yaxis20trigram)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total20trigram)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

# Bigrams/Trigrams for specific ratings

In [None]:
# Rating 1

# Bigram counts over the rating-1 review text, highest count first.
bigram1 = n_gramall(rating1['content'], n=2)
# Preview the top 20.
bigram1[:20]

In [None]:
mostcommonwords201bi = bigram1[:20]

# Bar heights (counts) and bar labels (bigrams).
yaxis201bi = [val[1] for val in mostcommonwords201bi]
xaxis201bi = [val[0] for val in mostcommonwords201bi]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis201bi, y=yaxis201bi, palette ="vlag_r")
plt.title('Top 20 Most Common Bigrams in Ragnarok Mobile Reviews for 1 Rating', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Label each bar with its share of the plotted top-20 total (computed once).
total201bi = sum(yaxis201bi)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total201bi)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Trigram counts over the rating-1 review text, highest count first.
trigram1 = n_gramall(rating1['content'], n=3)
# Preview the top 20.
trigram1[:20]

-------

In [None]:
# Rating 2

# Bigram counts over the rating-2 review text, highest count first.
bigram2 = n_gramall(rating2['content'], n=2)
# Preview the top 20.
bigram2[:20]

In [None]:
# Returns nothing.

#trigram2 = n_gramall(rating2['content'], n=3)
#trigram2[:20]

--------

In [None]:
# Rating 3

# Bigram counts over the rating-3 review text, highest count first.
bigram3 = n_gramall(rating3['content'], n=2)
# Preview the top 20.
bigram3[:20]

In [None]:
mostcommonwords203bi = bigram3[:20]

# Bar heights (counts) and bar labels (bigrams).
yaxis203bi = [val[1] for val in mostcommonwords203bi]
xaxis203bi = [val[0] for val in mostcommonwords203bi]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis203bi, y=yaxis203bi, palette ="Spectral")
plt.title('Top 20 Most Common Bigrams in Mobile Game Reviews for 3 Rating', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Label each bar with its share of the plotted top-20 total (computed once).
total203bi = sum(yaxis203bi)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total203bi)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Returns nothing.

#trigram3 = n_gramall(rating3['content'], n=3)
#trigram3[:20]

---------

In [None]:
# Rating 4

# Bigram counts over the rating-4 review text, highest count first.
bigram4 = n_gramall(rating4['content'], n=2)
# Preview the top 20.
bigram4[:20]

In [None]:
mostcommonwords204bi = bigram4[:20]

# Bar heights (counts) and bar labels (bigrams).
yaxis204bi = [val[1] for val in mostcommonwords204bi]
xaxis204bi = [val[0] for val in mostcommonwords204bi]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis204bi, y=yaxis204bi, palette ="Spectral")
# This chart shows rating-4 bigrams; the title previously said "5 Rating".
plt.title('Top 20 Most Common Bigrams in Mobile Game Reviews for 4 Rating', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Percentages are relative to this chart's own counts. The denominator used
# to be yaxis205bi, which belongs to the rating-5 chart and is not yet defined
# when the notebook is run top to bottom.
total204bi = sum(yaxis204bi)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total204bi)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Returns nothing.

#trigram4 = n_gramall(rating4['content'], n=3)
#trigram4[:20]

------

In [None]:
# Rating 5

# Bigram counts over the rating-5 review text, highest count first.
bigram5 = n_gramall(rating5['content'], n=2)
# Preview the top 20.
bigram5[:20]

In [None]:
mostcommonwords205bi = bigram5[:20]

# Bar heights (counts) and bar labels (bigrams).
yaxis205bi = [val[1] for val in mostcommonwords205bi]
xaxis205bi = [val[0] for val in mostcommonwords205bi]

plt.figure(figsize=(18, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.barplot(x=xaxis205bi, y=yaxis205bi, palette ="Spectral")
plt.title('Top 20 Most Common Bigrams in Mobile Game Reviews for 5 Rating', fontsize = 25)
plt.ylabel('Frequency of word', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15, ha='right')

# Label each bar with its share of the plotted top-20 total (computed once).
total205bi = sum(yaxis205bi)
for p in ax.patches:
    ax.annotate("%.2f" % (p.get_height()*100/total205bi)+'%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black', xytext=(0, 20),
                textcoords='offset points')

In [None]:
# Trigram counts over the rating-5 review text, highest count first.
trigram5 = n_gramall(rating5['content'], n=3)
# Preview the top 20.
trigram5[:20]