# **Table of Contents**

## **1. Business Understanding**

## **2. Data Understanding**

## **3. Data Preprocessing**

### **3.1 Import Library**

In [1]:
# Scraping data from google play store
from google_play_scraper import app
from google_play_scraper import Sort, reviews

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### **3.2 Scrapping Data**

In [2]:
# Get reviews from google play store
result, continuation_token = reviews(
    'com.bca',
    lang='id', # defaults to 'id'
    country='id', # defaults to 'id'
    sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    count=1000, # defaults to 100
    filter_score_with=None # defaults to None(means all score)
)

In [3]:
# Convert to dataframe
df = pd.DataFrame(np.array(result),columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))

df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,cf3f7b98-2ad8-4e15-b033-825190bf0389,Chris Nur,https://play-lh.googleusercontent.com/a/AGNmyx...,ok,5,0,4.0.3,2023-03-27 16:03:27,Thank you for the review. Hopefully the BCA mo...,2023-03-27 17:27:17
1,13ee74ae-92a0-4370-8e65-8426ea139379,shadow_owl 82,https://play-lh.googleusercontent.com/a-/ACB-R...,Sangat membantu,5,0,4.0.3,2023-03-27 15:59:12,Terima kasih atas ulasannya. Semoga aplikasi B...,2023-03-27 17:27:27
2,26644161-52ee-4fda-9399-7c9bfb429b26,tata kmy,https://play-lh.googleusercontent.com/a/AGNmyx...,Ribet pake verifikasi muka segala udah kayak p...,1,0,4.0.3,2023-03-27 15:54:45,Mhn maaf atas kendalanya. Penambahan verifikas...,2023-03-27 17:27:30
3,b8c6b163-a108-4c0c-905e-961d837877d2,Jhoy Wae351,https://play-lh.googleusercontent.com/a/AGNmyx...,Susah buka rek baru.. selalu gagal masuk via s...,3,0,4.0.3,2023-03-27 15:39:23,Mhn maaf atas ketidaknyamanannya Bpk/Ibu. Moho...,2023-03-27 17:27:36
4,1cd736ad-6ba5-4032-b6e2-e6e7cb71401e,Fidyah Romadhan,https://play-lh.googleusercontent.com/a-/ACB-R...,Ribet bgt skrg harus verifikasi KTP dulu berka...,1,0,4.0.3,2023-03-27 14:58:13,Mhn maaf atas kendalanya. Penambahan verifikas...,2023-03-27 15:22:08


In [4]:
# Make a new dataframe with only 4 columns (userName, score, content, at)
new_df = df[['userName', 'score', 'content', 'at']]
new_df.head()

# Save to csv
import os.path
new_df.to_csv(os.path.join('E:\KAMPUS\BERKAS - BERKAS\Skripsi\Dataset','bcaMobile_reviews.csv'), index=False)

### **3.3 Data Review**

In [5]:
# Check the data
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   userName  1000 non-null   object        
 1   score     1000 non-null   int64         
 2   content   1000 non-null   object        
 3   at        1000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 31.4+ KB


In [6]:
# Check null value
new_df.isnull().sum()

userName    0
score       0
content     0
at          0
dtype: int64

In [7]:
# Check duplicate value
new_df.duplicated().sum()

0

In [8]:
# Check empty string in content column
new_df['content'].str.isspace().sum()

0

### **3.4 Text Preprocessing**

In [9]:
# Labelling the sentiment based on score
def sentiment(score):
    if score >= 4:
        return 'Positive'
    elif score == 3:
        return 'Neutral'
    else:
        return 'Negative'
    
new_df['sentiment'] = new_df['score'].apply(sentiment)
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['sentiment'] = new_df['score'].apply(sentiment)


Unnamed: 0,userName,score,content,at,sentiment
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive
1,shadow_owl 82,5,Sangat membantu,2023-03-27 15:59:12,Positive
2,tata kmy,1,Ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative
3,Jhoy Wae351,3,Susah buka rek baru.. selalu gagal masuk via s...,2023-03-27 15:39:23,Neutral
4,Fidyah Romadhan,1,Ribet bgt skrg harus verifikasi KTP dulu berka...,2023-03-27 14:58:13,Negative


In [10]:
# Count the sentiment and make the chart using plotly, where Positive is green, Neutral is yellow, and Negative is red
fig = px.histogram(new_df, x="sentiment", color="sentiment", title="Sentiment Count")
fig.show()

In [11]:
# Case folding
new_df['content'] = new_df['content'].str.lower()
new_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,userName,score,content,at,sentiment
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative
3,Jhoy Wae351,3,susah buka rek baru.. selalu gagal masuk via s...,2023-03-27 15:39:23,Neutral
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative


In [12]:
# Remove punctuation
new_df['content'] = new_df['content'].str.replace('[^\w\s]','')

# Remove number
new_df['content'] = new_df['content'].str.replace('\d+','')

# Remove whitespace leading & trailing
new_df['content'] = new_df['content'].str.strip()

# Remove whitespace double or more
new_df['content'] = new_df['content'].str.replace('\s+',' ')

# Remove emoji
import re
def remove_emoji(string):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

new_df['content'] = new_df['content'].apply(remove_emoji)
new_df.head()


The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change 

Unnamed: 0,userName,score,content,at,sentiment
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative
3,Jhoy Wae351,3,susah buka rek baru selalu gagal masuk via sms...,2023-03-27 15:39:23,Neutral
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative


In [13]:
# Tokenizing
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

content_token = []
for i in new_df['content']:
    content_token.append(word_tokenize(i))

new_df['content_token'] = content_token
new_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,userName,score,content,at,sentiment,content_token
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive,[ok]
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive,"[sangat, membantu]"
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative,"[ribet, pake, verifikasi, muka, segala, udah, ..."
3,Jhoy Wae351,3,susah buka rek baru selalu gagal masuk via sms...,2023-03-27 15:39:23,Neutral,"[susah, buka, rek, baru, selalu, gagal, masuk,..."
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative,"[ribet, bgt, skrg, harus, verifikasi, ktp, dul..."


In [14]:
# Check stopwords in Bahasa Indonesia
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords = stopwords.words('indonesian')

# Remove stopwords
new_df['content_token'] = new_df['content_token'].apply(lambda x: [item for item in x if item not in stopwords])
new_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,userName,score,content,at,sentiment,content_token
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive,[ok]
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive,[membantu]
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative,"[ribet, pake, verifikasi, muka, udah, kayak, p..."
3,Jhoy Wae351,3,susah buka rek baru selalu gagal masuk via sms...,2023-03-27 15:39:23,Neutral,"[susah, buka, rek, gagal, masuk, via, sms, per..."
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative,"[ribet, bgt, skrg, verifikasi, ktp, berkali, g..."


In [15]:
# Stemming with Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

content_stem = []
for i in new_df['content_token']:
    content_stem.append([stemmer.stem(j) for j in i])

new_df['content_stem'] = content_stem
new_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,userName,score,content,at,sentiment,content_token,content_stem
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive,[ok],[ok]
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive,[membantu],[bantu]
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative,"[ribet, pake, verifikasi, muka, udah, kayak, p...","[ribet, pake, verifikasi, muka, udah, kayak, p..."
3,Jhoy Wae351,3,susah buka rek baru selalu gagal masuk via sms...,2023-03-27 15:39:23,Neutral,"[susah, buka, rek, gagal, masuk, via, sms, per...","[susah, buka, rek, gagal, masuk, via, sms, per..."
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative,"[ribet, bgt, skrg, verifikasi, ktp, berkali, g...","[ribet, bgt, skrg, verifikasi, ktp, kali, gaga..."


In [16]:
# Open the dictionary of slang words in Bahasa Indonesia
with open(r"E:\KAMPUS\BERKAS - BERKAS\Skripsi\Workstation\NLP_bahasa_resources\combined_slang_words.txt", 'r') as slang_words:
    slang_words = slang_words.read().splitlines()

# Remove slang words, if there is any slang words in the content column
new_df['content_stem'] = new_df['content_stem'].apply(lambda x: [item for item in x if item not in slang_words])
new_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,userName,score,content,at,sentiment,content_token,content_stem
0,Chris Nur,5,ok,2023-03-27 16:03:27,Positive,[ok],[ok]
1,shadow_owl 82,5,sangat membantu,2023-03-27 15:59:12,Positive,[membantu],[bantu]
2,tata kmy,1,ribet pake verifikasi muka segala udah kayak p...,2023-03-27 15:54:45,Negative,"[ribet, pake, verifikasi, muka, udah, kayak, p...","[ribet, pake, verifikasi, muka, udah, kayak, p..."
3,Jhoy Wae351,3,susah buka rek baru selalu gagal masuk via sms...,2023-03-27 15:39:23,Neutral,"[susah, buka, rek, gagal, masuk, via, sms, per...","[susah, buka, rek, gagal, masuk, via, sms, per..."
4,Fidyah Romadhan,1,ribet bgt skrg harus verifikasi ktp dulu berka...,2023-03-27 14:58:13,Negative,"[ribet, bgt, skrg, verifikasi, ktp, berkali, g...","[ribet, bgt, skrg, verifikasi, ktp, kali, gaga..."


## **4. Modelling**

In [17]:
# Modelling with PyCaret
from pycaret.classification import *

# Setup the environment
exp_clf = setup(data = new_df, target = 'sentiment', session_id=123, silent=True)

# Compare the model
best_model = compare_models(sort='Accuracy', n_select=3)
best_model

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.734
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.022
svm,SVM - Linear Kernel,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.024
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.024
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.107
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.073
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.315
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.089
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.313
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.187


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=123, splitter='best'),
 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_it