# Filter the data

In [None]:
# Load the review export, reading only the columns the pipeline uses.
import pandas as pd
df = pd.read_csv(
    'https://drive.google.com/uc?id=1jASI_I_shP2rhIR-BIOnu3nR6TNgh2pg',
    usecols=['userName', 'content', 'score'],
)

In [None]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,userName,content,score
0,MaxPro,Saya kemaren hampir dapat Samsung M22 and Jam(...,1
1,Kasron Siregar,Setiap saya main shopee tan dan ketika sudah p...,1
2,ferdy ferdyan,Semua prodak yang saya beli di ol shop shoppy ...,1
3,Abdul Hakim,Tolong y shopee pesanan saya 2112144TTRFCNC di...,1
4,Fifiana Radianov,Shopee lg knp sih? Udh di update dr Googleplay...,1
...,...,...,...
19995,May Sharoh,Mantappp,5
19996,Mohamad Rudi,Saya pengguna shopee lama tolong pengajuan sho...,5
19997,Baim Satria,Puas belanja di shopee,5
19998,J.Rita Latief,Shopee is the best,5


# Preprocessing Data

In [None]:
# Rename through an explicit old -> new mapping rather than assigning
# df.columns positionally: read_csv does not honour the order of `usecols`,
# so a positional assignment silently mislabels columns if the file's column
# order differs. rename() also ignores names that are already renamed, so the
# cell can be re-run safely.
df = df.rename(columns={'userName': 'username',
                        'content': 'comment',
                        'score': 'rating'})
df

Unnamed: 0,username,comment,rating
0,MaxPro,Saya kemaren hampir dapat Samsung M22 and Jam(...,1
1,Kasron Siregar,Setiap saya main shopee tan dan ketika sudah p...,1
2,ferdy ferdyan,Semua prodak yang saya beli di ol shop shoppy ...,1
3,Abdul Hakim,Tolong y shopee pesanan saya 2112144TTRFCNC di...,1
4,Fifiana Radianov,Shopee lg knp sih? Udh di update dr Googleplay...,1
...,...,...,...
19995,May Sharoh,Mantappp,5
19996,Mohamad Rudi,Saya pengguna shopee lama tolong pengajuan sho...,5
19997,Baim Satria,Puas belanja di shopee,5
19998,J.Rita Latief,Shopee is the best,5


In [None]:
import numpy as np

# Work on a copy so the loaded frame `df` is not mutated by the later steps.
DATA = df.copy()
# Drop every non-ASCII character from the comments. Series.str.encode returns
# a NEW Series of bytes, so the result must be decoded back to str and
# assigned to the column; calling it without assignment changes nothing.
DATA['comment'] = DATA['comment'].str.encode('ascii', 'ignore').str.decode('ascii')
DATA.head()

Unnamed: 0,username,comment,rating
0,MaxPro,Saya kemaren hampir dapat Samsung M22 and Jam(...,1
1,Kasron Siregar,Setiap saya main shopee tan dan ketika sudah p...,1
2,ferdy ferdyan,Semua prodak yang saya beli di ol shop shoppy ...,1
3,Abdul Hakim,Tolong y shopee pesanan saya 2112144TTRFCNC di...,1
4,Fifiana Radianov,Shopee lg knp sih? Udh di update dr Googleplay...,1


In [None]:
# ------ Case Folding --------
# Lowercase all comments with the vectorised pandas string accessor.
DATA['comment'] = DATA['comment'].str.lower()

print('Case Folding Result : \n')
print(DATA['comment'].iloc[:5])

Case Folding Result : 

0    saya kemaren hampir dapat samsung m22 and jam(...
1    setiap saya main shopee tan dan ketika sudah p...
2    semua prodak yang saya beli di ol shop shoppy ...
3    tolong y shopee pesanan saya 2112144ttrfcnc di...
4    shopee lg knp sih? udh di update dr googleplay...
Name: comment, dtype: object






In [None]:
import string 
import re #regex library

import nltk
# word_tokenize needs the Punkt sentence model. Older NLTK releases ship it as
# 'punkt'; newer releases look for 'punkt_tab' instead. Request both: asking
# for a resource the installed version does not know only logs a message.
nltk.download('punkt')
nltk.download('punkt_tab')

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and URLs.

    Args:
        text: One raw comment. Any non-string value (for example NaN for a
            missing comment) is treated as empty text.

    Returns:
        The cleaned comment as a str, with whitespace runs collapsed to
        single spaces before the final URL-prefix replacement.
    """
    if not isinstance(text, str):
        # .apply() passes NaN (a float) for missing comments; avoid crashing on it
        return ""
    # remove literal escape residue: backslash-t, backslash-n, backslash-u, and any leftover backslash
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # replace non-ASCII characters (emoticons, CJK text, etc.) with '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mentions, hashtags and full links; a raw string keeps \w, \/ and \S
    # as regex escapes instead of invalid Python string escapes
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # remove incomplete URLs (a bare scheme with nothing after it)
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean every comment element-wise with remove_special.
DATA['comment'] = DATA['comment'].map(remove_special)

# strip digits
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Delete digits from every comment.
DATA['comment'] = DATA['comment'].map(remove_number)

# strip punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    drop_table = str.maketrans("", "", string.punctuation)
    return text.translate(drop_table)

# Delete punctuation from every comment.
DATA['comment'] = DATA['comment'].map(remove_punctuation)

# trim leading & trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` with whitespace removed from both ends."""
    trimmed = text.strip()
    return trimmed

# Trim both ends of every comment.
DATA['comment'] = DATA['comment'].map(remove_whitespace_LT)

# collapse runs of whitespace into a single space
def remove_whitespace_multiple(text):
    """Return ``text`` with each run of whitespace replaced by one space.

    Leading or trailing whitespace is collapsed to a single space, not removed.
    """
    # raw string: '\s' in a plain string literal is an invalid Python escape
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every comment.
DATA['comment'] = DATA['comment'].map(remove_whitespace_multiple)

# drop stand-alone single letters
def remove_singl_char(text):
    """Return ``text`` with every isolated ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Remove isolated single letters from every comment.
DATA['comment'] = DATA['comment'].map(remove_singl_char)

# NLTK word tokenizer
def word_tokenize_wrapper(text):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned comment into its own column.
DATA['tokens'] = DATA['comment'].map(word_tokenize_wrapper)

print('Tokenizing Result : \n')
print(DATA['tokens'].head(5))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenizing Result : 

0    [saya, kemaren, hampir, dapat, samsung, and, j...
1    [setiap, saya, main, shopee, tan, dan, ketika,...
2    [semua, prodak, yang, saya, beli, di, ol, shop...
3    [tolong, shopee, pesanan, saya, ttrfcnc, di, p...
4    [shopee, lg, knp, sih, udh, di, update, dr, go...
Name: tokens, dtype: object






In [None]:
# NLTK frequency distribution
def freqDist_wrapper(text):
    """Return an NLTK ``FreqDist`` built from ``text`` (the pipeline passes a token list)."""
    distribution = FreqDist(text)
    return distribution

# Per-comment token counts.
DATA['tokens_fdist'] = DATA['tokens'].map(freqDist_wrapper)

print('Frequency Tokens : \n')
print(DATA['tokens_fdist'].head(5).apply(lambda fdist: fdist.most_common()))

Frequency Tokens : 

0    [(saya, 4), (dapat, 2), (and, 2), (flash, 2), ...
1    [(saya, 5), (tidak, 5), (shopee, 4), (dan, 3),...
2    [(saya, 3), (shoppy, 3), (prodak, 2), (beli, 2...
3    [(di, 5), (tolong, 3), (shopee, 3), (nya, 3), ...
4    [(gak, 4), (shopee, 3), (udh, 3), (update, 3),...
Name: tokens_fdist, dtype: object


In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# ----------------------- get stopword from NLTK stopword -------------------------------
# start from NLTK's Indonesian stopword list
list_stopwords = stopwords.words('indonesian')
print(len(list_stopwords))

# ---------------------------- manually add stopwords -----------------------------------
# append additional informal / abbreviated stopwords
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# Read the stopword text file with pandas. The raw.githubusercontent.com
# address is required: the github.com/.../blob/... address serves the HTML
# viewer page, so the "stopwords" read from it would be HTML fragments.
link_stopword = 'https://raw.githubusercontent.com/Muhammad-Yunus/Neural-Network/master/3.%20Convolutional%20Neural%20Network/Text%20Preprocessing/stopwords.txt'
txt_stopword = pd.read_csv(link_stopword, names= ["stopwords"], header = None)

# Split the first row on spaces and append the words.
# NOTE(review): assumes the file keeps all stopwords space-separated on its first line - confirm.
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
# ---------------------------------------------------------------------------------------

# convert the list to a set: removes duplicates and makes the per-token
# membership test in the removal step a constant-time lookup
list_stopwords = set(list_stopwords)

# remove stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in order."""
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Token lists with stopwords filtered out.
DATA['tokens_WSW'] = DATA['tokens'].map(stopwords_removal)

print(DATA['tokens_WSW'].iloc[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
758
0    [kemaren, samsung, and, jamlupa, merkred, flas...
1    [main, shopee, tan, panen, voucher, tanam, vou...
2    [prodak, beli, ol, shop, shoppy, produk, bekas...
3    [tolong, shopee, pesanan, ttrfcnc, proses, pen...
4    [shopee, lg, knp, udh, update, dr, googleplay,...
5    [banget, event, kayak, pengirimannya, dampak, ...
6    [pembayaran, spaylaterpas, tanyatidak, membisa...
7    [no, tertera, apk, ganti, no, tlpnya, dr, udh,...
8    [kesekian, kalinya, mendaftar, spaylater, gaga...
9    [app, shoope, hp, lemot, emang, sistem, tolong...
Name: tokens_WSW, dtype: object


In [None]:
# load the word-normalization table from the Excel sheet
normalizad_word = pd.read_excel("https://drive.google.com/uc?id=1e8HYwX_dWk6K22Dr0Qz67GrgzUpWViL-")

# Map first-column value -> second-column value. Columns are addressed by
# position with .iloc (integer keys on a label-indexed row, as in row[0], are
# deprecated in pandas). setdefault keeps the first mapping seen for a
# repeated key.
normalizad_word_dict = {}

for raw_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Return ``document`` with each term replaced by its ``normalizad_word_dict`` entry, if any."""
    # dict.get falls back to the term itself when there is no mapping for it
    return [normalizad_word_dict.get(term, term) for term in document]

# Normalize the stopword-free token lists.
DATA['normalized'] = DATA['tokens_WSW'].map(normalized_term)

DATA['normalized'].iloc[:10]

0    [kemarin, samsung, and, jamlupa, merkred, flas...
1    [main, shopee, tan, panen, voucher, tanam, vou...
2    [prodak, beli, oleh, shop, shoppy, produk, bek...
3    [tolong, shopee, pesanan, ttrfcnc, proses, pen...
4    [shopee, lagi, kenapa, sudah, update, dari, go...
5    [banget, event, kayak, pengirimannya, dampak, ...
6    [pembayaran, spaylaterpas, tanyatidak, membisa...
7    [nomor, tertera, apk, ganti, nomor, tlpnya, da...
8    [kesekian, kalinya, mendaftar, spaylater, gaga...
9    [app, shoope, hp, lambat, memang, sistem, tolo...
Name: normalized, dtype: object

In [None]:
# remove more words from the data
# Extra words filtered out after normalization. Every entry needs its own
# trailing comma: two adjacent string literals are silently concatenated
# ('shope' 'kali' becomes 'shopekali'), which kept 'kali' out of the list.
list_special_stopwords = ['deh', 'shopee', 'saya', 'sudah', 'salah', 'tidak',
                          'aplikasi', 'bagaimana', 'lagi', 'bagus', 'banget',
                          'tapi', 'apk', 'app', 'shoope', 'shoppe', 'shope',
                          'kali', 'suka', 'pakai']

def special_stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_special_stopwords``, in order."""
    kept = []
    for word in words:
        if word in list_special_stopwords:
            continue
        kept.append(word)
    return kept

# Filter the extra stopwords out of the normalized token lists.
DATA['normalized'] = DATA['normalized'].map(special_stopwords_removal)

print(DATA['normalized'].iloc[:10])

0    [kemarin, samsung, and, jamlupa, merkred, flas...
1    [main, tan, panen, voucher, tanam, voucher, mi...
2    [prodak, beli, oleh, shop, shoppy, produk, bek...
3    [tolong, pesanan, ttrfcnc, proses, penjual, ti...
4    [kenapa, update, dari, googleplay, pas, bayar,...
5    [event, kayak, pengirimannya, dampak, selanjut...
6    [pembayaran, spaylaterpas, tanyatidak, membisa...
7    [nomor, tertera, ganti, nomor, tlpnya, dari, n...
8    [kesekian, kalinya, mendaftar, spaylater, gaga...
9    [hp, lambat, memang, sistem, tolong, recomend,...
Name: normalized, dtype: object


In [None]:
# Join each normalized token list back into one space-separated string.
DATA['cleaned'] = DATA['normalized'].map(lambda tokens: ' '.join(tokens).strip())

In [None]:
# Show the first rows with every intermediate column side by side.
DATA.head()

Unnamed: 0,username,comment,rating,tokens,tokens_fdist,tokens_WSW,normalized,cleaned
0,MaxPro,saya kemaren hampir dapat samsung and jamlupa...,1,"[saya, kemaren, hampir, dapat, samsung, and, j...","{'saya': 4, 'kemaren': 1, 'hampir': 1, 'dapat'...","[kemaren, samsung, and, jamlupa, merkred, flas...","[kemarin, samsung, and, jamlupa, merkred, flas...",kemarin samsung and jamlupa merkred flash sale...
1,Kasron Siregar,setiap saya main shopee tan dan ketika sudah p...,1,"[setiap, saya, main, shopee, tan, dan, ketika,...","{'setiap': 1, 'saya': 5, 'main': 2, 'shopee': ...","[main, shopee, tan, panen, voucher, tanam, vou...","[main, tan, panen, voucher, tanam, voucher, mi...",main tan panen voucher tanam voucher minyak go...
2,ferdy ferdyan,semua prodak yang saya beli di ol shop shoppy ...,1,"[semua, prodak, yang, saya, beli, di, ol, shop...","{'semua': 1, 'prodak': 2, 'yang': 1, 'saya': 3...","[prodak, beli, ol, shop, shoppy, produk, bekas...","[prodak, beli, oleh, shop, shoppy, produk, bek...",prodak beli oleh shop shoppy produk bekas kwal...
3,Abdul Hakim,tolong shopee pesanan saya ttrfcnc di proses ...,1,"[tolong, shopee, pesanan, saya, ttrfcnc, di, p...","{'tolong': 3, 'shopee': 3, 'pesanan': 1, 'saya...","[tolong, shopee, pesanan, ttrfcnc, proses, pen...","[tolong, pesanan, ttrfcnc, proses, penjual, ti...",tolong pesanan ttrfcnc proses penjual tipu kay...
4,Fifiana Radianov,shopee lg knp sih udh di update dr googleplay ...,1,"[shopee, lg, knp, sih, udh, di, update, dr, go...","{'shopee': 3, 'lg': 2, 'knp': 1, 'sih': 1, 'ud...","[shopee, lg, knp, udh, update, dr, googleplay,...","[kenapa, update, dari, googleplay, pas, bayar,...",kenapa update dari googleplay pas bayar listri...


In [None]:
# Keep only the final columns, exposing the cleaned text as 'comment'.
DATA = DATA[['username', 'cleaned', 'rating']].rename(columns={'cleaned': 'comment'})
# write the data to csv
DATA.to_csv("data_cleaned.csv")
DATA

Unnamed: 0,username,comment,rating
0,MaxPro,kemarin samsung and jamlupa merkred flash sale...,1
1,Kasron Siregar,main tan panen voucher tanam voucher minyak go...,1
2,ferdy ferdyan,prodak beli oleh shop shoppy produk bekas kwal...,1
3,Abdul Hakim,tolong pesanan ttrfcnc proses penjual tipu kay...,1
4,Fifiana Radianov,kenapa update dari googleplay pas bayar listri...,1
...,...,...,...
19995,May Sharoh,mantappp,5
19996,Mohamad Rudi,pengguna tolong pengajuan paylater setujui,5
19997,Baim Satria,puas belanja,5
19998,J.Rita Latief,is the best,5


In [None]:
# remove empty data
# dropna() returns a new frame; without assigning it back nothing is removed.
DATA = DATA.dropna()
# A comment whose tokens were all filtered out is an empty string, not NaN, so
# dropna() does not catch it; it would come back as NaN when the CSV is
# re-read, so drop those rows as well.
DATA = DATA[DATA['comment'].str.strip() != '']
DATA.rating.value_counts()

5    5000
4    5000
2    5000
1    5000
Name: rating, dtype: int64

In [None]:
# Persist the final frame as CSV (the index is written as the first column).
DATA.to_csv(path_or_buf="data_cleaned.csv")