PREPROCESSING


In [None]:
import numpy
import pandas as pd
import string
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Load Dataset
# Read the review CSV, decoding the file with the 'unicode_escape' codec.
data = pd.read_csv("sample.csv", encoding= 'unicode_escape')
# Keep the review text column; the cleaning steps below operate on it.
review = data['text']

# Last expression of the cell: displays the loaded frame as the cell output.
data


Unnamed: 0,text,sentiment
0,sangat memuaskan service nya hanya kedepan nya...,negative
1,saya pesan tiket hotel via traveloka mudah beg...,positive
2,bagus tetap menjadi pilihan pertamapesan tiket...,negative
3,buat saya aplikasi ticketing terbaik di indone...,positive
4,aplikasi nya mudah digunakan dari segi fitur s...,positive
...,...,...
3958,maaf ini sampai sekarang belum ada respon sald...,negative
3959,ganti no karna no kita sudah mati susah nya mi...,negative
3960,pelayanan cs kurang memuaskan,neutral
3961,lupa pin kok tdk ada cs ny,neutral


DATA CLEANING

In [None]:
# Data Cleaning

def removePunctuation(x):
    """Strip digits, non-ASCII characters and punctuation from one review.

    Parameters
    ----------
    x : str or missing value
        Raw review text. ``None`` and float ``NaN`` (what pandas produces for
        an empty CSV field) are treated as an empty review; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The text with digit runs deleted, every run of non-ASCII characters
        replaced by one space, ``.``, ``-`` and ``/`` replaced by a space, and
        all remaining ``string.punctuation`` characters deleted. Whitespace is
        not normalised here.
    """
    # Missing values would make re.sub raise TypeError, so map them to "".
    if not isinstance(x, str):
        # NaN is the only float that is not equal to itself.
        if x is None or (isinstance(x, float) and x != x):
            return ""
        x = str(x)
    # removing number
    x = re.sub(r"\d+", "", x)
    # removing non ASCII character
    x = re.sub(r"[^\x00-\x7f]+", " ", x)
    # '.', '-' and '/' become spaces (instead of being deleted with the rest
    # of the punctuation) so the text on either side is not glued together.
    for separator in (".", "-", "/"):
        x = x.replace(separator, " ")
    # removing punctuation: the third argument lists characters to delete
    return x.translate(str.maketrans("", "", string.punctuation))

# Run the digit / non-ASCII / punctuation cleanup on every review.
review = review.apply(removePunctuation)

# removing whitespace
def removeWhitespace_LT(x):
    """Return ``x`` with its leading and trailing whitespace removed."""
    trimmed = x.strip()
    return trimmed

# Trim leading and trailing whitespace from every review.
review = review.apply(removeWhitespace_LT)

# removing multiple whitespace
def removeWhitespace_multiple(x):
    """Collapse every run of whitespace in ``x`` into a single space.

    Tabs and newlines count as whitespace, so they are replaced as well.
    Leading and trailing whitespace is reduced to one space, not removed.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence
    # (DeprecationWarning on older Pythons, SyntaxWarning from Python 3.12).
    return re.sub(r"\s+", " ", x)

# Collapse repeated whitespace in every review.
review = review.apply(removeWhitespace_multiple)

# Print the first five cleaned reviews as a quick sanity check.
print('Data Cleaning Result : \n')
print(review.head(5))

In [None]:
# Case Folding
# Lower-case every review via the vectorized string accessor.
review = review.str.lower()

# Print the first five reviews after case folding.
print('Case Folding Result : \n')
print(review.head(5))

In [None]:
# Stopword Removal
# Download the NLTK resources used below: the stopword corpus and the Punkt
# tokenizer model that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt model from the 'punkt_tab' resource, so
# word_tokenize raises LookupError when only 'punkt' is present. On versions
# that do not know this package the call just reports an error and returns
# False, so it is safe to request both.
nltk.download('punkt_tab')
# Indonesian stopword list from NLTK; a set gives fast membership checks when
# filtering tokens.
stop_words = set(stopwords.words('indonesian'))

def stopword_removal(x):
    """Return ``x`` with every token found in ``stop_words`` dropped.

    The text is split with ``word_tokenize``; the tokens that are kept are
    joined back together with single spaces.
    """
    kept_words = []
    for word in word_tokenize(x):
        if word not in stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)

# Remove stopwords from every review.
review = review.apply(stopword_removal)

# Print the first rows (head() defaults to five) after stopword removal.
print('Stopword Removal Result : \n')
print(review.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Stopword Removal Result : 

0    memuaskan service nya kedepan nyaharus cermat ...
1    pesan tiket hotel via traveloka mudah refund b...
2    bagus pilihan pertamapesan tiket batalin maska...
3    aplikasi ticketing terbaik indonesia traveloka...
4    aplikasi nya mudah segi fitur pemesanan pembay...
Name: text, dtype: object


In [None]:
# Write the preprocessed review text to CSV without the index column.
# Only `review` (the text column) is saved here, not the rest of `data`.
review.to_csv('preprocessing.csv', index=False)

SPLIT DATASET SETELAH DATA PREPROCESSING DILAKUKAN LABELLING


In [None]:
# Train / test / valid split with a 60% / 20% / 20% ratio
# Load dataset
import numpy
import pandas as pd
from numpy.random import RandomState

df = pd.read_csv("sample.csv", encoding= 'unicode_escape')
# Fixed seed so that re-running the notebook reproduces the same split;
# change the value to draw a different shuffle.
rng = RandomState(42)

# Shuffle all rows once, then cut the shuffled frame at the 60% and 80% marks.
shuffled = df.sample(frac=1, random_state=rng)
train_end = int(.6 * len(df))
test_end = int(.8 * len(df))

# Positional slices keep the three parts disjoint and covering every row.
# (.iloc is used instead of numpy.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.)
train = shuffled.iloc[:train_end]
test = shuffled.iloc[train_end:test_end]
valid = shuffled.iloc[test_end:]

In [None]:
# Preview the first 10 rows of the training split.
print(train.head(10))

                                                   text sentiment
2198  aplikasi linkaja bagus memudahkan bertransaksi...  positive
3144  adlh pengguna linkaja mengalami akun terblokir...  negative
2904  lengkap banget pake aplikasi dompet digital pi...  positive
3804               gagal upgret coba pakai ribetttttttt  negative
2870  linkaja mantap banget e wallet pembayaran kese...  positive
2352  linkaja aplikasi bagus bermanfaat memudahkan m...  positive
80    mudah cepat hemat booking beli ticket liburan ...  positive
787   makasih aplikasinya bagus nama asli nomor asin...  positive
58    terbantu traveloka mudah informasi disediakan ...  positive
2925  aplikasi mempermudah transaksi elektronik sala...  positive


In [None]:
# Preview the first 10 rows of the test split.
print(test.head(10))

                                                   text sentiment
3304  apk taik pulihkan akun aja susah nya udah kaya...  negative
3565           insentif prakerja link aja lemot e walet  negative
3837                             eror ya ga login hadeh   neutral
366                     cepat pelayanannya informasinya  positive
309   aplikasi penunjang kegiatan perjalanan recomended  positive
1092  gua kasih bintang top banget membingungkan hat...  positive
3807                   parah sulit login input kode otp  negative
3208  kesini jelek ngisi saldo atm nggak gagal konek...  negative
2331  aplikasi membantu era modern transaksi simple ...  positive
2899  terima kasih linkaja gak ribet ribet kalo baya...  positive


In [None]:
# Preview the first 10 rows of the validation split.
print(valid.head(10))

                                                   text sentiment
2305  aplikasi bagus temui kenyamanan kenyamanan man...  positive
1262        bagus bangat aplikasinya makasih cetcontact  positive
1260                              sejjauh sih msh bagus  positive
2105  aplikasi nya bagus sekalii recommended bangett...  positive
3111  niat promo bagus promo bikin malu gabungan per...  positive
2868  wow keren sih aplikasi nya berguna banget fitu...  positive
1727  apk ngk sesuai dgn iklankan apk anjinggggggggg...  negative
1578  yg lihat nama kontak taunya gak penipuan gw ha...   neutral
3874                               top up free fire tdk  positive
2656  aplikasi bagus si rekomendasi banget zaman ser...  positive


In [None]:
# Write each split to its own CSV file; index=False leaves out the row index.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
valid.to_csv('valid.csv', index=False)

In [None]:
# Read the saved training split back in to inspect what was written.
# NOTE: this rebinds `data`, which earlier held the full sample.csv frame.
data = pd.read_csv("train.csv")
# Last expression of the cell: displays the frame as the cell output.
data

Unnamed: 0,text,sentiment
0,saya upgrade full service kog gagal terus ya j...,negative
1,mau log in saja susah padahal tidak seribet in...,negative
2,habis kirim saldo ke link aja tapi tidak masuk...,negative
3,terima kasih apk nya sangat membantu sekali,positive
4,ini gimana kok gak bisa masuk ke aplikasi nya ...,neutral
...,...,...
1976,susah bgt buat login aja selalu gagal verifika...,negative
1977,susah g bisa login pake wa ceklis pake google ...,negative
1978,aplikasinya mudah dan lengkap mudahan makin ba...,positive
1979,aplikasi ini sangat bagus,positive
