In [2]:
import pandas as pd 
import numpy as np

TWEET_DATA = pd.read_csv("dataset_sementara.csv", encoding = "ISO-8859-1")

TWEET_DATA.head()

Unnamed: 0,tweet,label
0,"""RT @mas__piyuuu: Bantu Bekasi, Anies Baswedan...",0
1,"""@LisaAmartatara3 @fadlizon Yg dibangga bangga...",1
2,"""RT @RustamIbrahim: POLLING: Dari 4 gubernur d...",1
3,"""ojok ampe Anies Baswedan menikmati hsil proye...",1
4,"""RT @ghanieierfan: Pertemuan sangat mengharuka...",0


In [3]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].str.lower()


print('Case Folding Result : \n')
print(TWEET_DATA['tweet'].head(5))
print('\n\n\n')

Case Folding Result : 

0    "rt @mas__piyuuu: bantu bekasi, anies baswedan...
1    "@lisaamartatara3 @fadlizon yg dibangga bangga...
2    "rt @rustamibrahim: polling: dari 4 gubernur d...
3    "ojok ampe anies baswedan menikmati hsil proye...
4    "rt @ghanieierfan: pertemuan sangat mengharuka...
Name: tweet, dtype: object






In [4]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------
#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_multiple)


# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

TWEET_DATA['tweet_tokens'] = TWEET_DATA['tweet'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(TWEET_DATA['tweet_tokens'].head())
print('\n\n\n')

Tokenizing Result : 

0    [rt, maspiyuuu, bantu, bekasi, anies, baswedan...
1    [lisaamartatara, fadlizon, yg, dibangga, bangg...
2    [rt, rustamibrahim, polling, dari, gubernur, d...
3    [ojok, ampe, anies, baswedan, menikmati, hsil,...
4    [rt, ghanieierfan, pertemuan, sangat, mengharu...
Name: tweet_tokens, dtype: object






In [5]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(TWEET_DATA['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(rt, 1), (maspiyuuu, 1), (bantu, 1), (bekasi,...
1    [(lisaamartatara, 1), (fadlizon, 1), (yg, 1), ...
2    [(rt, 1), (rustamibrahim, 1), (polling, 1), (d...
3    [(ojok, 1), (ampe, 1), (anies, 1), (baswedan, ...
4    [(rt, 1), (ghanieierfan, 1), (pertemuan, 1), (...
Name: tweet_tokens_fdist, dtype: object


In [6]:
from nltk.corpus import stopwords

# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja'])

list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if not word in list_stopwords]

%timeit TWEET_DATA['tweet_tokens_fdist_WSW'] = TWEET_DATA['tweet_tokens_fdist'].apply(stopwords_removal) 


print(TWEET_DATA['tweet_tokens_fdist_WSW'].head())

64.2 ms ± 2.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
0    [maspiyuuu, bantu, bekasi, anies, baswedan, me...
1    [lisaamartatara, fadlizon, dibangga, banggain,...
2    [rustamibrahim, polling, gubernur, jawa, ridwa...
3    [ojok, ampe, anies, baswedan, menikmati, hsil,...
4    [ghanieierfan, pertemuan, mengharukan, aniesba...
Name: tweet_tokens_fdist_WSW, dtype: object


In [7]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in TWEET_DATA['tweet_tokens_fdist_WSW']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_tokens_fdist_WSW'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['tweet_tokens_stemmed'])

65138
------------------------
maspiyuuu : maspiyuuu
bantu : bantu
bekasi : bekas
anies : anies
baswedan : baswedan
melayani : layan
warga : warga
negara : negara
indonesia : indonesia
httpstcobiwxvhsvknntinggal : httpstcobiwxvhsvknntinggal
dilantik : lantik
inimah : inimah
pru : pru
lisaamartatara : lisaamartatara
fadlizon : fadlizon
dibangga : bangga
banggain : banggain
buzzer : buzzer
cendana : cendana
yah : yah
cuman : cuman
orangn : orangn
alm : alm
soeharton : soeharton
baswedannyg : baswedannyg
lewathehe : lewathehe
rustamibrahim : rustamibrahim
polling : polling
gubernur : gubernur
jawa : jawa
ridwan : ridwan
kamil : kamil
ganjar : ganjar
pranowo : pranowo
khofifah : khofifah
indar : indar
parawansa : parawansa
penu : penu
ojok : ojok
ampe : ampe
menikmati : nikmat
hsil : hsil
proyek : proyek
monas : monas
yng : yng
penuh : penuh
kejanggalannya : janggal
usutskandalmonasgate : usutskandalmonasgate
ghanieierfan : ghanieierfan
pertemuan : temu
mengharukan : haru
aniesbaswedan : a

KeyboardInterrupt: 

In [None]:
TWEET_DATA.to_csv("Text_Preprocessing.csv")

In [None]:
TWEET_DATA.to_excel("Text_Preprocessing.xlsx")

In [None]:
TWEET_DATA.to_hdf("Text_Preprocessing.h5")