In [1]:
import pandas as pd 
import numpy as np

# Load the tweets to preprocess into a DataFrame (comma-separated file).
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run elsewhere until it is pointed at a local copy of the CSV.
TWEET_DATA = pd.read_csv("C:/Users/Irgi/Documents/Data/Raw Data/Clear/text_processing.csv", delimiter = ',')

# Preview the first rows. The file's unnamed first column is loaded as a regular
# column, which is why the preview shows "Unnamed: 0" next to "Tweet".
TWEET_DATA.head()

Unnamed: 0,Tweet
0,Segera rilis tanggal Mei di Korea dan udah te...
1,Enak banget cuma karna polos bisa debut main f...
2,Elon musk
3,Reyhan
4,jahat ya iya


In [2]:
# ------ Case Folding --------
# Lower-case every tweet with the pandas Series.str.lower() accessor.
# The 'Tweet' column is overwritten, so the original casing is not kept.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].str.lower()


# Show the first five lower-cased tweets as a quick check.
print('Case Folding Result : \n')
print(TWEET_DATA['Tweet'].head(5))

Case Folding Result : 

0    segera rilis tanggal  mei di korea dan udah te...
1    enak banget cuma karna polos bisa debut main f...
2                                            elon musk
3                                               reyhan
4                                         jahat ya iya
Name: Tweet, dtype: object


In [3]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    r"""Strip Twitter-specific noise from one tweet.

    Steps, in order:
      1. Replace the literal two-character sequences ``\t``, ``\n`` and ``\u``
         with a space, then drop any remaining backslash.
      2. Replace every non-ASCII character with ``?`` (ASCII round trip using
         the ``'replace'`` error handler).
      3. Replace mentions (``@name``), hashtags (``#tag``) and complete URLs
         (``scheme://...``) with a space and collapse whitespace runs.
      4. Replace a leftover bare ``http://`` / ``https://`` prefix with a space.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    # remove literal "\t", "\n", "\u" residue and any stray backslash
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # replace non-ASCII characters (emoticons, CJK text, etc.) with '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, hashtag and link.
    # \w+ (instead of [A-Za-z0-9]+) also consumes underscores, so "@user_name"
    # is removed whole rather than leaving "_name" behind. The pattern is a raw
    # string so the regex escapes are not treated as (invalid) string escapes.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URL (scheme prefix with nothing attached to it)
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean every tweet; the result overwrites the 'Tweet' column.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_tweet_special)

# strip digits
def remove_number(text):
    """Return ``text`` with every run of digits deleted.

    Nothing is inserted in place of the digits, so the characters on either
    side of a removed run become adjacent.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Delete digits from every tweet, overwriting the 'Tweet' column.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_number)

# drop punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    punctuation_chars = string.punctuation
    return "".join(ch for ch in text if ch not in punctuation_chars)

# Strip punctuation from every tweet, overwriting the 'Tweet' column.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_punctuation)

# trim leading & trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` with whitespace removed from both ends."""
    trimmed = text.lstrip()
    return trimmed.rstrip()

# Trim both ends of every tweet, overwriting the 'Tweet' column.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_whitespace_LT)

# collapse multiple whitespace into a single space
def remove_whitespace_multiple(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    Any whitespace character (space, tab, newline, ...) counts, and a run of
    length one is also rewritten to a plain space.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)

# Collapse whitespace runs in every tweet, overwriting the 'Tweet' column.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_whitespace_multiple)

# drop one-letter words
def remove_singl_char(text):
    """Return ``text`` with every stand-alone single ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it is left as is.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Remove one-letter words from every tweet, overwriting the 'Tweet' column.
# NOTE(review): this runs after the whitespace clean-up above, so the spaces that
# surrounded a removed letter stay in the stored text.
TWEET_DATA['Tweet'] = TWEET_DATA['Tweet'].apply(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Tokenise ``text`` by delegating to NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

# Tokenise the cleaned text into a new column, so the cleaned string in
# 'Tweet' stays available alongside its tokens.
TWEET_DATA['Tweet_tokens'] = TWEET_DATA['Tweet'].apply(word_tokenize_wrapper)

# Show the first rows of the token lists as a quick check.
print('Tokenizing Result : \n') 
print(TWEET_DATA['Tweet_tokens'].head())

Tokenizing Result : 

0    [segera, rilis, tanggal, mei, di, korea, dan, ...
1    [enak, banget, cuma, karna, polos, bisa, debut...
2                                         [elon, musk]
3                                             [reyhan]
4                                     [jahat, ya, iya]
Name: Tweet_tokens, dtype: object


In [4]:
# NLTK frequency distribution
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from ``text``.

    In this notebook the argument is one tweet's token list (the function is
    applied to the 'Tweet_tokens' column).
    """
    distribution = FreqDist(text)
    return distribution

# One FreqDist per tweet, built from that tweet's token list.
TWEET_DATA['Tweet_tokens_fdist'] = TWEET_DATA['Tweet_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
# most_common() turns each FreqDist into a list of (token, count) pairs for display.
print(TWEET_DATA['Tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(rilis, 2), (di, 2), (segera, 1), (tanggal, 1...
1    [(enak, 1), (banget, 1), (cuma, 1), (karna, 1)...
2                               [(elon, 1), (musk, 1)]
3                                        [(reyhan, 1)]
4                      [(jahat, 1), (ya, 1), (iya, 1)]
Name: Tweet_tokens_fdist, dtype: object


In [5]:
from nltk.corpus import stopwords

# ----------------------- get stopwords from NLTK ---------------------------------------
# start from NLTK's Indonesian stopword list
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manually add stopwords -----------------------------------
# extra informal / abbreviated words appended by hand
# ('&amp' is kept from the original list; NOTE(review): punctuation is stripped
# before tokenising, so a token containing '&' presumably never occurs.)
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "di", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'nyg', 'hehe',
                       'pen', 'nan', 'loh', '&amp', 'yah',
                       'bgt', 'gpp'])

# ----------------------- add stopwords from txt file -----------------------------------
# read the txt stopword file using pandas (one column named "stopwords")
txt_stopword = pd.read_csv("stopwords-id.txt", names= ["stopwords"], header = None)

# Use every row and split each on any whitespace, so the file is honoured whether
# it stores all words on one space-separated line or one word per line. Looking
# only at row 0 would silently ignore the rest of a multi-line file. Rows that
# pandas parsed as missing are skipped.
list_stopwords.extend(
    word
    for line in txt_stopword["stopwords"].dropna().astype(str)
    for word in line.split()
)

# ---------------------------------------------------------------------------------------

# convert the list to a set: duplicates collapse and membership tests are hash lookups
list_stopwords = set(list_stopwords)


# remove stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in their original order."""
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of every token list; results go to a new column
# ("WSW" presumably stands for "without stopwords").
TWEET_DATA['Tweet_tokens_WSW'] = TWEET_DATA['Tweet_tokens'].apply(stopwords_removal) 

# Show the first rows of the filtered token lists.
print('Stopwords Result : \n') 
print(TWEET_DATA['Tweet_tokens_WSW'].head())

Stopwords Result : 

0    [rilis, tanggal, mei, korea, udah, terjual, ne...
1    [enak, banget, karna, polos, debut, main, film...
2                                         [elon, musk]
3                                             [reyhan]
4                                         [jahat, iya]
Name: Tweet_tokens_WSW, dtype: object


In [6]:
# Load the normalisation table: first column = term as written, second column =
# its replacement.
# NOTE(review): absolute, machine-specific path that points inside an
# .ipynb_checkpoints folder — confirm this is the intended copy of the workbook.
normalizad_word = pd.read_excel("C:/Users/Irgi/Documents/.ipynb_checkpoints/normalisasi.xlsx", engine="openpyxl")

normalizad_word_dict = {}

# Pair the first two columns by position with .iloc. Integer keys on a row Series
# (row[0] / row[1]) rely on positional fallback that pandas 2.x deprecates, and
# iterrows() builds a Series per row for no benefit here.
# When a term appears more than once, the first replacement wins.
for raw_term, normal_term in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    if raw_term not in normalizad_word_dict:
        normalizad_word_dict[raw_term] = normal_term

def normalized_term(document):
    """Map each term of ``document`` through ``normalizad_word_dict``.

    Terms that have an entry are replaced by the mapped value; all others are
    kept unchanged. Returns a new list of the same length and order.
    """
    normalized = []
    for term in document:
        normalized.append(normalizad_word_dict.get(term, term))
    return normalized

# Normalise the stopword-filtered tokens; results go to a new column.
TWEET_DATA['Tweet_normalized'] = TWEET_DATA['Tweet_tokens_WSW'].apply(normalized_term)

print('Normalize Result : \n') 
# Last expression of the cell: displays the first ten normalised token lists.
TWEET_DATA['Tweet_normalized'].head(10)

Normalize Result : 



0    [rilis, tanggal, mei, korea, sudah, terjual, n...
1    [enak, banget, karna, polos, debut, main, film...
2                                         [elon, musk]
3                                             [reyhan]
4                                         [jahat, iya]
5                                        [bokep, indo]
6                                               [beli]
7                [animator, film, the, hobbit, hunger]
8                     [written, in, bahasa, indonesia]
9                          [purely, review, rate, aku]
Name: Tweet_normalized, dtype: object

In [7]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# Build one Sastrawi stemmer up front; stemmed_wrapper reuses this single instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem one term
def stemmed_wrapper(term):
    """Return the stem of ``term`` as produced by the module-level ``stemmer``."""
    stemmed = stemmer.stem(term)
    return stemmed

# Collect every distinct term once (placeholder value ' '), so that each term is
# stemmed a single time no matter how often it occurs across tweets.
term_dict = {}

for document in TWEET_DATA['Tweet_normalized']:
    for term in document:
        term_dict.setdefault(term, ' ')

# number of distinct terms
print(len(term_dict))
print("------------------------")

# Replace each placeholder with the term's stem and print the pair.
# Iterating over a snapshot of the keys; only values are reassigned.
for term in list(term_dict):
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])

# full term -> stem mapping
print(term_dict)
print("------------------------")

# look up the pre-computed stem of every term in a document
def get_stemmed_term(document):
    """Return the stems of ``document``'s terms, looked up in ``term_dict``.

    Raises ``KeyError`` if a term has no entry in ``term_dict``.
    """
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

# Map every normalised token list to its stems via the swifter-accelerated apply;
# results go to a new column.
TWEET_DATA['Tweet_tokens_stemmed'] = TWEET_DATA['Tweet_normalized'].swifter.apply(get_stemmed_term)
# Print the stemmed column for inspection.
print(TWEET_DATA['Tweet_tokens_stemmed'])

1008
------------------------
rilis : rilis
tanggal : tanggal
mei : mei
korea : korea
sudah : sudah
terjual : jual
negara : negara
indonesia : indonesia
bioskop : bioskop
indo : indo
nonton : nonton
enak : enak
banget : banget
karna : karna
polos : polos
debut : debut
main : main
filmapakabar : filmapakabar
kariernya : karier
bangun : bangun
nol : nol
tenggelam : tenggelam
palung : palung
mariana : mariana
elon : elon
musk : musk
reyhan : reyhan
jahat : jahat
iya : iya
bokep : bokep
beli : beli
animator : animator
film : film
the : the
hobbit : hobbit
hunger : hunger
written : written
in : in
bahasa : bahasa
purely : purely
review : review
rate : rate
aku : aku
met : met
founder : founder
interviewed : interviewed
kandidat : kandidat
kejora : kejora
seorang : orang
chief : chief
investment : investment
officer : officer
perusahaan : usaha
triliunan : triliun
calon : calon
biz : biz
counterpart : counterpart
dr : dr
meta : meta
thai : thai
investors : investors
producer : producer
owner

snowdrop : snowdrop
winter : winter
kuntilanak : kuntilanak
garagara : garagara
warisan : waris
kkn : kkn
desa : desa
penari : tari
jack : jack
sparrow : sparrow
thread : thread
tujuan : tuju
tingkat : tingkat
presentasi : presentasi
kemampuan : mampu
sedia : sedia
daerah : daerah
nasional : nasional
internasional : internasional
takarir : takarir
sedunia : dunia
perempuanperdamaian : perempuanperdamaian
jaseng : jaseng
serang : serang
recommended : recommended
score : score
prangko : prangko
peringatan : ingat
hubungan : hubung
bilateral : bilateral
dokumenter : dokumenter
menipu : tipu
kawin : kawin
cerai : cerai
korupsi : korupsi
narkoba : narkoba
masuk : masuk
penjarashow : penjarashow
televisi : televisi
podcast : podcast
media : media
terkenal : kenal
kemkominfo : kemkominfo
platejohnny : platejohnny
nggak : nggak
deng : deng
becanda : becanda
jagahutan : jagahutan
jagamasadepan : jagamasadepan
donasikomik : donasikomik
military : military
prosecutor : prosecutor
doberman : dober

Pandas Apply:   0%|          | 0/356 [00:00<?, ?it/s]

0      [rilis, tanggal, mei, korea, sudah, jual, nega...
1      [enak, banget, karna, polos, debut, main, film...
2                                           [elon, musk]
3                                               [reyhan]
4                                           [jahat, iya]
                             ...                        
351                             [film, baru, donny, yen]
352    [industri, film, salah, industri, kreatif, mil...
353    [konflik, batin, johan, de, vries, tentara, mu...
354                                  [akan, vonis, laku]
355    [tonton, film, favorit, april, beli, tiket, gr...
Name: Tweet_tokens_stemmed, Length: 356, dtype: object


In [8]:
# Preview the frame with every intermediate column (tokens, frequencies,
# stopword-filtered, normalised, stemmed) side by side.
TWEET_DATA.head()

Unnamed: 0,Tweet,Tweet_tokens,Tweet_tokens_fdist,Tweet_tokens_WSW,Tweet_normalized,Tweet_tokens_stemmed
0,segera rilis tanggal mei di korea dan udah ter...,"[segera, rilis, tanggal, mei, di, korea, dan, ...","{'segera': 1, 'rilis': 2, 'tanggal': 1, 'mei':...","[rilis, tanggal, mei, korea, udah, terjual, ne...","[rilis, tanggal, mei, korea, sudah, terjual, n...","[rilis, tanggal, mei, korea, sudah, jual, nega..."
1,enak banget cuma karna polos bisa debut main f...,"[enak, banget, cuma, karna, polos, bisa, debut...","{'enak': 1, 'banget': 1, 'cuma': 1, 'karna': 1...","[enak, banget, karna, polos, debut, main, film...","[enak, banget, karna, polos, debut, main, film...","[enak, banget, karna, polos, debut, main, film..."
2,elon musk,"[elon, musk]","{'elon': 1, 'musk': 1}","[elon, musk]","[elon, musk]","[elon, musk]"
3,reyhan,[reyhan],{'reyhan': 1},[reyhan],[reyhan],[reyhan]
4,jahat ya iya,"[jahat, ya, iya]","{'jahat': 1, 'ya': 1, 'iya': 1}","[jahat, iya]","[jahat, iya]","[jahat, iya]"


In [9]:
# Keep only the cleaned text column.
# .copy() makes the result an independent frame, so later in-place operations on
# it are unambiguous and do not trigger pandas' SettingWithCopyWarning.
# NOTE(review): this discards the token / normalised / stemmed columns, so
# everything downstream only sees the cleaned text — confirm that is intended.
TWEET_DATA = TWEET_DATA[['Tweet']].copy()

In [10]:
# Sort by tweet text, then remove duplicated tweets.
# Written as a reassigned method chain instead of inplace=True: the frame comes
# from a column selection, and mutating such a frame in place can raise
# SettingWithCopyWarning; the resulting rows and order are the same.
# NOTE(review): keep=False drops EVERY row whose text occurs more than once
# (no representative copy is kept) — confirm this is wanted rather than keep='first'.
TWEET_DATA = (
    TWEET_DATA
    .sort_values("Tweet")
    .drop_duplicates(subset="Tweet", keep=False)
)

In [11]:
# Write the remaining rows to a CSV in the current working directory.
# NOTE(review): no index argument is passed, so the index is presumably written as
# an unnamed first column (pandas default) and will come back as "Unnamed: 0" when
# the file is re-read — confirm whether index=False is wanted.
TWEET_DATA.to_csv("Text_Preprocessing.csv")