In [None]:
import pandas as pd 
import numpy as np

# Load the labelled tweets; the file is decoded as ISO-8859-1 (Latin-1).
TWEET_DATA = pd.read_csv("data_klasifikasi.csv", encoding = "ISO-8859-1")
# NOTE(review): the result of this expression is never assigned, so this line
# leaves TWEET_DATA unchanged.
TWEET_DATA['tweet'].str.encode('ascii', 'ignore')
# Last expression of the cell: display the first rows of the frame.
TWEET_DATA.head()

Unnamed: 0,tweet,klasifikasi
0,pemerintah lampung bantuan mahasiswa derasnya ...,negative
1,pemerintah menerapkan herd immunity kemampuan ...,negative
2,lakukan makanan berbuka sahur 10 kepala keluar...,positive
3,warga positif corona bupati kondisi menuntut k...,positive
4,emosi banget kondisi disuruh liputan covid dos...,negative


In [None]:
 #------ Case Folding --------
# Use the pandas Series.str.lower() function to lowercase every tweet.
# The 'tweet' column is overwritten with the lowercased text.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].str.lower()


print('Case Folding Result : \n')
print(TWEET_DATA['tweet'].head(5))
print('\n\n\n')

Case Folding Result : 

0    pemerintah lampung bantuan mahasiswa derasnya ...
1    pemerintah menerapkan herd immunity kemampuan ...
2    lakukan makanan berbuka sahur 10 kepala keluar...
3    warga positif corona bupati kondisi menuntut k...
4    emosi banget kondisi disuruh liputan covid dos...
Name: tweet, dtype: object






In [None]:
import string 
import re #regex library
import nltk
nltk.download('punkt')
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    """Strip tweet-specific noise from a single tweet.

    Removes literal backslash escape residue, replaces non-ASCII characters,
    and drops mentions, hashtags and links.

    Args:
        text: One tweet as a ``str``.

    Returns:
        The cleaned tweet. The mention/link step collapses whitespace to single
        spaces; the final URL-prefix replacement may leave extra spaces behind.
    """
    # Remove literal (two-character) escape residue such as "\t", "\n" and "\u",
    # then drop any remaining backslash.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Replace non-ASCII characters (emoticons, CJK text, etc.) with '?'.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags and complete links. A raw string avoids the
    # invalid-escape SyntaxWarning, and \w (which includes '_') removes handles
    # and hashtags such as @user_name in full instead of leaving "_name" behind.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # Remove incomplete URLs, i.e. a bare scheme prefix with nothing after it.
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean every tweet; the 'tweet' column is overwritten with the result.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from every tweet; the 'tweet' column is overwritten.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Strip punctuation from every tweet; the 'tweet' column is overwritten.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` without leading (L) and trailing (T) whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim leading/trailing whitespace of every tweet; the 'tweet' column is overwritten.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each run of whitespace characters (spaces, tabs,
        newlines, ...) replaced by exactly one space.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning from Python 3.12).
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every tweet; the 'tweet' column is overwritten.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Return ``text`` with every stand-alone ASCII letter deleted.

    Only the letter itself is removed; the surrounding whitespace is kept.
    """
    lone_letter = r"\b[a-zA-Z]\b"
    cleaned = re.sub(lone_letter, "", text)
    return cleaned

# Drop single-letter words from every tweet; the 'tweet' column is overwritten.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Tokenise ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

# Tokenise each cleaned tweet; the results are stored in a new 'tweet_tokens' column.
TWEET_DATA['tweet_tokens'] = TWEET_DATA['tweet'].apply(word_tokenize_wrapper)

# Preview the first rows of the tokenised column.
print('Tokenizing Result : \n') 
print(TWEET_DATA['tweet_tokens'].head())
print('\n\n\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Tokenizing Result : 

0    [pemerintah, lampung, bantuan, mahasiswa, dera...
1    [pemerintah, menerapkan, herd, immunity, kemam...
2    [lakukan, makanan, berbuka, sahur, kepala, kel...
3    [warga, positif, corona, bupati, kondisi, menu...
4    [emosi, banget, kondisi, disuruh, liputan, cov...
Name: tweet_tokens, dtype: object






In [None]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    """Build and return an NLTK ``FreqDist`` from ``text``."""
    distribution = FreqDist(text)
    return distribution

# One FreqDist per tweet, built from that tweet's token list.
TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].apply(freqDist_wrapper)

# Preview: for the first rows, show the result of FreqDist.most_common().
print('Frequency Tokens : \n') 
print(TWEET_DATA['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(pemerintah, 2), (lampung, 2), (bantuan, 1), ...
1    [(pemerintah, 2), (menerapkan, 1), (herd, 1), ...
2    [(lakukan, 1), (makanan, 1), (berbuka, 1), (sa...
3    [(bantaeng, 3), (warga, 1), (positif, 1), (cor...
4    [(disuruh, 2), (liputan, 2), (emosi, 1), (bang...
Name: tweet_tokens_fdist, dtype: object


In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# ----------------------- get stopword from NLTK stopword -------------------------------
# Start from NLTK's Indonesian stopword list.
list_stopwords = stopwords.words('indonesian')
print(len(list_stopwords))

# ---------------------------- manually add stopwords -----------------------------------
# Append informal / abbreviated words that are treated as stopwords here.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
# ----------------------- add stopword from txt file ------------------------------------
# Read the extra stopword file into a single 'stopwords' column.
txt_stopword = pd.read_csv("stopwordbahasa.csv", names= ["stopwords"], header = None)

# Take the words from EVERY row, not just row 0. This works both for a file
# holding all words space-separated on one line and for a file with one
# stopword per line (where reading only row 0 would add a single word).
for stopword_line in txt_stopword["stopwords"].dropna().astype(str):
    list_stopwords.extend(stopword_line.split())
# ---------------------------------------------------------------------------------------

# Convert the list to a set: removes duplicates and gives fast membership tests.
list_stopwords = set(list_stopwords)


# remove stopwords from the token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    The original token order is preserved.
    """
    kept_tokens = []
    for token in words:
        if token in list_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter stopwords out of every token list; stored in a new 'tweet_tokens_WSW' column.
TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].apply(stopwords_removal) 


# Preview the first rows of the filtered tokens.
print(TWEET_DATA['tweet_tokens_WSW'].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
758
0    [pemerintah, lampung, bantuan, mahasiswa, dera...
1    [pemerintah, menerapkan, herd, immunity, kemam...
2    [lakukan, makanan, berbuka, sahur, kepala, kel...
3    [warga, positif, corona, bupati, kondisi, menu...
4    [emosi, banget, kondisi, disuruh, liputan, cov...
Name: tweet_tokens_WSW, dtype: object


In [None]:
# Load the normalisation table: the first column is used as the lookup key and
# the second column as its replacement.
normalizad_word = pd.read_csv("normal.csv")

normalizad_word_dict = {}

# Columns are read by position with .iloc; integer keys such as row[0] on a
# label-indexed row rely on a positional fallback that pandas 2.x deprecates.
# setdefault keeps the first replacement seen for a key that appears more than
# once, matching the previous "only insert if absent" behaviour.
for raw_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Replace each term of ``document`` found in ``normalizad_word_dict``.

    Terms without an entry in the dictionary are returned unchanged.
    """
    # dict.get with the term itself as default leaves unknown terms untouched.
    return [normalizad_word_dict.get(term, term) for term in document]

# Normalise every stopword-filtered token list; stored in a new 'tweet_normalized' column.
TWEET_DATA['tweet_normalized'] = TWEET_DATA['tweet_tokens_WSW'].apply(normalized_term)

# Last expression of the cell: display the first 10 rows.
TWEET_DATA['tweet_normalized'].head(10)

0    [pemerintah, lampung, bantuan, mahasiswa, dera...
1    [pemerintah, menerapkan, herd, immunity, kemam...
2    [lakukan, makanan, berbuka, sahur, kepala, kel...
3    [warga, positif, corona, bupati, kondisi, menu...
4    [emosi, banget, kondisi, disuruh, liputan, cov...
5    [antisipa, penyebaran, pandemi, covid, membutu...
6    [bang, pemerintah, peduli, republik, rakyat, c...
7    [updates, terkini, covid, kamis, sumber, dinas...
8    [pie, menteri, keuangan, sri, mulyani, pemerin...
9    [beruntung, ribuan, warga, mati, terdaftar, ko...
Name: tweet_normalized, dtype: object

In [None]:
# import Sastrawi package
!pip install Sastrawi
!pip install swifter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
# create stemmer
# Build one Sastrawi stemmer instance; stemmed_wrapper below reuses it for every term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return the stem of ``term`` as produced by the module-level ``stemmer``."""
    stemmed = stemmer.stem(term)
    return stemmed

# Collect the distinct terms of the corpus first, so each one is stemmed only
# once no matter how many tweets it appears in.
term_dict = {}

for document in TWEET_DATA['tweet_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

# Replace each placeholder with the stemmed form. Only values are reassigned,
# so the dict can safely be iterated while it is updated.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)

# Print a short sample only: printing every term and then the whole dict
# floods the cell output.
preview_count = 10
for term, stemmed in list(term_dict.items())[:preview_count]:
    print(term, ":", stemmed)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Look up the stem of each term of ``document`` in ``term_dict``.

    Raises ``KeyError`` for a term that is missing from ``term_dict``.
    """
    return list(map(term_dict.__getitem__, document))

# Map every document's terms to their stems via swifter's apply; the result is
# stored in a new 'tweet_tokens_stemmed' column.
TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['tweet_tokens_stemmed'])

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l[K     |█▋                              | 10 kB 26.4 MB/s eta 0:00:01[K     |███▏                            | 20 kB 32.3 MB/s eta 0:00:01[K     |████▊                           | 30 kB 36.3 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 39.0 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 18.9 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 19.7 MB/s eta 0:00:01[K     |███████████                     | 71 kB 13.5 MB/s eta 0:00:01[K     |████████████▌                   | 81 kB 14.9 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 13.8 MB/s eta 0:00:01[K     |███████████████▋                | 102 kB 15.0 MB/s eta 0:00:01[K     |█████████████████▏              | 112 kB 15.0 MB/s eta 0:00:01[K     |██████████████████▊             | 122 kB 15.0 MB/s eta 0:00:01[K     |████████████████████▎           | 133 kB 15.0 MB/s

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
maafmeninggal : maafmeninggal
covidkali : covidkali
fibrianto : fibrianto
dufn : dufn
county : county
giveawayalert : giveawayalert
dermaga : dermaga
hapus : hapus
pertalite : pertalite
menyelundupkan : selundup
menggolkan : gol
begni : begni
ajadah : ajadah
awaskorupsisaveekonomi : awaskorupsisaveekonomi
netralitas : netralitas
etik : etik
bahagianya : bahagia
herawati : herawati
menggodog : menggodog
surip : surip
kaloran : kalor
thileng : thileng
terbuncit : buncit
kwkwkwk : kwkwkwk
tapipemerintahnya : tapipemerintahnya
idafauziyah : idafauziyah
uni : uni
worldometers : worldometers
important : important
notice : notice
bangtan : bangtan
dengerinelshintaaja : dengerinelshintaaja
menyemprotkan : semprot
lokckdown : lokckdown
indosatcare : indosatcare
terimakasihbppt : terimakasihbppt
semparuk : semparuk
indonesiahariini : indonesiahariini
tetapproduktifamancovid : tetapproduktifamancovid
plnuntuknegeri : plnuntuknegeri


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=36922.0, style=ProgressStyle(descripti…


0        [perintah, lampung, bantu, mahasiswa, deras, d...
1        [perintah, terap, herd, immunity, mampu, perin...
2        [laku, makan, buka, sahur, kepala, keluarga, d...
3        [warga, positif, corona, bupati, kondisi, tunt...
4        [emosi, banget, kondisi, suruh, liput, covid, ...
                               ...                        
36917    [hoaks, hoaks, edar, covid, perintah, kendali,...
36918    [tinggi, covid, majelis, musyawarat, rakyat, n...
36919    [pakai, masker, cuci, hand, sanitizer, nama, i...
36920    [kabupaten, aceh, selatan, zona, hijau, libat,...
36921    [keluyurann, rumah, kerja, beli, sembako, jala...
Name: tweet_tokens_stemmed, Length: 36922, dtype: object


In [None]:
# Persist the frame, including the columns added by the preprocessing steps, as CSV.
# NOTE(review): index=False is not passed, so pandas also writes the row index as
# an unnamed first column — confirm downstream readers expect that.
TWEET_DATA.to_csv("Text_Preprocessing.csv")

In [None]:
# Persist the same frame as an Excel workbook.
TWEET_DATA.to_excel("Text_Preprocessing.xlsx")