In [11]:
import pandas as pd
import re
import string
from custom_stemmer import CustomStemmer
import swifter

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [4]:
class CustomStemmer:
  def __init__(self, tweet_data):
      self.factory = StemmerFactory()
      self.stemmer = self.factory.create_stemmer()
      self.term_dict = {}
      for document in tweet_data:
        for term in document:
          if term not in self.term_dict:
            self.term_dict[term] = ''

      for term in self.term_dict:
        self.term_dict[term] = self.stemmed_wrapper(term)
        print(term,":" , self.term_dict[term])

  def stemmed_wrapper(self, term):
    return self.stemmer.stem(term)

  def get_stemmed_term(self, document):
    return [self.term_dict[term] for term in document]

In [12]:
# ----- Tokenizing -----
def remove_news_special(text):
  # remove tab, new line, and back slice
  text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
  # remove non ASCII (emoticon, kanji, .etc)
  text = text.encode('ascii', 'replace').decode('ascii')
  # remove mention, link, hashtag
  text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
  # remove incomplete url
  return text.replace("http://", " ").replace("https://", " ")

#remove number
def remove_number(text):
  return re.sub(r"\d+", "", text)

#remove punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans("", "", string.punctuation))

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
  return text.strip()

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

#remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

#NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

#NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

def stopwords_removal(words):
  list_stopwords = stopwords.words('indonesian')
  list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
  list_stopwords = set(list_stopwords)

  return [word for word in words if word not in list_stopwords]

nltk.download('punkt')
news_data = pd.read_csv("scrapped_news.csv", encoding = "ISO-8859-1")
# print(news_data['description'].isnull())
# print(news_data['description'].isnull())
news_data['description'] = news_data['description'].apply(remove_news_special)
# news_data['description'] = news_data['description'].apply(remove_number)
news_data['description'] = news_data['description'].apply(remove_punctuation)
news_data['description'] = news_data['description'].apply(remove_whitespace_LT)
news_data['description'] = news_data['description'].apply(remove_whitespace_multiple)
news_data['description'] = news_data['description'].apply(remove_single_char)
news_data['description_tokens'] = news_data['description'].apply(word_tokenize_wrapper)
news_data['description_tokens_fdist'] = news_data['description_tokens'].apply(freqDist_wrapper)
news_data['description_tokens_wsw'] = news_data['description_tokens'].apply(freqDist_wrapper)

stemmer = CustomStemmer(news_data['description_tokens_wsw'])
news_data['description_tokens_stemmed'] = news_data['description_tokens_wsw'].swifter.apply(stemmer.get_stemmed_term)
# We will skip normalization word 

news_data.to_csv('text_preprocessing.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/syubbanfakhriya/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


yang : yang
di : di
air : air
saya : saya
dan : dan
itu : itu
rumah : rumah
ke : ke
Desa : desa
banjir : banjir
saluran : salur
Sunarti : sunarti
dalam : dalam
pun : pun
saat : saat
ada : ada
kawasan : kawasan
cucunya : cucu
deras : deras
dari : dari
Jatinangor : jatinangor
kecil : kecil
merendam : rendam
tol : tol
luapan : luap
Banjir : banjir
menerjang : terjang
Sumedang : sumedang
warga : warga
Cipacing : cipacing
rumahnya : rumah
ia : ia
bersama : sama
masih : masih
kemudian : kemudian
semua : semua
cucu : cucu
satu : satu
tidak : tidak
berada : ada
juga : juga
untuk : untuk
Saat : saat
pada : pada
Jumat : jumat
24122021 : 24122021
bagi : bagi
sedang : sedang
masuk : masuk
mengaji : aji
ungkapnya : ungkap
sampai : sampai
bisa : bisa
terendam : rendam
lebih : lebih
meter : meter
tersebut : sebut
dua : dua
hujan : hujan
bawahnya : bawah
terjadi : jadi
Cileles : cileles
sempat : sempat
akan : akan
beberapa : beberapa
para : para
petani : tani
lahan : lahan
desa : desa
sore : sore
Sala

Pandas Apply: 100%|██████████| 39/39 [00:00<00:00, 18217.83it/s]
