In [36]:
import json
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Make sure the NLTK stop-word corpus is available locally; StopWordRemover
# reads the Indonesian list from it via stopwords.words("indonesian").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
class RegexRemover(TransformerMixin):
    """Blank out punctuation, digits and emoji from the ``comment`` column.

    Every match of each regular expression in ``self.patterns`` is replaced by
    a single space, and the cleaned text is stored in a ``clean_comment``
    column. The input frame is never modified.
    """

    def __init__(self) -> None:
        # Anything that is neither a word character nor whitespace, plus the
        # underscore and superscript digits (both of which count as ``\w``).
        punctuation = r"[^\w\s]|_|[¹²³⁴⁵⁶⁷⁸⁹]"
        # Runs of decimal digits.
        numbers = r"\d+"
        # Runs of characters from the listed emoji / emoticon code-point ranges.
        emoji = r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U0000263A]+"
        # The patterns are applied one after another, in this order.
        self.patterns = [punctuation, numbers, emoji]

    def remove_patterns(self, text: str) -> str:
        """Return ``text`` with every pattern match replaced by one space."""
        cleaned = text
        for regex in self.patterns:
            cleaned = re.sub(regex, " ", cleaned)
        return cleaned

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned; returns ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` with ``clean_comment`` derived from ``comment``."""
        return df.assign(clean_comment=df["comment"].apply(self.remove_patterns))

In [38]:
class TypoFixer(TransformerMixin):
    """Replace known misspellings in ``clean_comment`` using a JSON lookup table.

    The lookup table is a JSON object mapping a misspelled word to its
    correction. Keys and values are lower-cased on load, and every word of the
    text is lower-cased before lookup, so matching is case-insensitive and the
    output is always lower case.
    """

    def __init__(self, typo_mapper_path: str = "../data/typo_mapper.json") -> None:
        """Load the typo lookup table.

        Args:
            typo_mapper_path: Location of the JSON file holding the
                ``{typo: correction}`` mapping. Defaults to the path the
                notebook has always used.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        self.typo_mapper_path = typo_mapper_path
        # Decode explicitly as UTF-8: without ``encoding`` open() falls back to
        # the platform locale, so the same file could load differently (or
        # fail) depending on the machine.
        with open(typo_mapper_path, "r", encoding="utf-8") as file:
            typo_mapper = json.load(file)
        self.typo_mapper = {
            key.lower(): value.lower() for key, value in typo_mapper.items()
        }

    def fix_typo(self, text: str) -> str:
        """Return ``text`` lower-cased with known typos replaced.

        Words are delimited by single spaces only. After replacement, runs of
        spaces are collapsed to one space; a leading or trailing space is kept.
        """
        corrected = []
        for word in text.split(" "):
            lowered = word.lower()
            # Unknown words pass through unchanged (but lower-cased).
            corrected.append(self.typo_mapper.get(lowered, lowered))

        return re.sub(r" +", " ", " ".join(corrected))

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned; returns ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` with ``fix_typo`` applied to ``clean_comment``."""
        df_pre = df.copy()
        df_pre["clean_comment"] = df_pre["clean_comment"].apply(self.fix_typo)
        return df_pre

In [39]:
class StopWordRemover(TransformerMixin):
    """Strip stop words from the ``clean_comment`` column.

    Uses a Sastrawi stop-word remover whose dictionary is extended with the
    token ``"nya"`` and with NLTK's Indonesian stop-word list.

    NOTE(review): in the notebook output, "1tidak menyala" ends up as "nyala"
    and "baik" ends up empty, so negations and sentiment-bearing words appear
    to be removed somewhere in the pipeline -- presumably by this step.
    Confirm that this is acceptable for the downstream sentiment task.
    """

    def __init__(self) -> None:
        remover = StopWordRemoverFactory().create_stop_word_remover()
        # Extend the remover's own dictionary with the extra entries.
        dictionary = remover.get_dictionary()
        dictionary.add("nya")
        dictionary.add_words(stopwords.words("indonesian"))
        self.stop_word_remover = remover

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned; returns ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` with stop words removed from ``clean_comment``."""
        without_stop_words = df["clean_comment"].apply(self.stop_word_remover.remove)
        return df.assign(clean_comment=without_stop_words)

In [40]:
class Stemmer(TransformerMixin):
    """Apply the Sastrawi stemmer to every entry of ``clean_comment``."""

    def __init__(self) -> None:
        # Build the stemmer once and reuse it for every row.
        self.stemmer = StemmerFactory().create_stemmer()

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned; returns ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``clean_comment`` values are stemmed."""
        stemmed = df["clean_comment"].apply(self.stemmer.stem)
        return df.assign(clean_comment=stemmed)

In [41]:
def to_lower_strip(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``comment`` lower-cased and stripped.

    Each value of the ``comment`` column is lower-cased and has its leading
    and trailing whitespace removed. The input frame is left untouched.
    """

    def _normalise(raw):
        return raw.lower().strip()

    result = df.copy()
    result.loc[:, "comment"] = result["comment"].apply(_normalise)
    return result


def remove_one_letter(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with one-character words dropped from ``clean_comment``.

    Words are delimited by single spaces. Every token whose length is exactly
    one is removed; all other tokens (including empty ones produced by
    repeated spaces) are kept and re-joined with single spaces. The input
    frame is left untouched.
    """

    def _remove_one_letter(text: str) -> str:
        # Filter into a new sequence instead of calling list.remove() while
        # iterating the same list: removing during iteration makes the loop
        # skip the next element, so consecutive one-letter words survived.
        return " ".join(word for word in text.split(" ") if len(word) != 1)

    df_pre = df.copy()
    df_pre["clean_comment"] = df_pre["clean_comment"].apply(_remove_one_letter)

    return df_pre

In [42]:
# Text-cleaning pipeline. Step order matters: the first step normalises the
# raw ``comment`` column, the regex step creates ``clean_comment`` from it,
# and every later step reads and rewrites ``clean_comment``.
pipeline = Pipeline(
    steps=[
        # Lower-case and strip the raw comment text.
        ("lower_case_strip", FunctionTransformer(func=to_lower_strip)),
        # Create clean_comment without punctuation, digits or emoji.
        ("regex_remover", RegexRemover()),
        # Replace known misspellings from the JSON lookup table.
        ("fix_typo", TypoFixer()),
        # Drop stop words.
        ("stop_word_remover", StopWordRemover()),
        # Stem the remaining words.
        ("stemmer", Stemmer()),
        # Drop one-character leftovers.
        ("remove_one_letter", FunctionTransformer(func=remove_one_letter)),
    ]
)

In [43]:
# Load the raw reviews; the pipeline reads the `comment` column.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks
# like an index that was written into the CSV -- confirm it should be kept.
df = pd.read_csv("../data/tws-sentiment.csv")

In [44]:
# Run every cleaning step in order. The custom transformers' fit() methods
# return self without learning anything, and each step works on a copy, so
# `df` itself is left unchanged.
clean_df = pipeline.fit_transform(df)

In [45]:
# Inspect the result. Note that `clean_comment` can end up empty once every
# word of a comment has been removed (see the last row shown).
clean_df

Unnamed: 0,rating,comment,sentiment,clean_comment
0,5,bagus,positive,bagus
1,3,1tidak menyala,neutral,nyala
2,5,puas brangnya sesusai,positive,puas barang sesuai
3,5,lumayan lah walaupun bass nya kurang,positive,lumayan bass kurang
4,4,"barang udah sampai, packing bagus, barang mulu...",positive,barang udah packing bagus barang mulus lagi ca...
...,...,...,...,...
2333,5,berfungsi ke 2 nya. suara sesuai harga lumayanlah,positive,fungsi nya suara sesuai harga lumayan
2334,5,mantap,positive,mantap
2335,5,bagus,positive,bagus
2336,5,baik,positive,


In [46]:
# Persist the cleaned data without the DataFrame index.
# NOTE(review): rows whose clean_comment is an empty string are presumably
# read back as NaN by pd.read_csv's defaults -- confirm downstream handling.
clean_df.to_csv("../data/tws-clean.csv", index=False)