In [45]:
import re

import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [46]:
class RegexRemover(TransformerMixin):
    def __init__(self) -> None:
        self.patterns = [
            r"[^\w\s]|_",  # Remove all special characters / punctuations
            r"\d+",  # Remove all numbers
            r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U0000263A]+",  # Remove all emoji / emoticon
        ]

    def remove_patterns(self, text: str) -> str:
        for pattern in self.patterns:
            text = re.sub(pattern, "", text)
        return text

    def fit(self, df: pd.DataFrame, y=None):
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        df_pre = df.copy()
        df_pre.loc[:, "comment"] = df_pre["comment"].apply(self.remove_patterns)
        return df_pre

In [47]:
class StopWordRemover(TransformerMixin):
    def __init__(self) -> None:
        self.stop_word_remover = StopWordRemoverFactory().create_stop_word_remover()
        self.stop_word_remover.get_dictionary().add("nya")

    def fit(self, df: pd.DataFrame, y=None):
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        df_pre = df.copy()
        df_pre.loc[:, "comment"] = df_pre["comment"].apply(
            self.stop_word_remover.remove
        )
        return df_pre

In [48]:
class Stemmer(TransformerMixin):
    def __init__(self) -> None:
        self.stemmer = StemmerFactory().create_stemmer()

    def fit(self, df: pd.DataFrame, y=None):
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        df_pre = df.copy()
        df_pre.loc[:, "comment"] = df_pre["comment"].apply(self.stemmer.stem)
        return df_pre

In [55]:
def to_lower_strip(df: pd.DataFrame) -> pd.DataFrame:
    df_pre = df.copy()
    df_pre.loc[:, "comment"] = df_pre["comment"].apply(lambda x: x.lower().strip())

    return df_pre


def remove_one_letter(df: pd.DataFrame) -> pd.DataFrame:
    df_pre = df.copy()

    def _remove_one_letter(text: str):
        words = text.split(" ")
        for word in words:
            if len(word) == 1:
                words.remove(word)

        return " ".join(words)

    df_pre.loc[:, "comment"] = df_pre["comment"].apply(_remove_one_letter)

    return df_pre

In [50]:
pipeline = Pipeline(
    [
        ("lower_case_strip", FunctionTransformer(func=to_lower_strip)),
        ("regex_remover", RegexRemover()),
        ("stop_word_remover", StopWordRemover()),
        ("stemmer", Stemmer()),
        ("remove_one_letter", FunctionTransformer(func=remove_one_letter)),
    ]
)

In [51]:
df = pd.read_csv("../data/tokopedia_tws_f9.csv")

In [52]:
clean_df = pipeline.fit_transform(df)

In [54]:
clean_df.loc[93]["comment"]

'barang sih oktp yaa kemas bagi dalam yg rusak bolong utk sellernya perhati kemas dalam meski barang oke dalam k'