In [27]:
import json
import re

import nltk
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Fetch the NLTK stopword corpus (no-op when it is already installed).
# NOTE(review): the only visible use of this corpus is a commented-out line in
# StopWordRemover.__init__ — confirm the download is still needed.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
class RegexRemover(TransformerMixin):
    """Strip punctuation, digits and emoji from the ``comment`` column.

    The cleaned text is stored in a ``clean_comment`` column. Every regex match
    is replaced by a space (not deleted) so neighbouring words stay separated.
    """

    def __init__(self) -> None:
        # Applied one after another, in this order, by `remove_patterns`.
        self.patterns = [
            r"[^\w\s]|_|[¹²³⁴⁵⁶⁷⁸⁹]",  # Remove all special characters / punctuations
            r"\d+",  # Remove all numbers
            r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U0000263A]+",  # Remove all emoji / emoticon
        ]

    def remove_patterns(self, text: str) -> str:
        """Return ``text`` with every match of ``self.patterns`` replaced by a space."""
        for pattern in self.patterns:
            text = re.sub(pattern, " ", text)
        return text

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` with ``clean_comment`` derived from ``comment``.

        Args:
            df: Frame holding a string ``comment`` column.
            y: Ignored.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        # `assign` builds a new frame, so re-running a cell never sees a
        # half-processed input left behind by an earlier run.
        return df.assign(clean_comment=df["comment"].apply(self.remove_patterns))

In [29]:
class TypoFixer(TransformerMixin):
    """Replace known misspellings in ``clean_comment`` using a JSON lookup table.

    Args:
        mapper_path: Path to a JSON object mapping a misspelled word to its
            correction. Defaults to the project's ``typo_mapper.json``.
    """

    def __init__(self, mapper_path: str = "../data/typo_mapper.json") -> None:
        # JSON is UTF-8 by specification; without an explicit encoding `open`
        # falls back to the locale default, which can mis-decode non-ASCII keys.
        with open(mapper_path, "r", encoding="utf-8") as file:
            typo_mapper = json.load(file)
        # Lookups are case-insensitive: both sides are stored lower-cased.
        self.typo_mapper = {
            key.lower(): value.lower() for key, value in typo_mapper.items()
        }

    def fix_typo(self, text: str) -> str:
        """Lower-case ``text`` and swap every word found in the typo table.

        Words are split on any whitespace (spaces, tabs, newlines), so a word
        at the end of a line is looked up as well. The result is joined with
        single spaces and has no leading or trailing whitespace.
        """
        words = text.lower().split()
        return " ".join(self.typo_mapper.get(word, word) for word in words)

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``clean_comment`` has typos corrected.

        Args:
            df: Frame holding a string ``clean_comment`` column.
            y: Ignored.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        return df.assign(clean_comment=df["clean_comment"].apply(self.fix_typo))

In [30]:
class StopWordRemover(TransformerMixin):
    """Drop stop words from ``clean_comment``.

    The stop-word list comes from the Sastrawi dictionary, minus entries that
    contain one of the words listed in ``excluded_word`` below.
    """

    def __init__(self) -> None:
        stop_word_remover = StopWordRemoverFactory().create_stop_word_remover()

        stop_words_list = stop_word_remover.get_dictionary().words
        # stop_words_list.extend(stopwords.words("indonesian"))

        # Words that must survive stop-word removal.
        excluded_word = [
            "tidak",
            "bukan",
            "usah",
            "ok",
            "baik",
            "belum",
            "bekerja",
            "bisa",
        ]
        # The test below is a substring match (`exclude in word`), so a stop
        # word is kept whenever it merely *contains* one of the excluded words.
        # NOTE(review): confirm substring (rather than exact) matching is intended.
        self.stop_words_list = [
            word
            for word in stop_words_list
            if not any(exclude in word for exclude in excluded_word)
        ]

    def remove(self, text: str) -> str:
        """Return ``text`` without the space-separated words in ``stop_words_list``."""
        return " ".join(
            word for word in text.split(" ") if word not in self.stop_words_list
        )

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``clean_comment`` has stop words removed.

        Args:
            df: Frame holding a string ``clean_comment`` column.
            y: Ignored.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        return df.assign(clean_comment=df["clean_comment"].apply(self.remove))

In [31]:
class Stemmer(TransformerMixin):
    """Apply the Sastrawi stemmer to every entry of ``clean_comment``."""

    def __init__(self) -> None:
        self.stemmer = StemmerFactory().create_stemmer()

    def fit(self, df: pd.DataFrame, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``clean_comment`` has been stemmed.

        Args:
            df: Frame holding a string ``clean_comment`` column.
            y: Ignored.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        return df.assign(clean_comment=df["clean_comment"].apply(self.stemmer.stem))

In [32]:
def to_lower_strip(df: pd.DataFrame) -> pd.DataFrame:
    """Lower-case the ``comment`` column and trim surrounding whitespace.

    Args:
        df: Frame holding a string ``comment`` column.

    Returns:
        A new DataFrame with the normalised ``comment`` column; the frame
        passed in keeps its original text.
    """
    # `assign` returns a new frame instead of overwriting the raw comments
    # held by the caller.
    return df.assign(comment=df["comment"].apply(lambda x: x.lower().strip()))


def remove_one_letter(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every single-character word from the ``clean_comment`` column.

    Args:
        df: Frame holding a string ``clean_comment`` column whose words are
            separated by single spaces.

    Returns:
        A new DataFrame with the filtered ``clean_comment`` column; the frame
        passed in is not modified.
    """

    def _remove_one_letter(text: str) -> str:
        # Build a filtered sequence instead of calling `list.remove` while
        # iterating: removing during iteration skips the element that follows
        # each removal, so runs of one-letter words were only partly removed.
        return " ".join(word for word in text.split(" ") if len(word) != 1)

    return df.assign(clean_comment=df["clean_comment"].apply(_remove_one_letter))

In [33]:
# Preprocessing stages, executed top to bottom. The first step normalises the
# raw `comment` column; the regex step creates `clean_comment`, which every
# later step rewrites.
preprocessing_steps = [
    ("lower_case_strip", FunctionTransformer(func=to_lower_strip)),
    ("regex_remover", RegexRemover()),
    ("fix_typo", TypoFixer()),
    ("stemmer", Stemmer()),
    ("stop_word_remover", StopWordRemover()),
    ("remove_one_letter", FunctionTransformer(func=remove_one_letter)),
]

pipeline = Pipeline(steps=preprocessing_steps)

In [34]:
df = pd.read_csv("../data/tws-sentiment.csv")

In [35]:
clean_df = pipeline.fit_transform(df)

In [36]:
clean_df["clean_comment"] = clean_df["clean_comment"].replace("", np.NaN)

In [37]:
clean_df.isna().sum()

rating            0
comment           0
sentiment         0
clean_comment    23
dtype: int64

In [38]:
clean_df = clean_df.dropna()

In [39]:
clean_df.isna().sum()

rating           0
comment          0
sentiment        0
clean_comment    0
dtype: int64

In [40]:
# Show the cleaned frame: original `comment` next to the derived `clean_comment`.
clean_df

Unnamed: 0,rating,comment,sentiment,clean_comment
0,5,bagus,positive,bagus
1,3,1tidak menyala,neutral,tidak nyala
2,5,puas brangnya sesusai,positive,puas barang sesuai
3,5,lumayan lah walaupun bass nya kurang,positive,lumayan walaupun bass nya kurang
4,4,"barang udah sampai, packing bagus, barang mulu...",positive,barang udah packing bagus barang mulus cas cum...
...,...,...,...,...
2333,5,berfungsi ke 2 nya. suara sesuai harga lumayanlah,positive,fungsi nya suara sesuai harga lumayan
2334,5,mantap,positive,mantap
2335,5,bagus,positive,bagus
2336,5,baik,positive,baik


In [41]:
# Spot-check one randomly chosen review: raw text versus preprocessed text.
index = clean_df.sample(n=1).index
sampled_row = clean_df.loc[index].iloc[0]

before = sampled_row["comment"]
after = sampled_row["clean_comment"]

print(f"Sebelum preprocessing:\n{before}\n{'=' * 50}")
print(f"Setelah preprocessing:\n{after}\n{'=' * 50}")

Sebelum preprocessing:
baterai cepat ngedrop
Setelah preprocessing:
baterai cepat ngedrop


In [42]:
# Keep only the model inputs: the cleaned text (renamed to `text`) and its label.
final_df = clean_df.loc[:, ["clean_comment", "sentiment"]].rename(
    columns={"clean_comment": "text"}
)

final_df.head()

Unnamed: 0,text,sentiment
0,bagus,positive
1,tidak nyala,neutral
2,puas barang sesuai,positive
3,lumayan walaupun bass nya kurang,positive
4,barang udah packing bagus barang mulus cas cum...,positive


In [43]:
num_positives = (final_df["sentiment"] == "positive").sum()

positives = final_df[final_df["sentiment"] == "positive"]
neutrals = final_df[final_df["sentiment"] == "negative"].sample(
    num_positives, replace=True
)
negatives = final_df[final_df["sentiment"] == "neutral"].sample(
    num_positives, replace=True
)

over_sampled = pd.concat([positives, neutrals, negatives], axis=0).reset_index(
    drop=True
)

In [44]:
# X = final_df["text"]
# y = final_df["sentiment"]

X = over_sampled["text"]

y = over_sampled["sentiment"]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2024, stratify=y
)

In [46]:
# Re-attach each label column to its text column and renumber the rows from zero.
train_df, test_df = (
    pd.concat([features, labels], axis=1).reset_index(drop=True)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [47]:
# Write both splits to disk without the pandas index column.
for split_frame, split_path in (
    (train_df, "../data/tws-train.csv"),
    (test_df, "../data/tws-test.csv"),
):
    split_frame.to_csv(split_path, index=False)