# Preprocessing

## Instalasi dan Import Library

In [26]:
# Instalasi dan import library
!pip install Sastrawi --quiet

import os
import re
import csv
import string
import nltk
import pandas as pd
import requests
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split

# Download the NLTK resources this notebook relies on (tokenizer data and stopword lists)
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Register tqdm with pandas so that Series/DataFrame expose progress_apply
tqdm.pandas()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load Dataset

In [27]:
# Location of the raw dataset CSV (served from the project's GitHub repository)
RAW_DATASET_URL = 'https://raw.githubusercontent.com/MIAbidin/Sentimen-Timnas-App/refs/heads/main/Notebook/dataset/dataset_raw.csv'

df = pd.read_csv(RAW_DATASET_URL)
df

Unnamed: 0,id_file,name_file,created_at,username,text,sentiment
0,21.0,C2Se75epJj5,2024-01-19 16:37:56,arief_engrazefai,🔥🔥🔥,positive
1,38.0,C2o,2024-01-28 10:55:29,muhahsanitaqwim,🔥🔥🔥,neutral
2,21.0,C2Se75epJj5,2024-01-19 16:54:53,ugie07,Topp 🔥❤️,positive
3,33.0,C2fBBTFrLwh,2024-01-24 13:35:05,neneng_andriyani,"Alhamdulillah... Tidak apa-apa, yang penting b...",positive
4,38.0,C2o,2024-01-28 11:49:24,kampusturkicom,Doa terbaik untuk timnas Indonesia. Insya Alla...,positive
...,...,...,...,...,...,...
21040,25.0,C2TtGhvu4Bm,2024-01-20 04:34:16,herysajaherysaja,Semoga lawan Jepang pemain tetap rendah diri.....,positive
21041,4.0,C18P695JpH4,2024-01-11 02:16:40,gunturputrarizky,Gazzzz Garudaakkuuuu 🔥🔥🔥🙌🙌,positive
21042,5.0,C2Ek9lfL7uE,2024-01-14 07:39:31,aditya0484,@nadeowinataa yessss semangat bro 🔥🔥🔥🔥,positive
21043,9.0,C2H2QelL3Lt,2024-01-15 14:58:51,arya_________w,"Struick line up mulu anj, dari pertama main di...",negative


In [5]:
# Drop every row that has at least one missing value. This mutates df in place,
# so re-running the load cell is required to get the untouched frame back.
df.dropna(axis=0, how='any', inplace=True)

# Per-column count of remaining nulls (expected to be all zeros after the drop)
df.isna().sum()

Unnamed: 0,0
id_file,0
name_file,0
created_at,0
username,0
text,0
sentiment,0


## Fungsi-Fungsi Preprocessing

In [9]:
# Slang lexicon CSV; its 'tidak_baku' column is used as the lookup key and
# 'kata_baku' as the replacement value.
SLANG_DICT_URL = "https://raw.githubusercontent.com/MIAbidin/Sentimen-Timnas-App/refs/heads/main/Notebook/dataset/slang_dict.csv"

slang_df = pd.read_csv(SLANG_DICT_URL)
slang_dict = {
    informal: formal
    for informal, formal in zip(slang_df['tidak_baku'], slang_df['kata_baku'])
}

# Text cleaning function
def clean_text(text):
    """Strip social-media noise from a raw comment.

    The input is coerced to ``str`` and then, in order: @mentions, ``#`` and
    ``_`` characters, retweet markers, URLs, digits, punctuation/symbols and
    emoji-range characters are removed. Whitespace runs are finally collapsed
    to single spaces and the result is trimmed.

    Args:
        text: The raw comment. Any object is accepted; it is passed through
            ``str()`` first.

    Returns:
        str: The cleaned text; may be an empty string (e.g. for emoji-only
        comments).
    """
    text = str(text)
    # Usernames may contain '_' and '.', so match them as part of the handle;
    # otherwise "@some_user" leaves "user" behind once '_' is stripped below.
    text = re.sub(r'@[A-Za-z0-9_.]+', '', text)  # Remove mentions
    text = re.sub(r'[_#]', '', text)  # Remove '#' and '_'
    # \b keeps words that merely end in "RT" (e.g. "SMART ") intact.
    text = re.sub(r'\bRT\s+', '', text)  # Remove retweet markers
    text = re.sub(r"http\S+", '', text)  # Remove URLs
    text = re.sub(r'[0-9]+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and other non-word symbols
    # NOTE: the last range (U+24C2-U+1F251) is very wide and also covers
    # non-emoji characters such as CJK ideographs.
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]', '', text)  # Remove emojis
    # Removed spans leave gaps behind; collapse them and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Text processing pipeline functions
def casefold_text(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def fix_slangwords(text):
    """Replace slang words in ``text`` using the module-level ``slang_dict``.

    The text is split on whitespace; each word is looked up by its lowercase
    form and replaced when an entry exists, otherwise kept unchanged. The
    words are re-joined with single spaces.
    """
    normalized = []
    for token in text.split():
        normalized.append(slang_dict.get(token.lower(), token))
    return ' '.join(normalized)

def tokenizing_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

# Additional colloquial words filtered on top of the NLTK stopword lists.
_EXTRA_STOPWORDS = ("yah", "abal", "yt", "yth", "yu", "yuk", "ml", 'iya', 'yaa', 'gak', 'nya', 'na', 'sih', 'ku', "di", "ga", "ya", "gaa", "loh", "kah", "woi", "woii", "woy", "bro", "dr", "mas", "wkwk", "aja", "bang", "si", "lu", "yg")

# Combined stopword set; built lazily on the first call and reused afterwards.
_STOPWORDS_CACHE = None


def _load_stopwords():
    """Build the combined stopword set once and return the cached copy."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        combined = set(stopwords.words('indonesian'))
        combined.update(stopwords.words('english'))
        combined.update(_EXTRA_STOPWORDS)
        _STOPWORDS_CACHE = frozenset(combined)
    return _STOPWORDS_CACHE


def remove_stopwords(text):
    """Drop stopwords from a sequence of tokens.

    The stopword set is the union of NLTK's Indonesian and English lists plus
    the extra words in ``_EXTRA_STOPWORDS``. It is built only on the first
    call; previously both corpus lists were re-read and the set rebuilt for
    every row, which dominated the runtime of this pipeline stage.

    Args:
        text: Iterable of tokens (e.g. the list returned by
            ``tokenizing_text``).

    Returns:
        list: The tokens that are not in the stopword set, in original order.
    """
    listStopwords = _load_stopwords()
    return [txt for txt in text if txt not in listStopwords]

# Sastrawi stemmer instance; created lazily on the first call and reused afterwards.
_STEMMER_CACHE = None


def _get_stemmer():
    """Create the Sastrawi stemmer once and return the cached instance."""
    global _STEMMER_CACHE
    if _STEMMER_CACHE is None:
        _STEMMER_CACHE = StemmerFactory().create_stemmer()
    return _STEMMER_CACHE


def stemming_text(text):
    """Stem every whitespace-separated word of ``text`` with Sastrawi.

    The stemmer is constructed only once and shared between calls; building a
    new factory and stemmer for every input (as before) repeats the stemmer
    setup for each row.

    Args:
        text: A string of whitespace-separated words.

    Returns:
        str: The stemmed words joined by single spaces.
    """
    stemmer = _get_stemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

## Proses Pipeline Preprocessing

In [10]:
# Each stage reads the column produced by the previous one and stores its
# result in a new column, so every intermediate form stays inspectable.
# Tuple layout: (output column, input column, transformation).
pipeline_stages = [
    ('text_clean', 'text', clean_text),
    ('text_casefolded', 'text_clean', casefold_text),
    ('text_slangfixed', 'text_casefolded', fix_slangwords),
    ('text_tokenized', 'text_slangfixed', tokenizing_text),
    ('text_filtered', 'text_tokenized', remove_stopwords),
    # Join the surviving tokens back into a single space-separated string
    ('text_final', 'text_filtered', lambda x: ' '.join(x)),
]

for output_col, input_col, stage_fn in pipeline_stages:
    df[output_col] = df[input_col].progress_apply(stage_fn)


100%|██████████| 21023/21023 [00:00<00:00, 87807.01it/s]
100%|██████████| 21023/21023 [00:00<00:00, 1080108.93it/s]
100%|██████████| 21023/21023 [00:00<00:00, 335986.06it/s]
100%|██████████| 21023/21023 [00:01<00:00, 10696.59it/s]
100%|██████████| 21023/21023 [00:10<00:00, 2057.51it/s]
100%|██████████| 21023/21023 [00:00<00:00, 798081.68it/s]


In [11]:
# Display the frame, now including every intermediate preprocessing column
df

Unnamed: 0,id_file,name_file,created_at,username,text,sentiment,text_clean,text_casefolded,text_slangfixed,text_tokenized,text_filtered,text_final
0,21.0,C2Se75epJj5,2024-01-19 16:37:56,arief_engrazefai,🔥🔥🔥,positive,,,,[],[],
1,38.0,C2o,2024-01-28 10:55:29,muhahsanitaqwim,🔥🔥🔥,neutral,,,,[],[],
2,21.0,C2Se75epJj5,2024-01-19 16:54:53,ugie07,Topp 🔥❤️,positive,Topp,topp,topp,[topp],[topp],topp
3,33.0,C2fBBTFrLwh,2024-01-24 13:35:05,neneng_andriyani,"Alhamdulillah... Tidak apa-apa, yang penting b...",positive,Alhamdulillah Tidak apaapa yang penting bisa m...,alhamdulillah tidak apaapa yang penting bisa m...,alhamdulillah tidak apaapa yang penting bisa m...,"[alhamdulillah, tidak, apaapa, yang, penting, ...","[alhamdulillah, apaapa, mencetak, gol, gawang,...",alhamdulillah apaapa mencetak gol gawang jepang
4,38.0,C2o,2024-01-28 11:49:24,kampusturkicom,Doa terbaik untuk timnas Indonesia. Insya Alla...,positive,Doa terbaik untuk timnas Indonesia Insya Allah...,doa terbaik untuk timnas indonesia insya allah...,doa terbaik untuk timnas indonesia insya allah...,"[doa, terbaik, untuk, timnas, indonesia, insya...","[doa, terbaik, timnas, indonesia, insya, allah...",doa terbaik timnas indonesia insya allah menang
...,...,...,...,...,...,...,...,...,...,...,...,...
21040,25.0,C2TtGhvu4Bm,2024-01-20 04:34:16,herysajaherysaja,Semoga lawan Jepang pemain tetap rendah diri.....,positive,Semoga lawan Jepang pemain tetap rendah diri A...,semoga lawan jepang pemain tetap rendah diri a...,semoga lawan jepang pemain tetap rendah diri a...,"[semoga, lawan, jepang, pemain, tetap, rendah,...","[semoga, lawan, jepang, pemain, rendah, ayo, b...",semoga lawan jepang pemain rendah ayo buktikan
21041,4.0,C18P695JpH4,2024-01-11 02:16:40,gunturputrarizky,Gazzzz Garudaakkuuuu 🔥🔥🔥🙌🙌,positive,Gazzzz Garudaakkuuuu,gazzzz garudaakkuuuu,gazzzz garudaakkuuuu,"[gazzzz, garudaakkuuuu]","[gazzzz, garudaakkuuuu]",gazzzz garudaakkuuuu
21042,5.0,C2Ek9lfL7uE,2024-01-14 07:39:31,aditya0484,@nadeowinataa yessss semangat bro 🔥🔥🔥🔥,positive,yessss semangat bro,yessss semangat bro,yessss semangat bro,"[yessss, semangat, bro]","[yessss, semangat]",yessss semangat
21043,9.0,C2H2QelL3Lt,2024-01-15 14:58:51,arya_________w,"Struick line up mulu anj, dari pertama main di...",negative,Struick line up mulu anj dari pertama main di ...,struick line up mulu anj dari pertama main di ...,rafael struick line up mulu anjing dari perta...,"[rafael, struick, line, up, mulu, anjing, dari...","[rafael, struick, line, mulu, anjing, bermain,...",rafael struick line mulu anjing bermain timnas...


## Pilih Kolom & Hapus Baris Teks Kosong

In [12]:
# Keep only the fully processed text and its sentiment label
selected_columns = ['text_final', 'sentiment']
df_select = df.loc[:, selected_columns]
df_select

Unnamed: 0,text_final,sentiment
0,,positive
1,,neutral
2,topp,positive
3,alhamdulillah apaapa mencetak gol gawang jepang,positive
4,doa terbaik timnas indonesia insya allah menang,positive
...,...,...
21040,semoga lawan jepang pemain rendah ayo buktikan,positive
21041,gazzzz garudaakkuuuu,positive
21042,yessss semangat,positive
21043,rafael struick line mulu anjing bermain timnas...,negative


In [15]:
# Discard rows whose processed text is empty or whitespace-only
# (e.g. comments that consisted solely of emoji or stopwords).
has_text = df_select['text_final'].str.strip().ne('')
df_select = df_select[has_text]

In [17]:
# Display the selected columns after rows with empty text were removed
df_select

Unnamed: 0,text_final,sentiment
2,topp,positive
3,alhamdulillah apaapa mencetak gol gawang jepang,positive
4,doa terbaik timnas indonesia insya allah menang,positive
6,gol membawa timnas lolos babak bahrain besok k...,positive
7,melia kitagaruda,positive
...,...,...
21038,hadeh lihat statistik komen,neutral
21040,semoga lawan jepang pemain rendah ayo buktikan,positive
21041,gazzzz garudaakkuuuu,positive
21042,yessss semangat,positive


## Split Data Train, Validasi, dan Test

In [24]:
# Two-step stratified split with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) hold out 20% of the remainder as the validation set.
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.2

train_val_df, test_df = train_test_split(
    df_select,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df_select['sentiment'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
    stratify=train_val_df['sentiment'],
)

# Report the size of each split
for label, split_frame in (
    ("Train shape:", train_df),
    ("Validation shape:", val_df),
    ("Test shape:", test_df),
):
    print(label, split_frame.shape)


Train shape: (10976, 2)
Validation shape: (2744, 2)
Test shape: (3430, 2)


## Simpan Dataset

In [25]:
# Write each split as a tab-separated file without header or index;
# columns are written in frame order (text_final, sentiment).
split_outputs = (
    (train_df, "train.tsv"),
    (val_df, "valid.tsv"),
    (test_df, "test.tsv"),
)
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path, sep="\t", index=False, header=False)