In [1]:
# Install the third-party packages for this notebook into the current environment.
# NOTE(review): no versions are pinned, and gensim is installed here although
# none of the cells visible in this notebook import it — confirm it is needed.
!pip install nltk sastrawi scikit-learn gensim pandas numpy

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download the NLTK resources (if they are not already present).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

print("Library berhasil diimport.")

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m209.7/209.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m27.9/27.9 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi, gensim
Successfully installed gensim-4.4.0 sastrawi-1.0.1
Library berhasil diimport.


In [5]:
# Load the review export from CSV and preview its first rows.
DATASET_PATH = 'dataset_stella_sora_original.csv'
df_reviews = pd.read_csv(DATASET_PATH)
df_reviews.head()

Unnamed: 0,username,rating,text,date,thumbsUp,replyContent,replyDate,translated_text,clean_text
0,Pengguna Google,5,bagus ROUGEKILL,2025-10-27 16:47:42,0,,,bagus ROUGEKILL,bagus ROUGEKILL
1,Pengguna Google,5,game keren udah ga kikir,2025-10-27 16:24:45,0,,,game keren udah ga kikir,game keren udah ga kikir
2,Pengguna Google,5,bagus,2025-10-27 16:11:57,0,,,bagus,bagus
3,Pengguna Google,1,akhirnya berbenah ü§è,2025-10-27 16:08:43,0,,,akhirnya berbenah ü§è,akhirnya berbenah ü§è
4,Pengguna Google,4,"gacha mahal, kontrol agak ngelag UI, gameplay,...",2025-10-27 15:55:27,0,,,"gacha mahal, kontrol agak ngelag UI, gameplay,...","gacha mahal, kontrol agak ngelag UI, gameplay,..."


In [6]:
# Cleaning: drop reviews whose text is missing, too short, or blank.
# NOTE: the length/blank filters read the 'clean_text' column that is already
# present in the loaded CSV; the preprocessing cell recomputes that column later.
MIN_CLEAN_TEXT_LEN = 3  # shortest 'clean_text' (in characters) that is kept

# Remember the size actually loaded so the summary is right for any dataset.
n_before_filtering = len(df_reviews)
print(f"Jumlah data sebelum filtering: {n_before_filtering}")
df_reviews = df_reviews[df_reviews['text'].notna()]
print(f"Setelah hapus text None: {len(df_reviews)}")
# Rows whose 'clean_text' is NaN compare as False here and are dropped as well.
df_reviews = df_reviews[df_reviews['clean_text'].str.len() >= MIN_CLEAN_TEXT_LEN]
print(f"Setelah hapus text < {MIN_CLEAN_TEXT_LEN} karakter: {len(df_reviews)}")
df_reviews = df_reviews[df_reviews['clean_text'].str.strip() != '']
print(f"Setelah hapus text kosong: {len(df_reviews)}")

# Report the drop relative to the loaded row count instead of a fixed 15000.
print(f"\nTotal data yang dihapus: {n_before_filtering - len(df_reviews)}")
print(f"Total data final: {len(df_reviews)}")

Jumlah data sebelum filtering: 15000
Setelah hapus text None: 15000
Setelah hapus text < 5 karakter: 14929
Setelah hapus text kosong: 14929

Total data yang dihapus: 71
Total data final: 14929


In [7]:
# Preprocessing setup: build the Sastrawi stop-word remover and stemmer once at
# module level so that clean_text() can reuse the same objects for every review.
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()

stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def clean_text(text):
    """Normalise one raw review string before vectorisation.

    The text is lower-cased, then URLs, @mentions, #hashtags and digits are
    removed, every remaining character that is neither a word character nor
    whitespace is replaced by a space, and whitespace runs are collapsed.
    The result is passed through the module-level ``stopword_remover`` and
    ``stemmer``.

    Args:
        text: Raw review text, or a missing value (None/NaN).

    Returns:
        The cleaned text, or an empty string when ``text`` is missing.
    """
    # Missing reviews collapse to an empty string.
    if pd.isna(text):
        return ""

    # (pattern, replacement, flags), applied in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'@\w+', '', 0),                                # @mentions
        (r'#\w+', '', 0),                                # #hashtags
        (r'\d+', '', 0),                                 # digits
        (r'[^\w\s]', ' ', 0),                            # punctuation / symbols
        (r'\s+', ' ', 0),                                # whitespace runs
    )

    normalised = text.lower()
    for pattern, replacement, flags in substitutions:
        normalised = re.sub(pattern, replacement, normalised, flags=flags)

    without_stopwords = stopword_remover.remove(normalised.strip())
    return stemmer.stem(without_stopwords)

# Re-clean every review; this overwrites the 'clean_text' column that came with the CSV.
print("proses 'text'...")
df_reviews['clean_text'] = df_reviews['text'].apply(clean_text)
print("Preprocessing selesai!")

# Show up to PREVIEW_ROWS before/after pairs. head() stops at the end of the
# frame, so a dataset with fewer rows no longer raises IndexError.
PREVIEW_ROWS = 5
preview = df_reviews.head(PREVIEW_ROWS)
for position, (original, cleaned) in enumerate(
    zip(preview['text'], preview['clean_text']), start=1
):
    print(f"\n[{position}]")
    print(f"Original: {original[:100]}...")
    print(f"Cleaned:  {cleaned[:100]}...")

proses 'text'...
Preprocessing selesai!

[1]
Original: bagus ROUGEKILL...
Cleaned:  bagus rougekill...

[2]
Original: game keren udah ga kikir...
Cleaned:  game keren udah ga kikir...

[3]
Original: bagus...
Cleaned:  bagus...

[4]
Original: akhirnya berbenah ü§è...
Cleaned:  akhir benah...

[5]
Original: gacha mahal, kontrol agak ngelag UI, gameplay, animasi, skill, desain, pergerakan, tampilan udah bag...
Cleaned:  gacha mahal kontrol ngelag ui gameplay animasi skill desain gera tampil udah bagus...


In [8]:
#labeling
def label_sentiment(rating):
    """Map a numeric star rating to a three-class sentiment label.

    Args:
        rating: Star rating of a review; it is compared numerically.

    Returns:
        'Negatif' for ratings <= 2, 'Netral' for a rating of exactly 3 and
        'Positif' for every other (non-missing) rating.

    Raises:
        ValueError: If ``rating`` is missing (None/NaN). Without this check a
            NaN rating fails both comparisons and would be labelled 'Positif'.
    """
    if pd.isna(rating):
        raise ValueError("rating is missing; cannot derive a sentiment label")
    if rating <= 2:
        return 'Negatif'
    if rating == 3:
        return 'Netral'
    return 'Positif'

# Attach the rating-derived sentiment label to every review.
df_reviews['sentiment'] = df_reviews['rating'].apply(label_sentiment)

# Absolute class frequencies.
sentiment_counts = df_reviews['sentiment'].value_counts()
print(sentiment_counts)

# Relative class frequencies, printed as percentages with two decimals.
sentiment_pct = df_reviews['sentiment'].value_counts(normalize=True) * 100
for label, pct in zip(sentiment_pct.index, sentiment_pct.tolist()):
    print(f"{label}: {pct:.2f}%")

sentiment
Negatif    9619
Positif    4730
Netral      580
Name: count, dtype: int64
Negatif: 64.43%
Positif: 31.68%
Netral: 3.89%


In [9]:
# Encoding: pick the feature/target columns and turn the sentiment labels into integer ids.
X = df_reviews['clean_text']
y = df_reviews['sentiment']

print("\nData yang akan digunakan untuk modeling:")
print("  X (Features) : kolom 'clean_text' ")
print("  y (Target)   : kolom 'sentiment' ")
print(f"  Total sampel : {len(X)}")

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the label -> id mapping together with each class's size and share.
separator = '=' * 70
print(f"\n{separator}")
print("Mapping")
print(separator)
n_encoded = len(y_encoded)
for i, label in enumerate(label_encoder.classes_):
    count = (y_encoded == i).sum()
    share = count / n_encoded * 100
    print(f"  {label:8s} ‚Üí {i}  (jumlah: {count:5d} sampel | {share:5.2f}%)")


Data yang akan digunakan untuk modeling:
  X (Features) : kolom 'clean_text' 
  y (Target)   : kolom 'sentiment' 
  Total sampel : 14929

Mapping
  Negatif  ‚Üí 0  (jumlah:  9619 sampel | 64.43%)
  Netral   ‚Üí 1  (jumlah:   580 sampel |  3.89%)
  Positif  ‚Üí 2  (jumlah:  4730 sampel | 31.68%)


In [10]:
# Hold out 30% of the samples for testing, stratified on the encoded labels.
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'stratify': y_encoded,
}
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y_encoded, **split_options)

# Report the size of each partition and its share of all samples.
n_total = len(X)
n_train = len(X_train_3)
n_test = len(X_test_3)
print("\nPembagian Data:")
print(f"  Training: {n_train} samples ({n_train/n_total*100:.1f}%)")
print(f"  Testing:  {n_test} samples ({n_test/n_total*100:.1f}%)")


print("\nMelakukan TF-IDF Vectorization...")
# Vectoriser settings: unigrams and bigrams, capped at 10000 features, with
# document-frequency bounds min_df=2 and max_df=0.8.
tfidf_options = dict(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
)
tfidf_vectorizer_3 = TfidfVectorizer(**tfidf_options)

# Fit on the training split only, then transform the test split with the fitted vectoriser.
X_train_tfidf_3 = tfidf_vectorizer_3.fit_transform(X_train_3)
X_test_tfidf_3 = tfidf_vectorizer_3.transform(X_test_3)

for matrix_name, matrix in (
    ("X_train_tfidf_3", X_train_tfidf_3),
    ("X_test_tfidf_3", X_test_tfidf_3),
):
    print(f"Shape {matrix_name}: {matrix.shape}")


print("\nMelatih model Random Forest...")
# Hyper-parameters are collected in one mapping so they are easy to scan and tweak.
rf_hyperparameters = {
    'n_estimators': 200,
    'max_depth': 50,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model_rf_3 = RandomForestClassifier(**rf_hyperparameters)

# Train on the TF-IDF features of the training split.
model_rf_3.fit(X_train_tfidf_3, y_train_3)
print("selesai!")


# Predict on both splits so training and test accuracy can be compared.
y_train_pred_3, y_test_pred_3 = (
    model_rf_3.predict(features)
    for features in (X_train_tfidf_3, X_test_tfidf_3)
)

train_acc_3 = accuracy_score(y_train_3, y_train_pred_3)
test_acc_3 = accuracy_score(y_test_3, y_test_pred_3)

print(f"Akurasi Training: {train_acc_3:.4f}")
print(f"Akurasi Testing:  {test_acc_3:.4f}")

# Summary record for this experiment configuration.
skema3_results = dict(
    skema='Skema 3',
    model='Random Forest',
    feature='TF-IDF',
    split='70/30',
    train_acc=train_acc_3,
    test_acc=test_acc_3,
)

print("\n--- Hasil Evaluasi Model Random Forest ---")

# Reuse the test-set predictions already computed from model_rf_3 instead of
# running predict() a second time on the same fitted model and features.
y_test_pred_rf = y_test_pred_3

print(f"Accuracy: {accuracy_score(y_test_3, y_test_pred_rf):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_3, y_test_pred_rf))

print("\nClassification Report:")
# Show the sentiment names instead of the encoded ids 0/1/2. Passing `labels`
# explicitly keeps ids and names aligned even if a class is absent from the
# test predictions.
class_ids = np.arange(len(label_encoder.classes_))
print(
    classification_report(
        y_test_3,
        y_test_pred_rf,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
)


Pembagian Data:
  Training: 10450 samples (70.0%)
  Testing:  4479 samples (30.0%)

Melakukan TF-IDF Vectorization...
Shape X_train_tfidf_3: (10450, 10000)
Shape X_test_tfidf_3: (4479, 10000)

Melatih model Random Forest...
selesai!
Akurasi Training: 0.9198
Akurasi Testing:  0.8656

--- Hasil Evaluasi Model Random Forest ---
Accuracy: 0.8656

Confusion Matrix:
[[2625    6  255]
 [  34  120   20]
 [ 282    5 1132]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2886
           1       0.92      0.69      0.79       174
           2       0.80      0.80      0.80      1419

    accuracy                           0.87      4479
   macro avg       0.87      0.80      0.83      4479
weighted avg       0.87      0.87      0.86      4479



In [11]:
# Install pipreqs, export this notebook to a script with nbconvert, then run
# pipreqs on the current directory (the message below says this creates or
# updates requirements.txt).
# NOTE(review): the captured output of this cell shows nbconvert printing its
# usage help, which suggests 'model.ipynb' was not found or converted — confirm
# the notebook filename. The success messages below are printed unconditionally,
# regardless of whether the shell commands succeeded.
!pip install pipreqs
!jupyter nbconvert --to script model.ipynb
!pipreqs . --force

print("\n--- Selesai! ---")
print("File requirements.txt telah dibuat atau diperbarui.")

Collecting pipreqs
  Downloading pipreqs-0.5.0-py3-none-any.whl.metadata (7.9 kB)
Collecting docopt==0.6.2 (from pipreqs)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipython==8.12.3 (from pipreqs)
  Downloading ipython-8.12.3-py3-none-any.whl.metadata (5.7 kB)
Collecting yarg==0.1.9 (from pipreqs)
  Downloading yarg-0.1.9-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting jedi>=0.16 (from ipython==8.12.3->pipreqs)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting stack-data (from ipython==8.12.3->pipreqs)
  Downloading stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)
Collecting executing>=1.2.0 (from stack-data->ipython==8.12.3->pipreqs)
  Downloading executing-2.2.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.1.0 (from stack-data->ipython==8.12.3->pipreqs)
  Downloading asttokens-3.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting pure-eval (from stack-data->ipython==8.12.3->pipr

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr