In [33]:
import pandas as pd

# Load the labelled Instagram comments from CSV and preview the first rows.
df = pd.read_csv('dataset_komentar_instagram_cyberbullying.csv')
print(df.head())

# Encode the textual sentiment labels as integers: negative -> -1, positive -> 1.
# Series.map turns any label that is missing from the mapping into NaN.
label_to_int = dict(negative=-1, positive=1)
df['Sentiment'] = df['Sentiment'].map(label_to_int)

# Preview the encoded labels.
print(df['Sentiment'].head())

   Id Sentiment                             Instagram Comment Text
0   1  negative   <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1   2  negative  Geblek lo tata...cowo bgt dibela2in balikan......
2   3  negative  Kmrn termewek2 skr lengket lg duhhh kok labil ...
3   4  negative  Intinya kalau kesel dengan ATT nya, gausah ke ...
4   5  negative  hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...
0   -1
1   -1
2   -1
3   -1
4   -1
Name: Sentiment, dtype: int64


In [1]:
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import re

# Fetch the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')

# Initialize the Sastrawi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Indonesian stopwords, kept as a set because preprocess() tests membership per token
stop_words = set(stopwords.words('indonesian'))

# Text preprocessing function
def preprocess(text):
    """Normalize one raw comment into a space-separated string of stemmed tokens.

    Steps: replace every non-word character with a space, lowercase the text,
    split it on whitespace, drop tokens that are in ``stop_words`` and stem the
    remaining tokens with ``stemmer``. Digits and underscores count as word
    characters for the regex used here, so they are kept.

    Args:
        text: Raw comment text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty comment.

    Returns:
        The cleaned text, or an empty string when no token survives.

    Raises:
        TypeError: If ``text`` is any other non-string value (raised by
            ``re.sub``).
    """
    # Missing values would otherwise crash re.sub; NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Replace every non-word character (punctuation, symbols) with a space
    text = re.sub(r'\W', ' ', text)

    # Convert the text to lowercase
    text = text.lower()

    # Drop stopwords first, then stem only the tokens that remain
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    return ' '.join(words)

# Run every raw comment through preprocess() and store the result next to the original text.
raw_comments = df['Instagram Comment Text']
df['cleaned_text'] = raw_comments.map(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

KeyboardInterrupt



In [35]:
from sklearn.model_selection import train_test_split

# Features are the preprocessed comments; targets are the numeric sentiment labels
# (-1 for negative, 1 for positive, as encoded when the data was loaded).
X, y = df['cleaned_text'], df['Sentiment']

# Keep 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and IDF weights from the training split only...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ...and reuse the fitted vectorizer on the test split so no test data influences the features.
X_test_tfidf = vectorizer.transform(X_test)

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a multinomial Naive Bayes classifier and train it on the TF-IDF training
# matrix (fit() returns the estimator itself, so the two steps chain).
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix
y_pred = model.predict(X_test_tfidf)

# Evaluate: overall accuracy plus the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Akurasi:", accuracy)
print("Laporan Klasifikasi:\n", report)

Akurasi: 0.9125
Laporan Klasifikasi:
               precision    recall  f1-score   support

          -1       0.95      0.89      0.92        44
           1       0.87      0.94      0.91        36

    accuracy                           0.91        80
   macro avg       0.91      0.92      0.91        80
weighted avg       0.92      0.91      0.91        80



In [39]:
import joblib
from sklearn.ensemble import ExtraTreesClassifier

# Train the Extra Trees classifier that is persisted below. This rebinds `model`,
# replacing the Naive Bayes classifier fitted earlier in the notebook.
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# The accuracy reported earlier belongs to the Naive Bayes model, not to this one,
# so measure this model on the held-out test set before saving it.
print("Akurasi Extra Trees:", model.score(X_test_tfidf, y_test))

# Save the trained model
joblib.dump(model, 'sentiment_model.pkl')

# Save the fitted TF-IDF vectorizer; it has to be loaded together with the model
# so that new text is transformed with the same vocabulary.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']