In [2]:
import pandas as pd

# Load the labelled Instagram comments; the CSV is read from the working directory.
df = pd.read_csv('dataset_komentar_instagram_cyberbullying.csv')

# Encode the textual sentiment labels as integers.
label_to_int = {
    'negative': -1,
    'positive': 1,
}
encoded_sentiment = df['Sentiment'].map(label_to_int)

# Series.map yields NaN for every value that is not a key of the mapping, so a
# misspelled or differently-cased label would silently become a missing target.
# Fail loudly here instead of letting it surface (or not) during training.
unmapped_labels = df.loc[encoded_sentiment.isna(), 'Sentiment']
if not unmapped_labels.empty:
    raise ValueError(
        f"Unexpected values in 'Sentiment' column: {unmapped_labels.unique().tolist()}"
    )

df['Sentiment'] = encoded_sentiment

print(df.head())

   Id  Sentiment                             Instagram Comment Text
0   1         -1   <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1   2         -1  Geblek lo tata...cowo bgt dibela2in balikan......
2   3         -1  Kmrn termewek2 skr lengket lg duhhh kok labil ...
3   4         -1  Intinya kalau kesel dengan ATT nya, gausah ke ...
4   5         -1  hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...


In [3]:
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import re

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# Stopword removal is done in preprocess(), not by the stemmer, so the factory
# is called without arguments.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stored under its own name so the imported `stopwords` corpus module is not
# overwritten; a frozenset gives constant-time membership tests.
indonesian_stopwords = frozenset(stopwords.words('indonesian'))


def preprocess(text):
    """Turn one raw comment into a space-separated string of stemmed tokens.

    Steps, in order: replace every non-word character with a space,
    lowercase, split on whitespace, drop Indonesian stopwords, and stem
    each remaining token.

    Args:
        text: Raw comment text. Any non-string value (for example the NaN
            pandas uses for an empty cell) is treated as an empty comment.

    Returns:
        str: The stemmed tokens joined by single spaces, or '' when
        ``text`` is not a string.
    """
    # Guard against missing values: re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    tokens = re.sub(r'\W', ' ', text).lower().split()
    return ' '.join(
        stemmer.stem(token)
        for token in tokens
        if token not in indonesian_stopwords
    )


# Store the cleaned text next to the raw column, leaving the original intact.
df['clean'] = df['Instagram Comment Text'].apply(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.model_selection import train_test_split

# Features are the cleaned comments; targets are the sentiment labels.
x, y = df['clean'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary (capped at 5000 terms) and IDF weights from the training
# comments only, then project both splits with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(x_train)

xtrainfidf = vectorizer.transform(x_train)
xtesttfidf = vectorizer.transform(x_test)

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit returns the estimator itself) and predict the held-out split.
model = MultinomialNB().fit(xtrainfidf, y_train)
ypred = model.predict(xtesttfidf)

# Compare the predictions with the true held-out labels.
accuracy = accuracy_score(y_test, ypred)
report = classification_report(y_test, ypred)

print("Akurasi model : ", accuracy)
print("Detail laporan klasifikasi : \n", report)

Akurasi model :  0.9125
Detail laporan klasifikasi : 
               precision    recall  f1-score   support

          -1       0.95      0.89      0.92        44
           1       0.87      0.94      0.91        36

    accuracy                           0.91        80
   macro avg       0.91      0.92      0.91        80
weighted avg       0.92      0.91      0.91        80



In [8]:
import joblib
from sklearn.ensemble import ExtraTreesClassifier

# Train the classifier that actually gets persisted. Note that this rebinds
# `model`, replacing the Naive Bayes estimator fitted earlier in the notebook.
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(xtrainfidf, y_train)

# The metrics printed earlier in the notebook describe the Naive Bayes model,
# not this one. Score the ExtraTrees model on the same held-out split so the
# saved artifact has its own recorded performance.
ypred_extratrees = model.predict(xtesttfidf)
print("Akurasi model ExtraTrees : ", accuracy_score(y_test, ypred_extratrees))
print(
    "Detail laporan klasifikasi ExtraTrees : \n",
    classification_report(y_test, ypred_extratrees),
)

# Persist the classifier together with the vectorizer it was trained with;
# new text must be transformed by this vectorizer before calling predict.
joblib.dump(model, 'indonesiamodel.pkl')
joblib.dump(vectorizer, 'indonesiavectorizer.pkl')

['indonesiavectorizer.pkl']