In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
! pip install accelerate -U
! pip install tokenizers
! pip install transformers datasets evaluate

Collecting accelerate
  Downloading accelerate-1.0.0-py3-none-any.whl.metadata (19 kB)
Collecting torch>=1.10.0 (from accelerate)
  Downloading torch-2.4.1-cp310-cp310-win_amd64.whl.metadata (27 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.3 (from accelerate)
  Downloading safetensors-0.4.5-cp310-none-win_amd64.whl.metadata (3.9 kB)
Collecting filelock (from huggingface-hub>=0.21.0->accelerate)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.21.0->accelerate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingface-hub>=0.21.0->accelerate)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch>=1.10.0->accelerate)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Downloading accelerat

In [36]:
from datasets import load_dataset

poetry = load_dataset("aditdwi123/cyber-bullying-dataset")

print(poetry)

DatasetDict({
    train: Dataset({
        features: ['id', 'kalimat', 'sentimen'],
        num_rows: 1103
    })
})


In [14]:
print(poetry['train'])

Dataset({
    features: ['id', 'kalimat', 'sentimen'],
    num_rows: 1103
})


In [37]:
import re
import string

# Define the text cleaning function
def clean_text(example):
    # Menghapus URL
    example['cleaned_text'] = re.sub(r'http\S+|www\S+|https\S+', '', example['kalimat'], flags=re.MULTILINE)
    # Menghapus mention (@username) dan hashtag (#hashtag)
    example['cleaned_text'] = re.sub(r'\@\w+|\#', '', example['cleaned_text'])
    # Menghapus angka
    example['cleaned_text'] = re.sub(r'\d+', '', example['cleaned_text'])
    # Menghapus tanda baca
    example['cleaned_text'] = example['cleaned_text'].translate(str.maketrans('', '', string.punctuation))
    # Mengubah ke huruf kecil
    example['cleaned_text'] = example['cleaned_text'].lower()
    return example

# Apply the cleaning function to the 'train' split
poetry['train'] = poetry['train'].map(clean_text)

# Display the first few rows of the cleaned text
print(poetry['train'].select(range(5))['cleaned_text'])


['jual makanan anjing dog food happy dog murah harga promo', 'jual grosir makanan anjing makanan kucing untuk petshop', 'jangan mentangmentang lu anak gaul pas dikejar anjing bukannya lari malah bilang terus gue harus kabur sambil bilang wow gitu', 'males itu kalo kerja pagi trus gak ada yg nganter anjing', 'pagi ini cuma mau panggil anjing aja buat elo yang naik motornya ugal ugalan']


In [38]:
# Mapping 'sentimen' to 0 for 'positif' and 1 for other values (e.g., 'negatif')
def map_labels(example):
    # If 'sentimen' is 'positif', set 'CB' to 0 (non-cyberbullying), else set it to 1
    example['CB'] = 0 if example['sentimen'] == 'positif' else 1
    return example

# Apply the mapping function to the 'train' dataset
poetry['train'] = poetry['train'].map(map_labels)

# Select and display the 'sentimen' and 'CB' columns
print(poetry['train'].select(range(5)).to_pandas()[['sentimen', 'CB']])


Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

  sentimen  CB
0  positif   0
1  positif   0
2  positif   0
3  negatif   1
4  negatif   1


In [41]:
X = poetry['train']['cleaned_text']
y = poetry['train']['CB']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X_train_vec = vectorizer.fit_transform(X_train)

X_test_vec = vectorizer.transform(X_test)


In [43]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train_vec, y_train)


In [44]:
from sklearn.metrics import classification_report, accuracy_score

# Melakukan prediksi pada data pengujian
y_pred = model.predict(X_test_vec)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi: {accuracy:.2f}")

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))


Akurasi: 0.86
              precision    recall  f1-score   support

           0       0.94      0.71      0.81        94
           1       0.82      0.97      0.89       127

    accuracy                           0.86       221
   macro avg       0.88      0.84      0.85       221
weighted avg       0.87      0.86      0.86       221



In [46]:
# Mendapatkan skor probabilitas
y_pred_proba = model.predict_proba(X_test_vec)[:, 1]  # Probabilitas kelas 'CB'

# Menampilkan teks, prediksi, dan skor kepercayaan
for i in range(len(X_test)):
    print(f"Teks: {X_test[i]}")  # Accessing directly as a list
    print(f"Prediksi: {'CB' if y_pred[i] == 1 else 'Non_CB'} dengan Kepercayaan: {y_pred_proba[i]*100:.2f}%")
    print("-" * 50)


Teks: intinya lo itu kaya bajingan
Prediksi: CB dengan Kepercayaan: 94.02%
--------------------------------------------------
Teks: babi hutan itu berlari sangat cepat
Prediksi: Non_CB dengan Kepercayaan: 36.35%
--------------------------------------------------
Teks: ketika teman makan teman maka teman itu ku panggil bangsat
Prediksi: CB dengan Kepercayaan: 67.98%
--------------------------------------------------
Teks: ah manda mah gatel ga bisa liyat cowo dikit lgsg disosor begok aje yg mau ama manda  kegatelan jadi cewe
Prediksi: CB dengan Kepercayaan: 58.90%
--------------------------------------------------
Teks: wah itu kerdus mulutnya busukjangan jangan sampah makanannya
Prediksi: CB dengan Kepercayaan: 67.96%
--------------------------------------------------
Teks: pernah ketemu sama mba artika sari disesemall di jakartaampunnn ayuneemana semampaimurah senyummmcakep pisan lah
Prediksi: CB dengan Kepercayaan: 59.42%
--------------------------------------------------
Teks: anjin

In [47]:
import pickle

# Path penyimpanan di Google Drive
save_path = 'model_logistic_regression.pkl'

# Menyimpan model menggunakan pickle
with open(save_path, 'wb') as file:
    pickle.dump(model, file)

print(f"Model berhasil disimpan di {save_path}")


Model berhasil disimpan di model_logistic_regression.pkl
