In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
! pip install accelerate -U
! pip install tokenizers
! pip install transformers datasets evaluate

Collecting accelerate
  Downloading accelerate-1.0.0-py3-none-any.whl.metadata (19 kB)
Collecting torch>=1.10.0 (from accelerate)
  Downloading torch-2.4.1-cp310-cp310-win_amd64.whl.metadata (27 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.3 (from accelerate)
  Downloading safetensors-0.4.5-cp310-none-win_amd64.whl.metadata (3.9 kB)
Collecting filelock (from huggingface-hub>=0.21.0->accelerate)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.21.0->accelerate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingface-hub>=0.21.0->accelerate)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch>=1.10.0->accelerate)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Downloading accelerat

In [5]:
import pandas as pd

poetry = pd.read_csv('merged_data.tsv', sep='\t', encoding='ISO-8859-1').drop(columns="no")

print(poetry)

                                                   text   label
0                       Klo fadli zon goblok, boleeeh?       CB
1     @detikcom Jangan nyinyirin pak fadli zon, dia ...      CB
2     @ahmadwaluy @bravo1282 @fadlizon Kritik siapa ...      CB
3      @fadlizon @jokowi Hati penuh dendam,  pikiran...      CB
4     @fadlizon @jokowi Kerja anda apa zon bisanya c...      CB
...                                                 ...     ...
1398                     babi itu sangat kotor dan bau.      CB
1399      anjing, hebat banget cara lu nyelesain tugas.      CB
1400           anjing itu sangat loyal pada pemiliknya.  Non_CB
1401                     monyet, lu selalu bikin ribet!      CB
1402   gue liat monyet bermain di hutan, sangat lincah.  Non_CB

[1403 rows x 2 columns]


In [10]:
import re
import string
import pandas as pd

def clean_text(text):
    # Menghapus URL
    cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Menghapus mention (@username) dan hashtag (#hashtag)
    cleaned_text = re.sub(r'\@\w+|\#', '', cleaned_text)
    # Menghapus angka
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    # Menghapus tanda baca
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))
    # Mengubah ke huruf kecil
    cleaned_text = cleaned_text.lower()
    return cleaned_text

poetry['cleaned_text'] = poetry['text'].apply(clean_text)

print(poetry[['text', 'cleaned_text']].head())


                                                text  \
0                    Klo fadli zon goblok, boleeeh?    
1  @detikcom Jangan nyinyirin pak fadli zon, dia ...   
2  @ahmadwaluy @bravo1282 @fadlizon Kritik siapa ...   
3   @fadlizon @jokowi Hati penuh dendam,  pikiran...   
4  @fadlizon @jokowi Kerja anda apa zon bisanya c...   

                                        cleaned_text  
0                      klo fadli zon goblok boleeeh   
1   jangan nyinyirin pak fadli zon dia gitu punya...  
2     kritik siapa bisa di penjara fadli zon krit...  
3     hati penuh dendam  pikiran selalu negatif m...  
4    kerja anda apa zon bisanya cuma nyinyi r aja...  


In [12]:
import pandas as pd

# Assuming your DataFrame is named `df`
poetry['label_numeric'] = poetry['label'].map({'CB': 1, 'Non_CB': 0})

# Display the first 5 rows to check the new column
print(poetry[['label', 'label_numeric']].head())

  label  label_numeric
0    CB              1
1    CB              1
2    CB              1
3    CB              1
4    CB              1


In [13]:
poetry.head()

Unnamed: 0,text,label,cleaned_text,label_numeric
0,"Klo fadli zon goblok, boleeeh?",CB,klo fadli zon goblok boleeeh,1
1,"@detikcom Jangan nyinyirin pak fadli zon, dia ...",CB,jangan nyinyirin pak fadli zon dia gitu punya...,1
2,@ahmadwaluy @bravo1282 @fadlizon Kritik siapa ...,CB,kritik siapa bisa di penjara fadli zon krit...,1
3,"@fadlizon @jokowi Hati penuh dendam, pikiran...",CB,hati penuh dendam pikiran selalu negatif m...,1
4,@fadlizon @jokowi Kerja anda apa zon bisanya c...,CB,kerja anda apa zon bisanya cuma nyinyi r aja...,1


In [38]:
from sklearn.model_selection import train_test_split

X = poetry['cleaned_text']
y = poetry['label_numeric']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X_train_vec = vectorizer.fit_transform(X_train)

X_test_vec = vectorizer.transform(X_test)


In [41]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train_vec, y_train)


In [42]:
from sklearn.metrics import classification_report, accuracy_score

# Melakukan prediksi pada data pengujian
y_pred = model.predict(X_test_vec)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi: {accuracy:.2f}")

# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))


Akurasi: 0.83
              precision    recall  f1-score   support

           0       0.85      0.71      0.77       118
           1       0.81      0.91      0.86       163

    accuracy                           0.83       281
   macro avg       0.83      0.81      0.82       281
weighted avg       0.83      0.83      0.82       281



In [43]:
for i in range(len(X_test)):
    print(f"Teks: {X_test.iloc[i]}") 
    pred_label = 'CB' if y_pred[i] == 1 else 'Non_CB'
    confidence = y_pred_proba[i] if y_pred[i] == 1 else 1 - y_pred_proba[i]
    print(f"Prediksi: {pred_label} dengan Kepercayaan: {confidence*100:.2f}%")
    print("-" * 50)



Teks: biasalah kalo pelakor banyak yang lbh ancur mukanya dari istri sah barang bagus pasti cepat laku karena pelakor jelek nggak laku makanya ngembat laki orang ya udah sama gatel kok cocok wes  username
Prediksi: CB dengan Kepercayaan: 23.97%
--------------------------------------------------
Teks: sumpah lupa kemarin ngapain aja anjing
Prediksi: CB dengan Kepercayaan: 71.79%
--------------------------------------------------
Teks: cantik banget yak punya anak cewe begini puyeng kali yak jagain nya 
Prediksi: CB dengan Kepercayaan: 62.55%
--------------------------------------------------
Teks: jujur aja ya ni org malah bagusan kaga oplasada tu foto dia sblm dn sesudah oplasdioplas mukanya anehkyknya gagal oplasnya
Prediksi: CB dengan Kepercayaan: 7.02%
--------------------------------------------------
Teks: pagi jay bangsat jangan lupa sarapan nyet
Prediksi: CB dengan Kepercayaan: 91.29%
--------------------------------------------------
Teks: gue liat monyet kecil main bola
Predik

In [44]:
import pickle

save_path = 'model_logistic_regression.pkl'

with open(save_path, 'wb') as file:
    pickle.dump(model, file)

print(f"Model berhasil disimpan di {save_path}")


Model berhasil disimpan di model_logistic_regression.pkl
