In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Read The Data

In [2]:
# Relative path to the CSV that is loaded below (resolved against the kernel's working directory).
FILE_PATH = 'annotated/combined/csv/main.csv'
# Read the CSV into a DataFrame; later cells use its 'title' and 'label_score' columns.
data = pd.read_csv(FILE_PATH)
# Bare last expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,title,label,label_score
0,"Masuk Radar Pilwalkot Medan, Menantu Jokowi Be...",non-clickbait,0
1,Malaysia Sudutkan RI: Isu Kabut Asap hingga In...,non-clickbait,0
2,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...,clickbait,1
3,"Kemensos Salurkan Rp 7,3 M bagi Korban Kerusuh...",non-clickbait,0
4,"Terkait Mayat Bayi Mengenaskan di Tangerang, S...",non-clickbait,0
...,...,...,...
14995,"Tolak RUU Pertanahan, Ribuan Petani Siap Gelar...",non-clickbait,0
14996,Ada Niat Tambah Momongan Tanpa Ikut Program Ha...,clickbait,1
14997,"Beredar Isu Internet Papua Diblokir Lagi, Telk...",non-clickbait,0
14998,"TXT Akan Segera Comeback, Soobin Akui Gatal I...",clickbait,1


# Split Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# Features are the raw headline titles, targets are the 0/1 label scores.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['title'],
    data['label_score'],
    test_size=0.2,
    random_state=30,
)
print("Train: ", X_train.shape, Y_train.shape, "\nTest: ", X_test.shape, Y_test.shape)

Train:  (12000,) (12000,) 
Test:  (3000,) (3000,)


In [4]:
# Check the class distribution of the training labels.
# Boolean masks: True where the label equals the given class value.
y_train_non = Y_train == 0
y_train_bait = Y_train == 1

# Share of each class as a percentage of all training labels.
pct_train_non = (y_train_non.sum()/len(Y_train))*100
pct_train_bait = (y_train_bait.sum()/len(Y_train))*100

print("Kelas non-clickbait training: ", pct_train_non, "%")
print("Kelas clickbait training: ", pct_train_bait, "%")

Kelas non-clickbait training:  57.775 %
Kelas clickbait training:  42.225 %


In [5]:
# Check the class distribution of the test labels.
# Boolean masks: True where the label equals the given class value.
y_test_non = Y_test == 0
y_test_bait = Y_test == 1

# Share of each class as a percentage of all test labels.
pct_test_non = (y_test_non.sum()/len(Y_test))*100
pct_test_bait = (y_test_bait.sum()/len(Y_test))*100

print("Kelas non-clickbait testing: ", pct_test_non, "%")
print("Kelas clickbait testing: ", pct_test_bait, "%")

Kelas non-clickbait testing:  59.233333333333334 %
Kelas clickbait testing:  40.766666666666666 %


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)

# Fit on the training titles only, then reuse the fitted vectorizer for the
# test titles so both matrices share the same columns.
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

# Display the training matrix summary as the cell output.
tf_x_train

<12000x88966 sparse matrix of type '<class 'numpy.float64'>'
	with 217777 stored elements in Compressed Sparse Row format>

In [7]:
# Extract the fitted vocabulary from the TF-IDF vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # `.tolist()` converts the returned ndarray to plain Python strings so the
    # printed slice keeps the same list formatting as before.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Show a 100-term slice of the vocabulary and its total size.
print(vocab[39900:40000])
print("Vocabulary length:", len(vocab))

['kebingungan ganda', 'kebingungan giampaolo', 'kebo', 'kebo tidak', 'kebobolan', 'kebobolan di', 'kebobolan tiga', 'kebocaran', 'kebocaran data', 'kebocoran', 'kebocoran klep', 'kebocoran stok', 'kebohongan', 'kebohongan dan', 'kebohongan pinokio', 'kebon', 'kebon jeruk', 'kebon sirih', 'kebotakan', 'kebtke', 'kebtke gelar', 'kebudayaan', 'kebudayaan nasional', 'kebugaran', 'kebumen', 'kebumen 16', 'kebumen aparat', 'kebumen hal', 'kebumen minta', 'kebumen versi', 'kebun', 'kebun lada', 'kebun raya', 'kebun tebu', 'kebut', 'kebut pembangunan', 'kebut pengesahan', 'kebut persiapan', 'kebutaan', 'kebutaan ngaku', 'kebutuhan', 'kebutuhan hidup', 'kebutuhan keluarga', 'kebutuhan komersial', 'kebutuhan tle', 'kebutuhan warga', 'kebutuhan zat', 'kecam', 'kecam balik', 'kecam keras', 'kecam ketua', 'kecam pemuda', 'kecam rencana', 'kecam serangan', 'kecamatan', 'kecamatan di', 'kecamatan pinolosian', 'kecanduan', 'kecanduan game', 'kecanduan gawai', 'kecanduan kopi', 'kecanduan main', 'kecan

# Klasifikasi

In [8]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# XGBoost classifier with library-default hyperparameters, trained on the TF-IDF features.
xgb_model = XGBClassifier()
xgb_model.fit(tf_x_train, Y_train)

# Predict on the test features and report per-class precision/recall/F1.
xgb_pred = xgb_model.predict(tf_x_test)
print(classification_report(Y_test, xgb_pred))



              precision    recall  f1-score   support

           0       0.72      0.93      0.81      1777
           1       0.82      0.48      0.60      1223

    accuracy                           0.74      3000
   macro avg       0.77      0.70      0.71      3000
weighted avg       0.76      0.74      0.73      3000



In [9]:
from lightgbm import LGBMClassifier

# LightGBM classifier with library-default hyperparameters, trained on the TF-IDF features.
lgbm = LGBMClassifier()
lgbm.fit(tf_x_train, Y_train)

# Predict on the test features and report per-class precision/recall/F1.
# `classification_report` must already be imported by an earlier cell.
lgbm_pred = lgbm.predict(tf_x_test)
print(classification_report(Y_test, lgbm_pred))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82      1777
           1       0.79      0.58      0.67      1223

    accuracy                           0.77      3000
   macro avg       0.78      0.74      0.75      3000
weighted avg       0.77      0.77      0.76      3000



In [10]:
# Import the random forest classifier from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so an
# unseeded model gives different metrics on every run. Pin the seed (30, the
# same value used for the train/test split) so the report below is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=30)

# Fit the model on the TF-IDF training features and their labels
clf.fit(tf_x_train, Y_train)

# Predict labels for the test features
y_pred = clf.predict(tf_x_test)

# `classification_report` must already be imported by an earlier cell.
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.90      0.83      1777
           1       0.81      0.59      0.68      1223

    accuracy                           0.77      3000
   macro avg       0.78      0.74      0.75      3000
weighted avg       0.78      0.77      0.77      3000

