In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict, KFold, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer, precision_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from numpy import mean

In [2]:
#Input Dataset
dataset = pd.read_csv('hasilpreproces_labeled.csv', delimiter = ';', quoting = 3)
corpus = dataset['text'].tolist()
# corpus = ['suka tidak bisa sesuai tingkat bawa rasa',
#           'tidak usah alas guna tidak tahu sopan santun',
#           'suka sama sini tampar',
#          ]

In [3]:
dataset

Unnamed: 0,text,Label
0,desain rumah cinta darkjokes,1
1,sayang marah ayo natal lucu istighfar takut la...,-1
2,bahas politik perintah kelas epic bahas darkjo...,1
3,dirs canda sabar,1
4,kids better watch out santa likes witnesses,1
...,...,...
7687,nasehatin akun apa satir akun,0
7688,asa satir,0
7689,satir buat lemah paham cinta,1
7690,gus nur kritis said aqil yaqut penjara bln alm...,0


In [4]:
#TF IDF
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [18]:
model = MultinomialNB()
fold = StratifiedKFold(10,shuffle=False)
for train_index, test_index in fold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted',zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted',zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted',zero_division=0)
    print(cm)
    print('akurasi = ',accuracy)
    print('precision = ',precision)
    print('recall = ',recall)
    print('f1 = ',f1)
    print('========================')

[[154   0 118]
 [ 15   1 112]
 [ 26   0 344]]
akurasi =  0.6480519480519481
precision =  0.7331849567389637
recall =  0.6480519480519481
f1 =  0.5857628260106696
[[162   0 110]
 [ 25   5  98]
 [ 28   0 342]]
akurasi =  0.6610389610389611
precision =  0.7311968369896488
recall =  0.6610389610389611
f1 =  0.6047688341537641
[[116   0 156]
 [ 20   1 106]
 [ 20   0 350]]
akurasi =  0.6072821846553966
precision =  0.7033260304888852
recall =  0.6072821846553966
f1 =  0.5372835910423945
[[166   0 105]
 [ 18   3 107]
 [ 39   0 331]]
akurasi =  0.6501950585175552
precision =  0.7220731076576729
recall =  0.6501950585175552
f1 =  0.593332307386487
[[148   1 122]
 [  6   9 113]
 [ 23   0 347]]
akurasi =  0.6553966189856957
precision =  0.7313397199310063
recall =  0.6553966189856957
f1 =  0.6053005388280714
[[155   0 116]
 [ 12   7 109]
 [ 22   0 348]]
akurasi =  0.6631989596879063
precision =  0.7476732368445955
recall =  0.6631989596879063
f1 =  0.609870487559213
[[160   0 111]
 [ 13   3 112]


In [19]:
model_boost =  AdaBoostClassifier(base_estimator = MultinomialNB())
fold = StratifiedKFold(10,shuffle=False)
for train_index, test_index in fold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model_boost.fit(X_train, y_train)
    y_pred = model_boost.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted',zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted',zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted',zero_division=0)
    print(cm)
    print('akurasi = ',accuracy)
    print('precision = ',precision)
    print('recall = ',recall)
    print('f1 = ',f1)
    print('========================')

[[  0   0 272]
 [  0   0 128]
 [  0   0 370]]
akurasi =  0.4805194805194805
precision =  0.2308989711587114
recall =  0.4805194805194805
f1 =  0.3119161540214172
[[  0   0 272]
 [  0   0 128]
 [  0   0 370]]
akurasi =  0.4805194805194805
precision =  0.2308989711587114
recall =  0.4805194805194805
f1 =  0.3119161540214172
[[  0   0 272]
 [  0   0 127]
 [  0   0 370]]
akurasi =  0.4811443433029909
precision =  0.23149987909246636
recall =  0.4811443433029909
f1 =  0.3125959736999238
[[  0   0 271]
 [  0   1 127]
 [  0   0 370]]
akurasi =  0.48244473342002603
precision =  0.3982512462071955
recall =  0.48244473342002603
f1 =  0.31545128175174125
[[  0   0 271]
 [  0   0 128]
 [  0   0 370]]
akurasi =  0.4811443433029909
precision =  0.23149987909246636
recall =  0.4811443433029909
f1 =  0.3125959736999238
[[  0   0 271]
 [  0   0 128]
 [  0   0 370]]
akurasi =  0.4811443433029909
precision =  0.23149987909246636
recall =  0.4811443433029909
f1 =  0.3125959736999238
[[  0   0 271]
 [  0  

In [7]:
model_boost =  AdaBoostClassifier()
fold = StratifiedKFold(10,shuffle=False)
for train_index, test_index in fold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model_boost.fit(X_train, y_train)
    y_pred = model_boost.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted',zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted',zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted',zero_division=0)
    print(cm)
    print('akurasi = ',accuracy)
    print('precision = ',precision)
    print('recall = ',recall)
    print('f1 = ',f1)
    print('========================')

[[209  24  39]
 [  4 120   4]
 [137  51 182]]
akurasi =  0.6636363636363637
precision =  0.7019233464947751
recall =  0.6636363636363637
f1 =  0.6548731131093032
[[100  10 162]
 [  3 114  11]
 [ 11  21 338]]
akurasi =  0.7168831168831169
precision =  0.75839842353216
recall =  0.7168831168831169
f1 =  0.6905693444034321
[[236  11  25]
 [  2 111  14]
 [170  27 173]]
akurasi =  0.6762028608582574
precision =  0.7202575212646525
recall =  0.6762028608582574
f1 =  0.6643924702885241
[[ 97  15 159]
 [  2 117   9]
 [ 17  23 330]]
akurasi =  0.7074122236671001
precision =  0.7391575379659209
recall =  0.7074122236671001
f1 =  0.6801352282955904
[[ 77  12 182]
 [  0 122   6]
 [ 16  27 327]]
akurasi =  0.6840052015604682
precision =  0.7234098462169685
recall =  0.6840052015604682
f1 =  0.6451843681660008
[[222  10  39]
 [  2 125   1]
 [145  23 202]]
akurasi =  0.7139141742522757
precision =  0.7453178782807879
recall =  0.7139141742522757
f1 =  0.7075978256192247
[[229  11  31]
 [  1 127   0]


In [34]:
classifierBoost = AdaBoostClassifier(base_estimator = MultinomialNB())
results_boost = cross_validate(estimator= classifierBoost,
                                          X = x,
                                          y = y,
                                          cv=10,
                                          scoring=scoring)
results_boost

{'fit_time': array([16.52540612, 16.54448032, 16.43428326, 16.66120028, 16.51664162,
        16.57115126, 16.24582267, 16.86314607, 18.53725052, 16.92975783]),
 'score_time': array([0.85302711, 0.87496328, 0.87145543, 0.87206912, 0.89068151,
        0.87408352, 0.88496447, 0.9108429 , 1.01925778, 0.90326476]),
 'test_accuracy': array([0.48051948, 0.48051948, 0.48114434, 0.48244473, 0.48114434,
        0.48114434, 0.48244473, 0.48244473, 0.47984395, 0.47984395]),
 'test_precision': array([0.48051948, 0.48051948, 0.48114434, 0.48244473, 0.48114434,
        0.48114434, 0.48244473, 0.48244473, 0.47984395, 0.47984395]),
 'test_recall': array([0.48051948, 0.48051948, 0.48114434, 0.48244473, 0.48114434,
        0.48114434, 0.48244473, 0.48244473, 0.47984395, 0.47984395]),
 'test_f1_score': array([0.48051948, 0.48051948, 0.48114434, 0.48244473, 0.48114434,
        0.48114434, 0.48244473, 0.48244473, 0.47984395, 0.47984395])}

In [6]:
#Cross Val NBC
classifierNB = MultinomialNB()
cv_nb = cross_val_score(classifierNB, x, y, cv = 10)
print(cv_nb)
print("Rata - Rata Akurasi : ",mean(cv_nb))

[0.64805195 0.66103896 0.60728218 0.65019506 0.65539662 0.66319896
 0.66840052 0.60598179 0.57737321 0.62288687]
Rata - Rata Akurasi :  0.6359806123655278


In [19]:
#Cross Val NBC
classifierNB = MultinomialNB()
cv_nb_f1 = cross_val_score(classifierNB, x, y, cv = 10,scoring=recall_score(average='micro'))
print(cv_nb_f1)
print("Rata - Rata f1 : ",mean(cv_nb_f1))


TypeError: recall_score() missing 2 required positional arguments: 'y_true' and 'y_pred'

In [8]:
#Cross Val Adaboost NBC
classifierBoost = AdaBoostClassifier(base_estimator = classifierNB)
cv_ada = cross_val_score(classifierBoost, x, y, cv = 10)
print(cv_ada)
print("Rata - Rata Akurasi : ",mean(cv_ada))

[0.48051948 0.48051948 0.48114434 0.48244473 0.48114434 0.48114434
 0.48244473 0.48244473 0.47984395 0.47984395]
Rata - Rata Akurasi :  0.48114940975799236


In [41]:
#Split
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)

In [51]:
#Test Klasifikasi NBC
classifierNB.fit(x_train, y_train)
preds = classifierNB.predict(x_test)

#Klasifikasi Data NBC
classifierNB.fit(x_train, y_train)
data_pred = classifierNB.predict(x)

In [52]:
#Confusion Matrix Test NBC
cm_nb = confusion_matrix(y_test, preds)
print('Confusion Matrix Data Test Naive Bayes :\n\n', cm_nb,'\n')

print('========================================')
print('========================================\n')

#Confusion Matrix Data NBC
cm_Dnb = confusion_matrix(y, data_pred)
print('Confusion Matrix Data Real Naive Bayes :\n\n', cm_Dnb)

Confusion Matrix Data Test Naive Bayes :

 [[148   0  97]
 [ 12   6 101]
 [ 23   1 382]] 


Confusion Matrix Data Real Naive Bayes :

 [[2204    2  509]
 [  98  398  783]
 [  61    1 3636]]


In [30]:
#Nilai Test Confusion Matrix NBC
score_nb = classification_report(y_test, preds, zero_division=0)
print('Score Confusion Matrix Data Test Naive Bayes :\n\n', score_nb)

print('========================================================')
print('========================================================\n')

#Nilai Data Confusion Matrix NBC
score_Dnb = classification_report(y, data_pred, zero_division=0)
print('Score Confusion Matrix Data Real Naive Bayes :\n\n', score_Dnb)

Score Confusion Matrix Data Test Naive Bayes :

               precision    recall  f1-score   support

          -1       0.79      0.53      0.64       270
           0       1.00      0.04      0.08       117
           1       0.62      0.95      0.75       383

    accuracy                           0.67       770
   macro avg       0.81      0.51      0.49       770
weighted avg       0.74      0.67      0.61       770


Score Confusion Matrix Data Real Naive Bayes :

               precision    recall  f1-score   support

          -1       0.94      0.80      0.86      2715
           0       1.00      0.32      0.49      1279
           1       0.73      0.98      0.84      3698

    accuracy                           0.81      7692
   macro avg       0.89      0.70      0.73      7692
weighted avg       0.85      0.81      0.79      7692



In [31]:
#Klasifikasi Test Adaboost NBC
classifierBoost.fit(x_train, y_train)
pred_boost = classifierBoost.predict(x_test)

#Klasifikasi Data Adaboost NBC
classifierBoost.fit(x_train, y_train)
Dpred_boost = classifierBoost.predict(x)

In [32]:
#Confusion Matrix Test Adaboost NBC
cm_ad = confusion_matrix(y_test, pred_boost)
print('Confusion Matrix Data Test AdaBoost Naive Bayes :\n\n', cm_ad,'\n')

print('========================================')
print('========================================\n')

#Confusion Matrix Test Adaboost NBC
cm_Dad = confusion_matrix(y, Dpred_boost)
print('Confusion Matrix Data Real Adaboost Naive Bayes :\n\n', cm_Dad)

Confusion Matrix Data Test AdaBoost Naive Bayes :

 [[  0   0 270]
 [  0   1 116]
 [  0   0 383]] 


Confusion Matrix Data Real Adaboost Naive Bayes :

 [[   0    0 2715]
 [   0    4 1275]
 [   0    0 3698]]


In [33]:
#Nilai Test Confusion Matrix Adaboost NBC
score_ad = classification_report(y_test,pred_boost, zero_division=0)
print('Score Confusion Matrix Data Test Adaboost Naive Bayes :\n\n', score_ad)

print('========================================================')
print('========================================================\n')

#Nilai Data Confusion Matrix Adaboost NBC
score_Dad =classification_report(y,Dpred_boost, zero_division=0)
print('Score Confusion Matrix Data Real Adaboost Naive Bayes :\n\n', score_Dad)

Score Confusion Matrix Data Test Adaboost Naive Bayes :

               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       270
           0       1.00      0.01      0.02       117
           1       0.50      1.00      0.66       383

    accuracy                           0.50       770
   macro avg       0.50      0.34      0.23       770
weighted avg       0.40      0.50      0.33       770


Score Confusion Matrix Data Real Adaboost Naive Bayes :

               precision    recall  f1-score   support

          -1       0.00      0.00      0.00      2715
           0       1.00      0.00      0.01      1279
           1       0.48      1.00      0.65      3698

    accuracy                           0.48      7692
   macro avg       0.49      0.33      0.22      7692
weighted avg       0.40      0.48      0.31      7692



In [34]:
from sklearn.tree import DecisionTreeClassifier
classifierDT = DecisionTreeClassifier()
classifierDT.fit(x_train, y_train)
pred_DT = classifierDT.predict(x)
cm_DT = confusion_matrix(y, pred_DT)
print('Confusion Matrix Data Real DT :\n\n', cm_DT)

Confusion Matrix Data Real DT :

 [[2623   15   77]
 [  10 1258   11]
 [  71   38 3589]]


In [35]:
score_DT =classification_report(y,pred_DT, zero_division=0)
print('Score Confusion Matrix Data Real DT :\n\n', score_DT)

Score Confusion Matrix Data Real DT :

               precision    recall  f1-score   support

          -1       0.97      0.97      0.97      2715
           0       0.96      0.98      0.97      1279
           1       0.98      0.97      0.97      3698

    accuracy                           0.97      7692
   macro avg       0.97      0.97      0.97      7692
weighted avg       0.97      0.97      0.97      7692



In [36]:
classifier_DTBoost = AdaBoostClassifier()
classifier_DTBoost.fit(x_train, y_train)
pred_DTboost = classifier_DTBoost.predict(x)
cm_DTBoost = confusion_matrix(y, pred_DTboost)
print('Confusion Matrix Data Real DT Boost :\n\n', cm_DTBoost)

Confusion Matrix Data Real DT Boost :

 [[2110  285  320]
 [  13 1214   52]
 [1126  525 2047]]


In [37]:
score_DTboost = classification_report(y,pred_DTboost, zero_division=0)
print('Score Confusion Matrix Data Real DT :\n\n', score_DTboost)

Score Confusion Matrix Data Real DT :

               precision    recall  f1-score   support

          -1       0.65      0.78      0.71      2715
           0       0.60      0.95      0.74      1279
           1       0.85      0.55      0.67      3698

    accuracy                           0.70      7692
   macro avg       0.70      0.76      0.70      7692
weighted avg       0.74      0.70      0.69      7692

