In [1]:
# Imports

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Read the saved per-model predictions (Naive Bayes, Random Forest, SVM).
# NOTE(review): the frame displayed further down has an 'Unnamed: 0' column,
# which suggests these CSVs were written with their index -- confirm whether
# index_col=0 is wanted here.

nb, rf, svm = map(
    pd.read_csv,
    (
        'outputs/predictions_nb.csv',
        'outputs/predictions_rf.csv',
        'outputs/predictions_svm.csv',
    ),
)

In [3]:
# Pairwise agreement between models: the share of rows on which two models
# output the same prediction. The value is a fraction in [0, 1], not scaled
# to 0-100.

pct_identical_nb_rf = (nb['prediction'] == rf['prediction']).mean()
pct_identical_nb_svm = (nb['prediction'] == svm['prediction']).mean()
pct_identical_rf_svm = (rf['prediction'] == svm['prediction']).mean()

for label, share in (
    ('Percentage of identical predictions between Naive Bayes and Random Forest:',
     pct_identical_nb_rf),
    ('Percentage of identical predictions between Naive Bayes and Support Vector Machine:',
     pct_identical_nb_svm),
    ('Percentage of identical predictions between Random Forest and Support Vector Machine:',
     pct_identical_rf_svm),
):
    print(label, round(share, 4))

Percentage of identical predictions between Naive Bayes and Random Forest: 0.9747
Percentage of identical predictions between Naive Bayes and Support Vector Machine: 0.9695
Percentage of identical predictions between Random Forest and Support Vector Machine: 0.9814


In [7]:
# Make predictions: per-row majority vote over the three models' predictions

# Work on a copy. Binding `ensemble = nb` would make both names refer to the
# same frame, so writing the vote below would also overwrite nb['prediction'];
# a second run of this cell would then vote on its own previous output
# instead of the Naive Bayes predictions.
ensemble = nb.copy()

votes = pd.concat([nb['prediction'], rf['prediction'], svm['prediction']], axis=1)

# mode(axis=1) returns a DataFrame with one column per tied mode; column 0 is
# the (first) most frequent value in each row. Selecting it explicitly assigns
# a Series to the column rather than a whole DataFrame.
# NOTE(review): assumes binary labels, so three voters always give a unique
# majority -- confirm if more classes are ever used.
ensemble['prediction'] = votes.mode(axis=1)[0]

ensemble.head(15)

Unnamed: 0,text,clickbait,prediction
0,czech republic minister of transport banned fr...,0,0
1,the rocky horror picture show cast reunited an...,1,1
2,apple introduces iphone and apple tv,0,0
3,mayor of camden london arrested in benefit fra...,0,0
4,tibetans demand that china release panchen lam...,0,0
5,australian treasury related agencies spend 170...,0,0
6,14 struggles every person who is the last of t...,1,1
7,32 times spongebob perfectly summed up your life,1,1
8,27 types of drunk you have definitely been as ...,1,1
9,passenger plane crashes in nepal killing 18,0,0


In [5]:
# Accuracy measures: score the ensemble prediction against the 'clickbait' label

y_true = ensemble['clickbait']
y_pred = ensemble['prediction']

accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
# Store these two results under names that differ from the imported functions.
# Rebinding `f1_score` / `confusion_matrix` to their own return values replaces
# the functions, so running this cell a second time fails with
# "object is not callable".
f1 = f1_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

print('Accuracy:', round(accuracy, 4))
print('Recall:', round(recall, 4))
print('Precision:', round(precision, 4))
print('F1 score:', round(f1, 4))
print('Confusion matrix:')
print(conf_matrix)

Accuracy: 0.9725
Recall: 0.9666
Precision: 0.9777
F1 score: 0.9721
Confusion matrix:
[[3153   70]
 [ 106 3071]]


In [6]:
# Write the ensemble frame to CSV, leaving out the DataFrame index.
# NOTE(review): this file name starts with 'prediction_' while the input files
# start with 'predictions_' -- confirm the singular form is intended.

ensemble.to_csv(path_or_buf='outputs/prediction_ensemble.csv', index=False)