In [30]:
# Imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [31]:
# Widen pandas' column display limit to 500 characters so that long text
# values are shown in full rather than truncated

pd.options.display.max_colwidth = 500

In [32]:
# Load data
# Reads the processed CSV from the intermediary outputs folder into a DataFrame.
# The 'text' and 'clickbait' columns are the ones used by the cells below.

data = pd.read_csv('intermediary_outputs/data_processed.csv')

In [33]:
# Train/test split: hold out 20% of the rows for testing.
# The fixed random_state keeps the split identical between runs.

texts, labels = data['text'], data['clickbait']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.20,
    random_state=188,
)

In [34]:
# Vectorise data: token counts over unigrams, bigrams and trigrams.
# The vocabulary is learned from the training split only.

count_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    strip_accents='ascii',
    min_df=0.00005,
)

count_train = count_vectorizer.fit_transform(X_train)

vocabulary = count_vectorizer.get_feature_names_out()

# Peek at both ends of the learned vocabulary and report its size
head_terms = vocabulary[:100].tolist()
tail_terms = vocabulary[-100:].tolist()
print('First 100 words in vocabulary:', head_terms)
print('Last 100 words in vocabulary:', tail_terms)
print('Number of words:', vocabulary.shape[0])

First 100 words in vocabulary: ['00s', '00s disney', '00s disney channel', '00s girls', '00s kid', '00s kids', '00s pop', '00s song', '00s teen', '00s teen movie', '00s teens', '00s that', '05', '10', '10 2008', '10 amazing', '10 billionth', '10 celebs', '10 delicious', '10 delicious recipes', '10 in', '10 insanely', '10 insanely delicious', '10 life', '10 life changing', '10 minutes', '10 most', '10 most popular', '10 of', '10 of the', '10 people', '10 questions', '10 reasons', '10 recipes', '10 recipes to', '10 ridiculously', '10 things', '10 things hate', '10 to', '10 year', '10 year old', '10 years', '10 years ago', '10 years later', '10 years old', '100', '100 actually', '100 actually said', '100 dead', '100 killed', '100 killed in', '100 need', '100 on', '100 people', '100 years', '100 years of', '1000', '10000', '100000', '100m', '100m sprint', '100th', '101', '102', '103', '105', '106', '109', '10th', '11', '11 adorable', '11 awkward', '11 beautiful', '11 best', '11 celebrity',

In [35]:
# Multinomial Naive Bayes
# Unfitted base estimator created with default settings; the grid search in
# the next cell tries different 'alpha' and 'fit_prior' values on it.

nb = MultinomialNB()

In [36]:
# Hyperparameter tuning: 5-fold cross-validated grid search over the
# smoothing strength (alpha) and whether class priors are learned (fit_prior)

tuned_parameters = [
    {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'fit_prior': [True, False],
    }
]

model = GridSearchCV(estimator=nb, param_grid=tuned_parameters, cv=5, verbose=1).fit(count_train, y_train)

# Per-candidate mean and spread of the cross-validation score
cv_results = model.cv_results_
scores = cv_results['mean_test_score']
scores_std = cv_results['std_test_score']

print('Average scores:', list(scores.round(4)))
print('Score standard deviations:', list(scores_std.round(3)))
print('Best parameters:', model.best_params_)
print('Best score:', round(model.best_score_, 4))

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Average scores: [0.9724, 0.9725, 0.9748, 0.975, 0.977, 0.977, 0.9754, 0.9754, 0.9585, 0.9585, 0.9127, 0.9131, 0.877, 0.8782]
Score standard deviations: [0.002, 0.002, 0.002, 0.002, 0.003, 0.003, 0.002, 0.002, 0.004, 0.004, 0.002, 0.002, 0.002, 0.003]
Best parameters: {'alpha': 0.1, 'fit_prior': True}
Best score: 0.977


In [37]:
# Make predictions: encode the held-out text with the vocabulary already
# fitted on the training split, then predict with the grid-searched model

count_test = count_vectorizer.transform(X_test)
y_pred = model.predict(count_test)

# Put text, true label and predicted label side by side for inspection
predictions = pd.concat([X_test, y_test], axis=1).assign(prediction=y_pred)

predictions.head(15)

Unnamed: 0,text,clickbait,prediction
25864,czech republic minister of transport banned from driving,0,0
7171,the rocky horror picture show cast reunited and it feels so good,1,1
24488,apple introduces iphone and apple tv,0,0
27776,mayor of camden london arrested in benefit fraud inquiry,0,0
25936,tibetans demand that china release panchen lama boy,0,0
27167,australian treasury related agencies spend 17000 aud on massages in 2004,0,0
15390,14 struggles every person who is the last of their friends to get married has,1,1
14108,32 times spongebob perfectly summed up your life,1,1
4234,27 types of drunk you have definitely been as told by the sims,1,1
28386,passenger plane crashes in nepal killing 18,0,0


In [38]:
# Accuracy measures
# Results are stored under names that differ from the imported metric
# functions (f1_score, confusion_matrix). Rebinding those names to their own
# results would overwrite the functions, so re-running this cell (or calling
# the metrics again later) would raise a TypeError.

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Rows are the true classes, columns are the predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)

print('Accuracy =', round(accuracy, 4))
print('Recall =', round(recall, 4))
print('Precision =', round(precision, 4))
print('F1 score =', round(f1, 4))
print('Confusion matrix')
print(conf_matrix)

Accuracy = 0.9727
Recall = 0.9805
Precision = 0.965
F1 score = 0.9727
Confusion matrix
[[3110  113]
 [  62 3115]]


In [39]:
# Save predictions to csv
# index=False leaves the DataFrame index out of the file, so only the
# 'text', 'clickbait' and 'prediction' columns are written.

predictions.to_csv('outputs/predictions_nb.csv', index=False)