In [1]:
# Imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Set max column width so that more characters are printed to the console

pd.set_option('display.max_colwidth', 500)

In [3]:
# Load data

data = pd.read_csv('intermediary_outputs/data_processed.csv')

In [4]:
# Train/test split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['clickbait'], test_size=0.20, random_state=188)

In [5]:
# Vectorise

count_vectorizer = CountVectorizer(min_df=0.0005,
                                   strip_accents='ascii',
                                   ngram_range=(1, 3))

count_train = count_vectorizer.fit_transform(X_train)

vocabulary = count_vectorizer.get_feature_names_out()

print('First 100 words in vocabulary:', vocabulary.tolist()[:100])
print('Last 100 words in vocabulary:', vocabulary.tolist()[-100:])
print('Number of words:', len(vocabulary))

First 100 words in vocabulary: ['00s', '10', '10 years', '100', '100 years', '11', '11 things', '12', '13', '13 things', '14', '15', '15 things', '15 times', '16', '16 things', '16 times', '17', '17 people', '17 people who', '17 pictures', '17 reasons', '17 things', '17 things you', '17 times', '18', '18 things', '18 things you', '18 times', '19', '19 of', '19 pictures', '19 pictures that', '19 reasons', '19 things', '19 things that', '19 things you', '19 times', '19 tweets', '1989', '20', '200', '2000s', '2004', '2005', '2006', '2007', '200708', '2008', '2009', '2010', '2011', '2012', '2015', '2016', '21', '21 of', '21 of the', '21 pictures', '21 pictures that', '21 reasons', '21 things', '21 things you', '21 times', '21 tweets', '22', '22 pictures', '22 things', '22 times', '23', '23 pictures', '23 things', '23 things you', '23 times', '24', '24 pictures', '24 pictures that', '24 things', '24 times', '25', '25 things', '25 times', '26', '27', '27 times', '28', '29', '30', '31', '32',

In [6]:
# Pipeline - feature scaling and SVC model

pipeline = Pipeline([('scaler', StandardScaler(with_mean=False)),
                     ('SVM', SVC())])

In [7]:
# Hyperparameter tuning

tuned_parameters = [{'SVM__C': [1],
                     'SVM__kernel': ['sigmoid'],
                     'SVM__degree': [2],
                     'SVM__gamma': ['scale']}]

model = GridSearchCV(pipeline, tuned_parameters, cv=3, verbose=1)
model.fit(count_train, y_train)

scores = model.cv_results_['mean_test_score']
scores_std = model.cv_results_['std_test_score']

print('Average scores:', list(scores.round(4)))
print('Score standard deviations:', list(scores_std.round(3)))
print('Best parameters:', model.best_params_)
print('Best score:', round(model.best_score_, 4))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Average scores: [0.9673]
Score standard deviations: [0.003]
Best parameters: {'SVM__C': 1, 'SVM__degree': 2, 'SVM__gamma': 'scale', 'SVM__kernel': 'sigmoid'}
Best score: 0.9673


In [8]:
# Make predictions

count_test = count_vectorizer.transform(X_test)
y_pred = model.predict(count_test)

predictions = pd.DataFrame(pd.concat([X_test, y_test], axis=1), columns=['text', 'clickbait'])
predictions['prediction'] = y_pred

predictions.head(15)

Unnamed: 0,text,clickbait,prediction
25864,czech republic minister of transport banned from driving,0,0
7171,the rocky horror picture show cast reunited and it feels so good,1,1
24488,apple introduces iphone and apple tv,0,0
27776,mayor of camden london arrested in benefit fraud inquiry,0,0
25936,tibetans demand that china release panchen lama boy,0,0
27167,australian treasury related agencies spend 17000 aud on massages in 2004,0,0
15390,14 struggles every person who is the last of their friends to get married has,1,1
14108,32 times spongebob perfectly summed up your life,1,1
4234,27 types of drunk you have definitely been as told by the sims,1,1
28386,passenger plane crashes in nepal killing 18,0,0


In [9]:
# Accuracy measures

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

print('Accuracy =', round(accuracy, 4))
print('Recall =', round(recall, 4))
print('Precision =', round(precision, 4))
print('F1 score =', round(f1_score, 4))
print('Confusion matrix')
print(confusion_matrix)

Accuracy = 0.9681
Recall = 0.9547
Precision = 0.9806
F1 score = 0.9675
Confusion matrix
[[3163   60]
 [ 144 3033]]


In [10]:
# Save predictions to csv

predictions.to_csv('outputs/predictions_svm.csv', index=False)