In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed for the row shuffle below. Without it the row order (and therefore the
# later train/test split, even though that split is itself seeded) changes on
# every run.
SHUFFLE_SEED = 42


def load_headlines(path):
    """Return the lines of the UTF-8 text file at `path`, whitespace-stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


# Vocabulary source: the keys of the JSON dictionary are used as the word list.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('words_dictionary.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
words_array = list(data.keys())

clickbait_lines = load_headlines('clickbait_data')
clickbait_df = pd.DataFrame({
    'Content': clickbait_lines,
    'Clickbait': ['clickbait'] * len(clickbait_lines)
})

nonclickbait_lines = load_headlines('non_clickbait_data')
nonclickbait_df = pd.DataFrame({
    'Content': nonclickbait_lines,
    'Clickbait': ['non-clickbait'] * len(nonclickbait_lines)
})

df = pd.concat([clickbait_df, nonclickbait_df], ignore_index=True)

# Drop missing / blank headlines without mutating in place, so re-running the
# cell always starts from the freshly concatenated frame.
df = (
    df.dropna(subset=['Content'])
      .loc[lambda frame: frame['Content'].str.strip() != '']
)

# Persist the labelled data after cleaning (string labels kept) so the CSV
# contains the same rows the models are trained on.
df.to_csv('labeled_data.csv', index=False)

# Encode labels as 1 (clickbait) / 0 (non-clickbait) and shuffle reproducibly.
# `.assign` builds a new frame, avoiding a SettingWithCopyWarning from writing
# a column onto a filtered slice.
df = (
    df.assign(Clickbait=df['Clickbait'].map({'clickbait': 1, 'non-clickbait': 0}))
      .sample(frac=1, random_state=SHUFFLE_SEED)
      .reset_index(drop=True)
)
print(df.shape)
print(df.head())

X = df['Content']
y = df['Clickbait']

(32000, 2)
                                             Content  Clickbait
0    A Familiar Path in Months Before Fatal Shooting          0
1  This Word Search Test Will Reveal A Deep Truth...          1
2  Judge Approves $33.3 Billion Federal Loan to G.M.          0
3  In St. Louis, Glorious Stadium Honors Glorious...          0
4  23 Pictures That Prove There Is Still Hope For...          1


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features restricted to the fixed word list in `words_array`;
# accents are stripped so accented and unaccented spellings share a column.
# NOTE(review): the vectorizer is fitted on every headline before the
# train/test split performed in the next cell, so the IDF weights also see the
# held-out headlines. Consider fitting on the training portion only.
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    vocabulary=words_array,
)
X_vectorized = vectorizer.fit_transform(X)


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

param_grid = {'n_neighbors': [3, 5, 7, 10]}

# 5-fold CV over the grid. With the default refit=True the best configuration
# is retrained on all of X_train and exposed as `best_estimator_`.
grid_search_knn = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Hold out 25% of the vectorised headlines for the final test score.
# X_train / X_test / y_train / y_test are reused by the later model cells.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.25, random_state=42)

grid_search_knn.fit(X_train, y_train)

best_knn = grid_search_knn.best_estimator_
best_params = grid_search_knn.best_params_

# The grid search already cross-validated the winning configuration on the
# same 5 folds, so read its mean/std from cv_results_ instead of running a
# second, identical (and for KNN expensive) cross_val_score pass.
best_index = grid_search_knn.best_index_
cv_accuracy_mean = grid_search_knn.cv_results_['mean_test_score'][best_index]
cv_accuracy_std = grid_search_knn.cv_results_['std_test_score'][best_index]

# best_knn is already fitted on X_train (refit=True), so no extra fit is needed.
y_pred = best_knn.predict(X_test)

print("Best parameters:", best_params)
print("KNN CV Train Accuracy Mean:", cv_accuracy_mean)
print("KNN CV Train Accuracy Std:", cv_accuracy_std)
print("KNN Test Accuracy:", accuracy_score(y_test, y_pred))



Best parameters: {'n_neighbors': 10}
KNN CV Train Accuracy Mean: 0.9413333333333332
KNN CV Train Accuracy Std: 0.0028155076510876964
KNN Test Accuracy: 0.936625


In [4]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

# Candidate smoothing strengths for the multinomial model.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}

# 5-fold CV over the grid. With the default refit=True the best configuration
# is retrained on all of X_train and exposed as `best_estimator_`.
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_nb = grid_search.best_estimator_
best_params = grid_search.best_params_

# Reuse the per-fold statistics the grid search already computed for the
# winning alpha rather than repeating the same 5-fold cross-validation.
best_index = grid_search.best_index_
cv_accuracy_mean = grid_search.cv_results_['mean_test_score'][best_index]
cv_accuracy_std = grid_search.cv_results_['std_test_score'][best_index]

# best_nb is already fitted on X_train (refit=True), so no extra fit is needed.
y_pred_train_nb = best_nb.predict(X_train)
y_pred_test_nb = best_nb.predict(X_test)

print("Best parameters:", best_params)
print("Naive Bayes Train Accuracy Mean:", cv_accuracy_mean)
print("Naive Bayes Train Accuracy Std:", cv_accuracy_std)
print("Naive Bayes Test Accuracy:", accuracy_score(y_test, y_pred_test_nb))


Best parameters: {'alpha': 0.1}
Naive Bayes Train Accuracy Mean: 0.9651666666666665
Naive Bayes Train Accuracy Std: 0.001326806944007567
Naive Bayes Test Accuracy: 0.965375


In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Two-hidden-layer shapes to compare. Kept as lists so the printed
# `hidden_layer_sizes` looks like "[2, 3]".
hidden_layer_options = [
    [2, 3], [3, 2], [2, 2], [3, 3], [4, 3],
    [3, 4], [4, 4], [4, 5], [5, 4], [5, 5],
]

# One classifier per shape; every other hyperparameter is shared.
# random_state pins weight initialisation, the early-stopping validation split
# and batch sampling, so the comparison is repeatable. The accuracy gaps
# between shapes are small enough that unseeded runs can reorder them.
classifiers = [
    MLPClassifier(
        max_iter=200,
        hidden_layer_sizes=layers,
        activation='tanh',
        early_stopping=True,
        random_state=42,
    )
    for layers in hidden_layer_options
]

# Perform 5-fold cross-validation on the training split and report each shape.
for idx, clf in enumerate(classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    print(f"Classifier {idx+1}: Hidden layers {clf.hidden_layer_sizes}, Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Classifier 1: Hidden layers [2, 3], Accuracy: 0.9686 (+/- 0.0028)
Classifier 2: Hidden layers [3, 2], Accuracy: 0.9714 (+/- 0.0012)
Classifier 3: Hidden layers [2, 2], Accuracy: 0.9706 (+/- 0.0017)
Classifier 4: Hidden layers [3, 3], Accuracy: 0.9711 (+/- 0.0018)
Classifier 5: Hidden layers [4, 3], Accuracy: 0.9705 (+/- 0.0011)
Classifier 6: Hidden layers [3, 4], Accuracy: 0.9708 (+/- 0.0010)
Classifier 7: Hidden layers [4, 4], Accuracy: 0.9705 (+/- 0.0014)
Classifier 8: Hidden layers [4, 5], Accuracy: 0.9696 (+/- 0.0012)
Classifier 9: Hidden layers [5, 4], Accuracy: 0.9710 (+/- 0.0011)
Classifier 10: Hidden layers [5, 5], Accuracy: 0.9716 (+/- 0.0009)


Classifier 1: Hidden layers [2, 3], Accuracy: 0.9686 (+/- 0.0028)
Classifier 2: Hidden layers [3, 2], Accuracy: 0.9714 (+/- 0.0012)
Classifier 3: Hidden layers [2, 2], Accuracy: 0.9706 (+/- 0.0017)
Classifier 4: Hidden layers [3, 3], Accuracy: 0.9711 (+/- 0.0018)
Classifier 5: Hidden layers [4, 3], Accuracy: 0.9705 (+/- 0.0011)
Classifier 6: Hidden layers [3, 4], Accuracy: 0.9708 (+/- 0.0010)
Classifier 7: Hidden layers [4, 4], Accuracy: 0.9705 (+/- 0.0014)
Classifier 8: Hidden layers [4, 5], Accuracy: 0.9696 (+/- 0.0012)
Classifier 9: Hidden layers [5, 4], Accuracy: 0.9710 (+/- 0.0011)
Classifier 10: Hidden layers [5, 5], Accuracy: 0.9716 (+/- 0.0009)


In [8]:
# Retrain the [5, 5] architecture on the full training split and score it on
# the held-out test set. random_state makes the reported test accuracy
# repeatable; unseeded, each run trains a different network and prints a
# different number.
bestMLP = MLPClassifier(
    max_iter=200,
    hidden_layer_sizes=[5, 5],
    activation='tanh',
    early_stopping=True,
    random_state=42,
)
bestMLP.fit(X_train, y_train)

y_pred_test_MLP = bestMLP.predict(X_test)

print("Classifier 10 Test Accuracy:", accuracy_score(y_test, y_pred_test_MLP))

Classifier 10 Test Accuracy: 0.96975


- The data representation I used was TF-IDF.
- The metric I selected to rank the models was accuracy.

#### Ranking based on accuracy, 1 being the best, 3 being the worst
1) MLP classifier
2) Multinomial Naive Bayes classifier
3) KNeighbors classifier

#### KNeighbors Metrics and Optimal Hyperparameters:
- Best parameters: {'n_neighbors': 10}
- KNN CV Train Accuracy Mean: 0.9413333333333332
- KNN CV Train Accuracy Std: 0.0028155076510876964
- KNN Test Accuracy: 0.936625

#### Multinomial Naive Bayes Metrics and Optimal Hyperparameters:
- Best parameters: {'alpha': 0.1}
- Naive Bayes Train Accuracy Mean: 0.9651666666666665
- Naive Bayes Train Accuracy Std: 0.001326806944007567
- Naive Bayes Test Accuracy: 0.965375

#### MLP Metrics and Optimal Hyperparameters:
- Best parameters: Hidden layers [5, 5] (Classifier 10)
- MLP Train Accuracy Mean: 0.9716
- MLP Train Accuracy Std: 0.0009
- MLP Test Accuracy: 0.96975

#### How can the classifier be used as a web browser plugin?
The classifier can be paired with an automatic scraper that reads the headlines on the current search-results page. The scraper can collect every headline by selecting a specific HTML tag, such as one whose class name the search engine uses for result headlines. Once the headlines are collected, the trained classifier predicts which of them are clickbait. The plugin can then use these predictions to highlight every clickbait headline in red to warn the user.