In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, PrecisionRecallDisplay, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV, validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

In [None]:
# Placeholder: an empty frame with the two expected columns. The loading cell
# that follows rebinds `df` to a freshly built frame, so this object is
# discarded without ever being filled.
df = pd.DataFrame(columns=["title", "target"])

In [None]:
# Formatting text files
# Read one headline per non-blank line from each labelled file and collect
# (title, label) pairs into parallel lists, then build the frame in one go.
titles = []
targets = []
file_path_dict = {'clickbait': './clickbait_data.txt', 'non clickbait': './non_clickbait_data.txt'}
for key, value in file_path_dict.items():
    # An explicit encoding keeps decoding independent of the platform's locale
    # default. NOTE(review): assumes the data files are UTF-8 - confirm.
    with open(value, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line != "":  # skip blank separator lines
                titles.append(line)
                targets.append(key)  # the dict key doubles as the class label
data_dict = {"title": titles, "target": targets}
df = pd.DataFrame(data_dict)


In [None]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then renumber the index from 0 so it no longer reflects the file order.
data = (
    df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
display(data)

Unnamed: 0,title,target
0,UK guinea pig farm to close after owner's fami...,non clickbait
1,18 Sweet Pumpkin Treats You Won't Believe Are ...,clickbait
2,"A Guy Just Did The Most Epic ""Cha Cha Slide"" D...",clickbait
3,Premium gas discounted for a few hours,non clickbait
4,Sanctions on US products introduced by Brazil,non clickbait
...,...,...
31995,"Men, Stephen King Has A Really Important Messa...",clickbait
31996,Greek government faces censure motion by oppos...,non clickbait
31997,15 Holiday Cocktails That Are Basically Dessert,clickbait
31998,This Corgi And Baby Are Best Friends And It's ...,clickbait


In [None]:
# Features are the raw title strings; labels are the class names.
X, y = data["title"], data["target"]

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Sanity check: show the size of each partition, one output per shape.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(25600,)

(6400,)

(25600,)

(6400,)

In [None]:
def custom_tokenizer(text):
    """Normalize a title and return its lemmatized, stopword-free tokens.

    The text is lowercased, stripped of punctuation, filtered against NLTK's
    English stopword list, tokenized with ``word_tokenize`` and finally
    lemmatized token by token with ``WordNetLemmatizer``.

    Args:
        text: The raw title string.

    Returns:
        A list of lemmatized token strings (empty if nothing survives).
    """
    # Imports stay local to the function, as in the original definition.
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    import re

    # A set gives constant-time membership checks for the filter below.
    en_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    new_text = text.lower()  # lowercase
    new_text = re.sub(r"([^\w\s])", "", new_text)  # remove punctuation

    # Remove stopwords as whole words only. The earlier
    # `new_text.replace(word, "")` erased the stopword wherever it occurred as
    # a substring, so e.g. the stopword "a" deleted every letter "a" and
    # mangled the remaining words before they were tokenized.
    kept_words = [word for word in new_text.split() if word not in en_stopwords]

    tokens = word_tokenize(" ".join(kept_words))  # tokenize

    return [lemmatizer.lemmatize(token) for token in tokens]  # lemmatize

In [None]:
# Display message counts
# Split the shuffled frame into one sub-frame per label.
clickbait = data[data['target'] == 'clickbait']
non_clickbait = data[data['target'] == 'non clickbait']

# len() counts rows directly. The earlier `value_counts().sum()` tallied every
# distinct row only to add the tallies back together, and it would silently
# skip any row containing NaN.
clickbait_count = len(clickbait)
non_clickbait_count = len(non_clickbait)
total_messages = clickbait_count + non_clickbait_count

# NOTE(review): despite its name this is the share of *non-clickbait* titles.
# The name is kept unchanged in case later cells read it.
spam_fraction = non_clickbait_count / total_messages

print("Number of clickbait messages: ", clickbait_count)
print("Number of non-clickbait messages: ", non_clickbait_count)

Number of clickbait messages:  15999
Number of non-clickbait messages:  16001


In [None]:
# Precision and recall with 'clickbait' as the positive class.
precision, recall = (
    make_scorer(metric, greater_is_better=True, pos_label='clickbait')
    for metric in (precision_score, recall_score)
)

# The same two metrics with 'non clickbait' as the positive class: precision
# on that label is used as negative predictive value, recall as specificity.
npv, specificity = (
    make_scorer(metric, greater_is_better=True, pos_label='non clickbait')
    for metric in (precision_score, recall_score)
)

In [None]:
# Display name -> scoring spec: accuracy by its built-in string name, the
# rest via the scorer objects created in the previous cell.
clf_metrics = {
    'Accuracy': 'accuracy',
    'Precision': precision,
    'Recall': recall,
    'Negative Predictive Value': npv,
    'Specificity': specificity,
}

# Empty containers, populated elsewhere. NOTE(review): presumably they hold
# learning-curve, validation-curve and cross-val-score results - confirm.
lc_dict, vc_dict, cvs_dict = {}, {}, {}

In [None]:
# Distinct class labels present in each split, in order of first appearance.
train_labels, test_labels = y_train.unique(), y_test.unique()