In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, PrecisionRecallDisplay, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV, validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

In [2]:
# Empty placeholder frame with the two expected columns; `df` is reassigned
# from the headline files when the data-loading cell runs.
df = pd.DataFrame(columns=["title", "target"])

In [3]:
# Load the headline text files into one labelled DataFrame.
# Every non-blank line of a file is treated as one headline, and the dict key
# of that file becomes the headline's class label.
titles = []
targets = []
file_path_dict = {'clickbait': './clickbait_data.txt', 'non clickbait': './non_clickbait_data.txt'}
for label, path in file_path_dict.items():
    # Pin the encoding: without it open() uses the platform's locale encoding,
    # so the same file could decode differently (or fail) on another machine.
    # NOTE(review): assumes the data files are UTF-8 — confirm.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:  # skip blank lines
                titles.append(line)
                targets.append(label)
data_dict = {"title": titles, "target": targets}
df = pd.DataFrame(data_dict)


In [4]:
# Shuffle every row with a fixed seed (so the two classes are interleaved
# reproducibly), then renumber the index from zero.
data = (
    df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
display(data)

Unnamed: 0,title,target
0,Everyone Stop And Appreciate Zac Efron Touchin...,clickbait
1,Are You Ready To Sync Your Vagina With Your Sm...,clickbait
2,25 Insanely Delicious Things To Make With 3 In...,clickbait
3,MGM Mirage Reaches Deal on Las Vegas Project,non clickbait
4,This Is Why Soy Milk Goes Funny In Your Coffee,clickbait
...,...,...
31995,Police Chief Plays Down I.R.A. Groups in Ulster,non clickbait
31996,Demi Lovato Performs Next To Giant Boner,clickbait
31997,Regular People Try To Catch Passes From An NFL...,clickbait
31998,This Is Why You Don't Fuck With A Crocodile,clickbait


In [5]:
# Features are the raw headline strings; labels are the class names.
X, y = data["title"], data["target"]

# Hold out 20% of the rows as a test set (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Show the size of each split, one output per object.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(25600,)

(6400,)

(25600,)

(6400,)

In [6]:
def custom_tokenizer(text):
    """Convert a headline into a list of lemmatized, stopword-free tokens.

    Steps: lowercase -> strip punctuation -> word-tokenize -> drop English
    stopwords -> lemmatize.

    Parameters
    ----------
    text : str
        Raw headline text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, with every stopword removed.
    """
    # Imports are kept function-local so the tokenizer is self-contained.
    import re
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    # A set gives O(1) membership tests in the filter below.
    en_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    new_text = text.lower()  # lowercase
    new_text = re.sub(r"([^\w\s])", "", new_text)  # remove punctuation
    tokens = word_tokenize(new_text)  # tokenize

    # Build a new list instead of calling .remove() on the list being iterated:
    # removing during iteration shifts the remaining items left, so the token
    # right after each removed stopword was skipped and consecutive stopwords
    # slipped through.
    # NOTE(review): punctuation is stripped before this filter, so apostrophe
    # forms in the stopword list (if any) presumably no longer match — confirm.
    tokens = [token for token in tokens if token not in en_stopwords]

    return [lemmatizer.lemmatize(token) for token in tokens]  # lemmatize

In [7]:
#Display message counts
clickbait = data[(data['target'] == 'clickbait')]
non_clickbait = data[(data['target'] == 'non clickbait')]
clickbait_count = clickbait.value_counts().sum()
non_clickbait_count = non_clickbait.value_counts().sum()
total_messages = clickbait_count + non_clickbait_count
spam_fraction = non_clickbait_count / total_messages

print("Number of clickbait messages: ", clickbait_count)
print("Number of non-clickbait messages: ", non_clickbait_count)

Number of clickbait messages:  15999
Number of non-clickbait messages:  16001


In [8]:
# Scorers with 'clickbait' as the positive class.
precision = make_scorer(
    precision_score,
    pos_label='clickbait',
    greater_is_better=True,
)
recall = make_scorer(
    recall_score,
    pos_label='clickbait',
    greater_is_better=True,
)

# The same two metrics with 'non clickbait' as the positive class, which gives
# negative predictive value and specificity with respect to 'clickbait'.
npv = make_scorer(
    precision_score,
    pos_label='non clickbait',
    greater_is_better=True,
)
specificity = make_scorer(
    recall_score,
    pos_label='non clickbait',
    greater_is_better=True,
)

In [9]:
# Display name -> scoring spec (a built-in scorer string or a scorer object).
clf_metrics = {
    'Accuracy': 'accuracy',
    'Precision': precision,
    'Recall': recall,
    'Negative Predictive Value': npv,
    'Specificity': specificity,
}

# Empty result containers; three separate dict objects, not aliases.
lc_dict, vc_dict, cvs_dict = {}, {}, {}

In [10]:
# Distinct class labels in each split, in order of first appearance.
train_labels, test_labels = y_train.unique(), y_test.unique()