In [1]:
import nltk
import pandas as pd
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer;
from sklearn.model_selection import train_test_split;
from sklearn.metrics import accuracy_score, classification_report
# Fetch every NLTK resource this notebook relies on, one call per resource.
# The second positional argument of nltk.download() is the *download directory*,
# so nltk.download("punkt", "stopwords") stored punkt in a local "stopwords/"
# folder and never fetched the stop-word corpus.
nltk.download("punkt")      # tokenizer models used by word_tokenize
nltk.download("stopwords")  # stop-word lists used to build `stws`
nltk.download("wordnet")    # lexical database used by WordNetLemmatizer
# NOTE(review): recent NLTK releases may also require "punkt_tab" for
# word_tokenize — confirm against the installed NLTK version.
stws = stopwords.words("english")

[nltk_data] Downloading package punkt to stopwords...
[nltk_data]   Package punkt is already up-to-date!


### Load the dataset that was already pre-processed in the data analysis stage

In [2]:
# Load the articles that were cleaned and lower-cased in the data-analysis stage.
df = pd.read_csv("../data/pre_processed_fake_news.csv")

# Fail fast with a readable message if the file does not have the shape the
# rest of the notebook relies on (text column + label column, no missing text).
missing_columns = {"text_lower", "fake_or_factual"} - set(df.columns)
assert not missing_columns, f"dataset is missing expected columns: {sorted(missing_columns)}"
assert df["text_lower"].notna().all(), "text_lower contains missing values; word_tokenize needs strings"

df.head()

Unnamed: 0,text_lower,fake_or_factual
0,yearold girl who had apparently given birth sh...,Fake News
1,buried in trump s bonkers interview with new y...,Fake News
2,women make up over 50 percent of this country ...,Fake News
3,backed fighters in syria was being discussed a...,Factual News
4,sharing government britain s minister for the ...,Factual News


## Tokenization and stop-word removal

In [3]:
# Tokenise each article with the NLTK word tokenizer and drop English stop words.
# Membership is tested against a set so every lookup is constant-time instead of
# a linear scan of the stop-word list for each token.
stopword_set = set(stws)
df["tokens"] = df["text_lower"].apply(
    lambda text: [word for word in word_tokenize(text) if word not in stopword_set]
)

## Lemmatization

In [4]:
# Reduce every token to its WordNet lemma, then rebuild one space-separated
# string per article so the text can be fed to a vectorizer later on.
lemmatizer = WordNetLemmatizer()
df["lemmatized"] = df["tokens"].map(
    lambda tokens: " ".join(lemmatizer.lemmatize(token) for token in tokens)
)
df.head()

Unnamed: 0,text_lower,fake_or_factual,tokens,lemmatized
0,yearold girl who had apparently given birth sh...,Fake News,"[yearold, girl, apparently, given, birth, shor...",yearold girl apparently given birth shortly ar...
1,buried in trump s bonkers interview with new y...,Fake News,"[buried, trump, bonkers, interview, new, york,...",buried trump bonkers interview new york time r...
2,women make up over 50 percent of this country ...,Fake News,"[women, make, 50, percent, country, grossly, u...",woman make 50 percent country grossly underrep...
3,backed fighters in syria was being discussed a...,Factual News,"[backed, fighters, syria, discussed, highest, ...",backed fighter syria discussed highest level c...
4,sharing government britain s minister for the ...,Factual News,"[sharing, government, britain, minister, regio...",sharing government britain minister region sai...


In [5]:
# Keep only the lemmatized text and its label, under the column names
# ("news", "label") that the classification pipeline reads.
training_data = df[["lemmatized", "fake_or_factual"]].rename(
    columns={"lemmatized": "news", "fake_or_factual": "label"}
)

## Classification model training pipeline

In [6]:
def custom_classifier_pipeline(data, model_class, vectorizer_type="count", pos_data=None, test_size=0.35, random_state=42):
    """
    Vectorize text, train a classifier on a train split and report hold-out metrics.

    Parameters
    ----------
    data : mapping/DataFrame with keys "news" and "label", or a list of
        (label, text) tuples.
    model_class : a scikit-learn classifier class, e.g. LogisticRegression or
        GaussianNB. LogisticRegression is built with fixed hyper-parameters
        (see below); any other class is instantiated with its defaults.
    vectorizer_type : "count" or "tfidf".
    pos_data : optional sequence of POS-tag strings, one per sample and in the
        same order as the news texts. When given, it is vectorized with its own
        vectorizer and appended to the text features.
    test_size, random_state : forwarded to train_test_split.

    Returns
    -------
    dict with keys
        "model"          : the fitted classifier,
        "vectorizer"     : the vectorizer fitted on the news texts,
        "pos_vectorizer" : the vectorizer fitted on pos_data (None if unused),
        "accuracy_score" : accuracy on the hold-out split.

    Raises
    ------
    ValueError : if vectorizer_type is not "count" or "tfidf".

    Side effects: prints the accuracy and the classification report.
    """

    def _new_vectorizer():
        # Build a fresh, unfitted vectorizer of the requested kind.
        if vectorizer_type == "count":
            return CountVectorizer()
        if vectorizer_type == "tfidf":
            return TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
        raise ValueError("vectorizer_type must be 'count' or 'tfidf'")

    if isinstance(data, list):
        y_data, x_data = zip(*data)
    else:
        x_data = data["news"]
        y_data = data["label"]

    # Split the POS features in the SAME call as the texts and labels.
    # train_test_split shuffles, so slicing pos_data by position afterwards
    # (as the previous implementation did) pairs each text with the wrong row.
    arrays = [x_data, y_data]
    if pos_data is not None:
        arrays.append(pos_data)
    splits = train_test_split(*arrays, test_size=test_size, random_state=random_state)
    x_train_texts, x_test_texts, y_train, y_test = splits[:4]

    vectorizer = _new_vectorizer()
    x_train = vectorizer.fit_transform(x_train_texts)
    x_test = vectorizer.transform(x_test_texts)

    pos_vectorizer = None
    if pos_data is not None:
        pos_train, pos_test = splits[4:]
        # A separate vectorizer is required: re-fitting the text vectorizer on
        # POS tags would replace its vocabulary, so the returned vectorizer
        # would no longer match the text columns the model was trained on.
        pos_vectorizer = _new_vectorizer()
        pos_vec_train = pos_vectorizer.fit_transform(pos_train)
        pos_vec_test = pos_vectorizer.transform(pos_test)

        # hstack yields a COO matrix; convert to CSR for row-wise access.
        x_train = hstack([x_train, pos_vec_train]).tocsr()
        x_test = hstack([x_test, pos_vec_test]).tocsr()

    if issubclass(model_class, LogisticRegression):
        model = model_class(max_iter=2000, C=1.0, class_weight='balanced', solver='liblinear')
    else:
        model = model_class()

    # GaussianNB needs dense arrays
    if issubclass(model_class, GaussianNB):
        x_train = x_train.toarray()
        x_test = x_test.toarray()

    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    acc_score = accuracy_score(y_test, predictions)
    print("Accuracy score:", acc_score)
    print(classification_report(y_test, predictions))

    return {
        "model": model,
        "vectorizer": vectorizer,
        "pos_vectorizer": pos_vectorizer,
        "accuracy_score": acc_score,
    }

In [7]:
# Logistic Regression trained on TF-IDF features.
model1 = custom_classifier_pipeline(training_data, LogisticRegression, vectorizer_type="tfidf")

Accuracy score: 0.8714285714285714
              precision    recall  f1-score   support

Factual News       0.85      0.92      0.88        36
   Fake News       0.90      0.82      0.86        34

    accuracy                           0.87        70
   macro avg       0.87      0.87      0.87        70
weighted avg       0.87      0.87      0.87        70



In [8]:
# Logistic Regression trained on raw token counts.
model2 = custom_classifier_pipeline(training_data, LogisticRegression, vectorizer_type="count")

Accuracy score: 0.8142857142857143
              precision    recall  f1-score   support

Factual News       0.77      0.92      0.84        36
   Fake News       0.89      0.71      0.79        34

    accuracy                           0.81        70
   macro avg       0.83      0.81      0.81        70
weighted avg       0.83      0.81      0.81        70



In [9]:
# Gaussian Naive Bayes trained on TF-IDF features.
model3 = custom_classifier_pipeline(training_data, GaussianNB, vectorizer_type="tfidf")

Accuracy score: 0.8142857142857143
              precision    recall  f1-score   support

Factual News       0.85      0.78      0.81        36
   Fake News       0.78      0.85      0.82        34

    accuracy                           0.81        70
   macro avg       0.82      0.82      0.81        70
weighted avg       0.82      0.81      0.81        70



In [10]:
# Gaussian Naive Bayes trained on raw token counts.
model4 = custom_classifier_pipeline(training_data, GaussianNB, vectorizer_type="count")

Accuracy score: 0.7714285714285715
              precision    recall  f1-score   support

Factual News       0.78      0.78      0.78        36
   Fake News       0.76      0.76      0.76        34

    accuracy                           0.77        70
   macro avg       0.77      0.77      0.77        70
weighted avg       0.77      0.77      0.77        70



## Conclusion 
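After evaluating four model configurations combining Logistic Regression and GaussianNB with Count and TF-IDF vectorizers, we found that Logistic Regression paired with TF-IDF (1–3 n-grams, max_features=5000) achieved the highest accuracy: 87% on the 70-article hold-out set. TF-IDF features improved both classifiers relative to raw counts (Logistic Regression: 87% vs. 81%; GaussianNB: 81% vs. 77%), which suggests that down-weighting very common terms and including n-grams captures more useful contextual patterns in the text. Logistic Regression also outperformed GaussianNB under both vectorization methods, making it the preferred choice for our fake news detection model. Because the hold-out set is small, these differences should be read as indicative rather than definitive.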

In [11]:
# Persist each classifier together with the vectorizer fitted alongside it,
# since the pipeline vectorizes text before calling the model.

# Logistic Regression + TF-IDF (model1)
joblib.dump(value=model1["model"], filename="kbap_model.pkl")
joblib.dump(value=model1["vectorizer"], filename="kbap_vectorizer.pkl")

# GaussianNB + TF-IDF (model3)
joblib.dump(value=model3["model"], filename="quab_model.pkl")
joblib.dump(value=model3["vectorizer"], filename="quab_vectorizer.pkl")

['quab_vectorizer.pkl']