Import libraries

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

Load the dataset

In [None]:
# Read the labelled articles; later cells rely on the `text` and `label` columns.
df = pd.read_csv("pol_spo.csv")

# Sanity check: preview the first rows, then show how many rows each label has.
print(df.head(), df['label'].value_counts(), sep="\n")

                                                text  label
0  Budget to set scene for election\n \n Gordon B...      0
1  Army chiefs in regiments decision\n \n Militar...      0
2  Howard denies split over ID cards\n \n Michael...      0
3  Observers to monitor UK election\n \n Minister...      0
4  Kilroy names election seat target\n \n Ex-chat...      0
label
1    511
0    417
Name: count, dtype: int64


Basic text cleaning

In [3]:
def clean_text(text):
    """Normalize a raw document into lowercase, space-separated ``a-z`` words.

    Args:
        text: The raw document. Non-string values are converted with ``str``;
            missing values (``None``, ``NaN``, ``pd.NA``) are treated as an
            empty document.

    Returns:
        str: The lowercased text in which every run of characters other than
        ``a-z`` (punctuation, digits, whitespace, non-ASCII letters) has been
        replaced by a single space, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise be stringified into the literal
        # word "nan"/"none" and leak into the vocabulary as a fake token.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    # Substitute a space (not an empty string) so that words joined only by
    # punctuation or digits, e.g. "ex-chat", are not fused into one token.
    return re.sub(r"[^a-z]+", " ", text.lower()).strip()

# Store the normalized documents in a new column next to the raw `text` column.
df['clean_text'] = df['text'].map(clean_text)

# Model inputs (cleaned documents) and targets (class labels).
X, y = df['clean_text'], df['label']

Train-Test split (70-30)

In [16]:
# Stratified 70/30 hold-out split: `stratify=y` keeps the label proportions the
# same in both parts, and the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 649
Test size: 279


Feature Extraction

A) Bag of Words

In [None]:
# Bag-of-words counts: English stop words are dropped and the vocabulary is
# capped at 5000 terms to limit the number of features.
bow_vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary is learned from the training documents only and then reused,
# unchanged, to encode the test documents.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

B) TF-IDF (unigrams + bigrams)

In [None]:
# TF-IDF weights over unigrams and bigrams: English stop words are dropped and
# the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)

# Fit on the training documents only, then encode the test documents with the
# already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Train & Evaluate Models

In [9]:
def train_and_evaluate(model, X_train_vec, X_test_vec, y_train, y_test, model_name):
    """Fit a classifier, print its test-set metrics, and return its accuracy.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features.
        y_train: Training labels.
        y_test: Test labels used for scoring.
        model_name: Title printed in the report banner.

    Returns:
        The accuracy of ``model`` on the test set, as computed by
        ``accuracy_score``.
    """
    model.fit(X_train_vec, y_train)
    predictions = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\n===== {model_name} =====")
    print("Accuracy:", accuracy)
    # The report is labelled with two fixed class names.
    # NOTE(review): presumably label 0 is Politics and 1 is Sports -- confirm against the dataset.
    print(
        classification_report(
            y_test,
            predictions,
            target_names=["Politics", "Sports"],
        )
    )

    return accuracy

Model 1 — Naive Bayes (BoW)

In [11]:
# Multinomial Naive Bayes trained and scored on the bag-of-words features.
nb = MultinomialNB()
acc_nb_bow = train_and_evaluate(
    model=nb,
    X_train_vec=X_train_bow,
    X_test_vec=X_test_bow,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes (BoW)",
)


===== Naive Bayes (BoW) =====
Accuracy: 0.996415770609319
              precision    recall  f1-score   support

    Politics       0.99      1.00      1.00       125
      Sports       1.00      0.99      1.00       154

    accuracy                           1.00       279
   macro avg       1.00      1.00      1.00       279
weighted avg       1.00      1.00      1.00       279



Model 2 — Logistic Regression (TF-IDF)

In [12]:
# Logistic regression trained and scored on the TF-IDF features;
# `max_iter=1000` sets the solver's iteration limit.
lr = LogisticRegression(max_iter=1000)
acc_lr_tfidf = train_and_evaluate(
    model=lr,
    X_train_vec=X_train_tfidf,
    X_test_vec=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression (TF-IDF)",
)


===== Logistic Regression (TF-IDF) =====
Accuracy: 1.0
              precision    recall  f1-score   support

    Politics       1.00      1.00      1.00       125
      Sports       1.00      1.00      1.00       154

    accuracy                           1.00       279
   macro avg       1.00      1.00      1.00       279
weighted avg       1.00      1.00      1.00       279



Model 3 — SVM (TF-IDF)

In [13]:
# Linear SVM trained and scored on the TF-IDF features.
# `random_state` is fixed (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same fitted model and reported metrics.
svm = LinearSVC(random_state=42)
acc_svm_tfidf = train_and_evaluate(svm, X_train_tfidf, X_test_tfidf, y_train, y_test, "SVM (TF-IDF)")


===== SVM (TF-IDF) =====
Accuracy: 1.0
              precision    recall  f1-score   support

    Politics       1.00      1.00      1.00       125
      Sports       1.00      1.00      1.00       154

    accuracy                           1.00       279
   macro avg       1.00      1.00      1.00       279
weighted avg       1.00      1.00      1.00       279



Compare results in a table

In [14]:
# One row per experiment: (display name, test-set accuracy).
results = pd.DataFrame(
    [
        ("Naive Bayes (BoW)", acc_nb_bow),
        ("Logistic Regression (TF-IDF)", acc_lr_tfidf),
        ("SVM (TF-IDF)", acc_svm_tfidf),
    ],
    columns=["Model", "Accuracy"],
)

results

Unnamed: 0,Model,Accuracy
0,Naive Bayes (BoW),0.996416
1,Logistic Regression (TF-IDF),1.0
2,SVM (TF-IDF),1.0


End