In [1]:
!pip install numpy pandas scikit-learn nltk spacy gensim tqdm psutil
!python -m spacy download en_core_web_sm


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencie

In [1]:
# ===== STEP 1: Create master.csv (ONE-TIME) =====

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Path to uploaded BBC dataset in Colab
SRC = "/content/bbc-text.csv"   # columns: text, category
OUT = "master.csv"
SEED = 137  # fixed seed for reproducibility

# Load dataset
df = pd.read_csv(SRC)

# Rename category -> label and keep required columns
df = df.rename(columns={"category": "label"})[["text", "label"]]

# Create unique sequential IDs
df["id"] = [f"bbc_{i:05d}" for i in range(len(df))]

# Create 5 stratified folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = np.zeros(len(df), dtype=int)

for f, (_, val_idx) in enumerate(skf.split(df["text"], df["label"])):
    folds[val_idx] = f

df["fold5"] = folds

# Save master.csv
cols = ["id", "text", "label", "fold5"]
df[cols].to_csv(OUT, index=False, encoding="utf-8")

print("Saved:", OUT)
print("Total rows:", len(df))


Saved: master.csv
Total rows: 2225


In [2]:
# ===== Sanity checks for master.csv =====

check = pd.read_csv("master.csv")

# 1. Check columns
print("Columns:", list(check.columns))

# 2. Check unique IDs
print("IDs unique:", check["id"].is_unique)

# 3. Check fold range
print("Fold values:", sorted(check["fold5"].unique()))

# 4. Check rows per fold
print(check["fold5"].value_counts().sort_index())


Columns: ['id', 'text', 'label', 'fold5']
IDs unique: True
Fold values: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
fold5
0    445
1    445
2    445
3    445
4    445
Name: count, dtype: int64


In [4]:

# ===== STEP 2: Roll-based TRAIN / DEV / TEST split =====

import pandas as pd
import zlib

ROLL = "SE22UARI163"   # <<< PUT YOUR ROLL NUMBER HERE EXACTLY

# Load master.csv
df = pd.read_csv("master.csv")

# Compute folds deterministically from roll
r = zlib.crc32(ROLL.encode())
dev_fold  = r % 5
test_fold = (r // 5) % 5
if test_fold == dev_fold:
    test_fold = (test_fold + 1) % 5

# Split
DEV   = df[df.fold5 == dev_fold].copy()
TEST  = df[df.fold5 == test_fold].copy()
TRAIN = df[~df.fold5.isin([dev_fold, test_fold])].copy()

print("DEV fold :", dev_fold, "| docs:", len(DEV))
print("TEST fold:", test_fold, "| docs:", len(TEST))
print("TRAIN docs:", len(TRAIN))



DEV fold : 1 | docs: 445
TEST fold: 3 | docs: 445
TRAIN docs: 1335


In [5]:
# ===== Sanity checks =====

print("Overlap DEV & TEST:", set(DEV.id) & set(TEST.id))
print("Overlap TRAIN & DEV:", set(TRAIN.id) & set(DEV.id))
print("Overlap TRAIN & TEST:", set(TRAIN.id) & set(TEST.id))


Overlap DEV & TEST: set()
Overlap TRAIN & DEV: set()
Overlap TRAIN & TEST: set()


In [6]:
# ===== STEP 3: Preprocessing setup =====

import re
import nltk
import spacy
from nltk.corpus import stopwords

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# Load spaCy model (already installed)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# ===== Preprocessing function =====

def preprocess(text):
    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r"[^\w\s]", " ", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize + lemmatize
    doc = nlp(text)

    tokens = []
    for tok in doc:
        if tok.text not in stop_words:
            tokens.append(tok.lemma_)

    return tokens


In [8]:
# ===== Test preprocessing =====

sample = "The markets aren't doing well, but companies are not giving up!"
print(preprocess(sample))


['market', 'well', 'company', 'give']


In [9]:
# ===== Apply preprocessing =====

TRAIN["tokens"] = TRAIN["text"].apply(preprocess)
DEV["tokens"]   = DEV["text"].apply(preprocess)
TEST["tokens"]  = TEST["text"].apply(preprocess)

print("Preprocessing complete.")


Preprocessing complete.


In [10]:
# ===== Sanity check =====

print("Sample TRAIN tokens:")
print(TRAIN.tokens.iloc[0][:20])

print("\nEmpty token docs in TRAIN:", (TRAIN.tokens.str.len() == 0).sum())
print("Empty token docs in TEST :", (TEST.tokens.str.len() == 0).sum())


Sample TRAIN tokens:
['tv', 'future', 'hand', 'viewer', 'home', 'theatre', 'system', 'plasma', 'high', 'definition', 'tv', 'digital', 'video', 'recorder', 'move', 'living', 'room', 'way', 'people', 'watch']

Empty token docs in TRAIN: 0
Empty token docs in TEST : 0


In [11]:
# ===== OHE: Build vocabulary from TRAIN =====

from collections import Counter

# Count token frequencies in TRAIN
token_counter = Counter(tok for doc in TRAIN.tokens for tok in doc)

# Top 2000 tokens
V_OHE = 2000
ohe_vocab = [w for w, _ in token_counter.most_common(V_OHE)]

# Word → index mapping
ohe_word2idx = {w: i for i, w in enumerate(ohe_vocab)}

print("OHE vocabulary size:", len(ohe_vocab))
print("Sample tokens:", ohe_vocab[:10])


OHE vocabulary size: 2000
Sample tokens: ['say', 'year', 'mr', 'would', 'make', 'also', 'people', 'one', 'new', 'go']


In [12]:
# ===== OHE vectorizer =====

import numpy as np

def ohe_vector(tokens, word2idx):
    vec = np.zeros(len(word2idx), dtype=int)
    for t in set(tokens):   # binary (0/1)
        if t in word2idx:
            vec[word2idx[t]] = 1
    return vec


In [13]:
# ===== OHE matrices =====

X_train_ohe = np.vstack(TRAIN.tokens.apply(lambda x: ohe_vector(x, ohe_word2idx)))
X_dev_ohe   = np.vstack(DEV.tokens.apply(lambda x: ohe_vector(x, ohe_word2idx)))
X_test_ohe  = np.vstack(TEST.tokens.apply(lambda x: ohe_vector(x, ohe_word2idx)))

print("OHE TRAIN shape:", X_train_ohe.shape)
print("OHE TEST shape :", X_test_ohe.shape)


OHE TRAIN shape: (1335, 2000)
OHE TEST shape : (445, 2000)


In [14]:
# ===== OHE health metrics =====

N_docs = X_train_ohe.shape[0]
V = X_train_ohe.shape[1]
nnz = X_train_ohe.sum()

sparsity = 1 - nnz / (N_docs * V)

print("Vocab size (V):", V)
print("Non-zeros (nnz):", nnz)
print("Sparsity:", round(sparsity, 4))


Vocab size (V): 2000
Non-zeros (nnz): 147345
Sparsity: 0.9448


In [15]:
# ===== OOV rate (TEST) =====

train_vocab = set(ohe_vocab)
test_tokens = [tok for doc in TEST.tokens for tok in doc]

oov_count = sum(1 for t in test_tokens if t not in train_vocab)
oov_rate = oov_count / max(1, len(test_tokens))

print("OOV rate (TEST):", round(oov_rate, 4))


OOV rate (TEST): 0.2418


In [16]:
# ===== BoW (Unigram) =====

from sklearn.feature_extraction.text import CountVectorizer

bow_vec = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    min_df=2
)

X_train_bow = bow_vec.fit_transform(TRAIN.tokens)
X_dev_bow   = bow_vec.transform(DEV.tokens)
X_test_bow  = bow_vec.transform(TEST.tokens)

print("BoW TRAIN shape:", X_train_bow.shape)
print("BoW TEST shape :", X_test_bow.shape)




BoW TRAIN shape: (1335, 10571)
BoW TEST shape : (445, 10571)


In [17]:
# ===== BoW health metrics =====

N_docs = X_train_bow.shape[0]
V = X_train_bow.shape[1]
nnz = X_train_bow.nnz

sparsity = 1 - nnz / (N_docs * V)

print("Vocab size (V):", V)
print("Non-zeros (nnz):", nnz)
print("Sparsity:", round(sparsity, 4))


Vocab size (V): 10571
Non-zeros (nnz): 193560
Sparsity: 0.9863


In [18]:
# ===== OOV rate (TEST) =====

train_vocab = set(bow_vec.vocabulary_.keys())
test_tokens = [tok for doc in TEST.tokens for tok in doc]

oov_count = sum(1 for t in test_tokens if t not in train_vocab)
oov_rate = oov_count / max(1, len(test_tokens))

print("OOV rate (TEST):", round(oov_rate, 4))


OOV rate (TEST): 0.06


In [19]:
# ===== N-grams (1,2) =====

from sklearn.feature_extraction.text import CountVectorizer

ngram_vec = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    ngram_range=(1, 2),
    min_df=3
)

X_train_ng = ngram_vec.fit_transform(TRAIN.tokens)
X_dev_ng   = ngram_vec.transform(DEV.tokens)
X_test_ng  = ngram_vec.transform(TEST.tokens)

print("N-gram TRAIN shape:", X_train_ng.shape)
print("N-gram TEST shape :", X_test_ng.shape)


N-gram TRAIN shape: (1335, 19826)
N-gram TEST shape : (445, 19826)


In [20]:
# ===== N-gram health metrics =====

N_docs = X_train_ng.shape[0]
V = X_train_ng.shape[1]
nnz = X_train_ng.nnz

sparsity = 1 - nnz / (N_docs * V)

print("Vocab size (V):", V)
print("Non-zeros (nnz):", nnz)
print("Sparsity:", round(sparsity, 4))


Vocab size (V): 19826
Non-zeros (nnz): 249812
Sparsity: 0.9906


In [21]:
# ===== OOV rate (TEST) =====

train_vocab = set(ngram_vec.vocabulary_.keys())
test_tokens = [tok for doc in TEST.tokens for tok in doc]

oov_count = sum(1 for t in test_tokens if t not in train_vocab)
oov_rate = oov_count / max(1, len(test_tokens))

print("OOV rate (TEST):", round(oov_rate, 4))


OOV rate (TEST): 0.079


In [22]:
# ===== TF-IDF (1,2)-grams =====

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    ngram_range=(1, 2),
    min_df=2,
    norm='l2'
)

X_train_tfidf = tfidf_vec.fit_transform(TRAIN.tokens)
X_dev_tfidf   = tfidf_vec.transform(DEV.tokens)
X_test_tfidf  = tfidf_vec.transform(TEST.tokens)

print("TF-IDF TRAIN shape:", X_train_tfidf.shape)
print("TF-IDF TEST shape :", X_test_tfidf.shape)


TF-IDF TRAIN shape: (1335, 45899)
TF-IDF TEST shape : (445, 45899)


In [23]:
# ===== TF-IDF health metrics =====

N_docs = X_train_tfidf.shape[0]
V = X_train_tfidf.shape[1]
nnz = X_train_tfidf.nnz

sparsity = 1 - nnz / (N_docs * V)

print("Vocab size (V):", V)
print("Non-zeros (nnz):", nnz)
print("Sparsity:", round(sparsity, 4))


Vocab size (V): 45899
Non-zeros (nnz): 301958
Sparsity: 0.9951


In [24]:
# ===== OOV rate (TEST) =====

train_vocab = set(tfidf_vec.vocabulary_.keys())
test_tokens = [tok for doc in TEST.tokens for tok in doc]

oov_count = sum(1 for t in test_tokens if t not in train_vocab)
oov_rate = oov_count / max(1, len(test_tokens))

print("OOV rate (TEST):", round(oov_rate, 4))


OOV rate (TEST): 0.06


In [25]:
# ===== Models & metrics =====

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [26]:
# ===== Labels =====

y_train = TRAIN.label
y_dev   = DEV.label
y_test  = TEST.label


In [27]:
# ===== Multinomial NB =====

mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

y_pred_mnb = mnb.predict(X_test_tfidf)


In [28]:
# ===== Logistic Regression =====

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)

y_pred_lr = lr.predict(X_test_tfidf)


In [29]:
# ===== Linear SVM =====

svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)

y_pred_svm = svm.predict(X_test_tfidf)


In [30]:
# ===== Evaluation =====

def evaluate(name, y_true, y_pred):
    print(f"\n{name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))

evaluate("Multinomial NB", y_test, y_pred_mnb)
evaluate("Logistic Regression", y_test, y_pred_lr)
evaluate("Linear SVM", y_test, y_pred_svm)



Multinomial NB
Accuracy: 0.9707865168539326
               precision    recall  f1-score   support

     business       0.95      1.00      0.98       102
entertainment       0.99      0.90      0.94        77
     politics       0.98      0.98      0.98        84
        sport       0.98      0.99      0.99       102
         tech       0.96      0.97      0.97        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445


Logistic Regression
Accuracy: 0.9797752808988764
               precision    recall  f1-score   support

     business       0.97      1.00      0.99       102
entertainment       0.96      0.96      0.96        77
     politics       0.98      0.96      0.97        84
        sport       0.99      0.99      0.99       102
         tech       1.00      0.97      0.99        80

     accuracy                           0.98       445
    macro avg       

In [31]:
# ===== Model comparison table =====

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

models = {
    "Multinomial NB": y_pred_mnb,
    "Logistic Regression": y_pred_lr,
    "Linear SVM": y_pred_svm
}

rows = []

for name, preds in models.items():
    acc = accuracy_score(y_test, preds)
    p, r, f, _ = precision_recall_fscore_support(y_test, preds, average='macro')
    rows.append([name, acc, p, r, f])

df_results = pd.DataFrame(
    rows,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"]
)

df_results


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Multinomial NB,0.970787,0.971744,0.967498,0.96907
1,Logistic Regression,0.979775,0.979713,0.978104,0.978829
2,Linear SVM,0.98427,0.983643,0.983083,0.983318


In [32]:
# ===== Save test predictions =====

import pandas as pd

pred_df = pd.DataFrame({
    "true_label": y_test.values,
    "predicted_label": y_pred_svm   # best model
})

pred_df.to_csv("SE22UCSE222/preds_test.csv", index=False)

print("preds_test.csv created")


OSError: Cannot save file into a non-existent directory: 'SE22UCSE222'