In [19]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics

RANDOM_STATE = 42
TEST_SIZE = 0.20

In [None]:
# Load dataset
df = pd.read_csv('dataset\\text classifcation.csv')

In [4]:
df.head()

Unnamed: 0,text,label
0,DEbATinG IF BuRgER🍔 Or bIRYanI is THe TRUe kIn...,food
1,LATEst SMartpHONE bY opeNai dROPpEd tOdAy 🔥 wi...,tech
2,cRicKet COMmeNTArY FelT bIasEd SmH BUT sTILL W...,sports
3,sOfTwaRE upDatE HaD BuGZzZ again 😂 usErs on Tw...,tech
4,soFTwarE updatE Had bugZzz AGAIN 😂 useRs On Tw...,tech


In [7]:
# Data cleaning and preprocessing
SLANG = {
"lol": "laugh", "lmao": "laugh", "omg": "oh my god", "smh": "shaking my head",
"btw": "by the way", "idk": "i do not know", "imo": "in my opinion", "imho": "in my humble opinion",
"u": "you", "ur": "your", "gr8": "great", "ppl": "people", "pls": "please", "thx": "thanks",
}

emoji_rejex = re.compile("[^\x00-\x7F]+")
re_multi_space = re.compile(r"\s+")

# function for normalizing repeating words
def normalize_repeating_words(word: str)-> str:
    return re.sub(r"(.)\1{2,}", r"\1\1", word)

def clean_text(text: str)-> str:
    text = text.lower()
    text = emoji_rejex.sub(r'', text)  # remove emojis
    text = re_multi_space.sub(' ', text)  # remove multiple spaces
    text = ' '.join([SLANG[word] if word in SLANG else word for word in text.split()])  # replace slang words
    text = ' '.join([normalize_repeating_words(word) for word in text.split()])  # normalize repeating words
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special characters and numbers
    return text.strip()

df['cleaned_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label,cleaned_text
0,DEbATinG IF BuRgER🍔 Or bIRYanI is THe TRUe kIn...,food,debating if burger or biryani is the true king...
1,LATEst SMartpHONE bY opeNai dROPpEd tOdAy 🔥 wi...,tech,latest smartphone by openai dropped today with...
2,cRicKet COMmeNTArY FelT bIasEd SmH BUT sTILL W...,sports,cricket commentary felt biased shaking my head...
3,sOfTwaRE upDatE HaD BuGZzZ again 😂 usErs on Tw...,tech,software update had bugzz again users on twitt...
4,soFTwarE updatE Had bugZzz AGAIN 😂 useRs On Tw...,tech,software update had bugzz again users on twitt...


In [12]:
# Train and Test Split
X = df["text"].astype(str)
Y = df["label"].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=Y
)
print(len(X_train), len(X_test))

8000 2000


In [13]:
""" Why: TF‑IDF converts text to numeric features. Using the clean_text as the preprocessor ensures every sample is normalized the same way during training and inference.

Design choices

ngram_range=(1, 2): Unigrams + bigrams help short texts.

min_df=2: drop extremely rare terms (noise).

max_df=0.9: drop overly common terms. """

tfidf = TfidfVectorizer(
preprocessor=clean_text,
ngram_range=(1, 2),
min_df=2,
max_df=0.9,
)

In [14]:
pipe_logreg = Pipeline([
("tfidf", tfidf),
("clf", LogisticRegression(max_iter=200, solver="lbfgs", multi_class="auto")),
])

pipe_nb = Pipeline([
("tfidf", tfidf),
("clf", MultinomialNB()),
])

In [15]:
# Model Training
pipe_logreg.fit(X_train, y_train)
pipe_nb.fit(X_train, y_train)

In [20]:
pred_lr = pipe_logreg.predict(X_test)
pred_nb = pipe_nb.predict(X_test)

acc_lr = metrics.accuracy_score(y_test, pred_lr)
acc_nb = metrics.accuracy_score(y_test, pred_nb)

print("=== ACCURACY ===")
print(f"Logistic Regression: {acc_lr:.4f}")
print(f"Multinomial Naive Bayes: {acc_nb:.4f}")

print("\n=== CLASSIFICATION REPORT: Logistic Regression ===")
print(metrics.classification_report(y_test, pred_lr, digits=3))

print("\n=== CLASSIFICATION REPORT: Multinomial Naive Bayes ===")
print(metrics.classification_report(y_test, pred_nb, digits=3))

=== ACCURACY ===
Logistic Regression: 1.0000
Multinomial Naive Bayes: 1.0000

=== CLASSIFICATION REPORT: Logistic Regression ===
               precision    recall  f1-score   support

entertainment      1.000     1.000     1.000       400
         food      1.000     1.000     1.000       400
     politics      1.000     1.000     1.000       400
       sports      1.000     1.000     1.000       400
         tech      1.000     1.000     1.000       400

     accuracy                          1.000      2000
    macro avg      1.000     1.000     1.000      2000
 weighted avg      1.000     1.000     1.000      2000


=== CLASSIFICATION REPORT: Multinomial Naive Bayes ===
               precision    recall  f1-score   support

entertainment      1.000     1.000     1.000       400
         food      1.000     1.000     1.000       400
     politics      1.000     1.000     1.000       400
       sports      1.000     1.000     1.000       400
         tech      1.000     1.000     1.