<a href="https://colab.research.google.com/github/Karthikt04/NM/blob/main/Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# 1️ Install dependencies
!pip install -q pandas scikit-learn nltk

In [21]:
import os, re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [23]:
# Download stopwords
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [24]:
# 3️ Prompt user to upload exactly 5 CSV files
print("Please select and upload all train data at once:")
uploaded = files.upload()
if len(uploaded) != 5:
    raise ValueError("please upload all files.")

Please select and upload all train data at once:


Saving business_data.csv to business_data (2).csv
Saving education_data.csv to education_data (1).csv
Saving entertainment_data.csv to entertainment_data (1).csv
Saving sports_data.csv to sports_data (1).csv
Saving technology_data.csv to technology_data (1).csv


In [25]:
dfs = []
for filename in uploaded.keys():
    df_part = pd.read_csv(filename)
    # Convert column names to lowercase and strip spaces
    df_part.columns = df_part.columns.str.strip().str.lower()
    required_cols = {'headlines', 'description', 'content', 'url', 'category'} # Lowercased required columns
    if not required_cols.issubset(df_part.columns):
        raise ValueError(
            f"File {filename} is missing one of the required columns: {required_cols}"
        )
    dfs.append(df_part)
df = pd.concat(dfs, ignore_index=True)
print(f"Combined dataset shape: {df.shape}")

Combined dataset shape: (10000, 5)


In [26]:
# 5️ Preprocess and combine text fields
def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = [w for w in text.split() if w not in stop_words]
    return ' '.join(tokens)

df['text'] = (
    df['headlines'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['content'].fillna('')
).apply(preprocess)

In [27]:
# 6️ Feature extraction with TF-IDF (unigrams + bigrams)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=2)
X = vectorizer.fit_transform(df['text'])
y = df['category']

In [28]:
# 7️ Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [29]:
# 8️ Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [30]:
# Evaluate model
y_pred = model.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
print("Confusion Matrix:")
print(pd.DataFrame(cm, index=model.classes_, columns=model.classes_))

Classification Report:

               precision    recall  f1-score   support

     business       0.98      0.97      0.97       400
    education       1.00      0.99      0.99       400
entertainment       0.99      0.99      0.99       400
       sports       0.99      0.99      0.99       400
   technology       0.95      0.97      0.96       400

     accuracy                           0.98      2000
    macro avg       0.98      0.98      0.98      2000
 weighted avg       0.98      0.98      0.98      2000

Confusion Matrix:
               business  education  entertainment  sports  technology
business            388          0              1       0          11
education             2        395              0       0           3
entertainment         0          0            395       1           4
sports                0          0              0     397           3
technology            6          0              1       3         390


In [31]:
# Inference function for user input headline
def predict_category(headline: str) -> str:
    clean = preprocess(headline)
    vec = vectorizer.transform([clean])
    return model.predict(vec)[0]

# Prompt for user input and display result
user_headline = input("\nEnter a news headline to classify: ").strip()
if not user_headline:
    raise ValueError("Empty input provided.")
print("→ Predicted Category:", predict_category(user_headline))


Enter a news headline to classify: Chandrayaan-3 lander Vikram is now a landmark on Moon
→ Predicted Category: technology
