In [1]:
# Project 7: News Topic Classification
# An end-to-end NLP application that automatically downloads, preprocesses, and categorizes news articles into specific domains (e.g., Business vs. Tech).

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# STEP 1: FETCH DATA DIRECTLY WITH PANDAS
# The URL for the BBC News dataset
url_csv = "https://raw.githubusercontent.com/suraj-deshmukh/BBC-Dataset-News-Classification/master/dataset/dataset.csv"

print("Fetching and reading data...")

# Only a decoding failure justifies retrying with another encoding. Network
# and parser errors are left to propagate with their real traceback instead
# of being reported as an encoding problem and triggering a second download.
# Latin-1 maps every byte value, so the cp1252 branch is a safety net only.
try:
    df = pd.read_csv(url_csv, encoding='latin-1')
except UnicodeDecodeError as e:
    print(f"Latin-1 failed, trying cp1252. Error: {e}")
    df = pd.read_csv(url_csv, encoding='cp1252')



Fetching and reading data...


In [2]:
# STEP 2: LOAD & FILTER DATA
# Give the two columns descriptive names: article text and its label.
df.columns = ['news', 'category']

# This demo is a binary problem: keep only 'business' and 'tech' articles.
target_categories = ['business', 'tech']
is_target_row = df['category'].isin(target_categories)
# .copy() detaches the subset so later column assignments do not touch `df`.
df_filtered = df.loc[is_target_row].copy()

print(f"Data Loaded Successfully. Shape: {df_filtered.shape}")
print(df_filtered.head())



Data Loaded Successfully. Shape: (911, 2)
                                                news  category
0  China had role in Yukos split-up\n \n China le...  business
1  Oil rebounds from weather effect\n \n Oil pric...  business
2  Indonesia 'declines debt freeze'\n \n Indonesi...  business
3  $1m payoff for former Shell boss\n \n Shell is...  business
4  US bank in $515m SEC settlement\n \n Five Bank...  business


In [3]:
# STEP 3: TEXT PREPROCESSING (CLEANING)
def clean_text(text):
    """Normalize a raw article into lowercase, letters-only, single-spaced text.

    Args:
        text: The raw article text. Non-string values are converted with
            ``str()``; missing values (``None`` or float NaN) are treated as
            empty text.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing or contains
        no ASCII letters.
    """
    # Missing values must not be stringified: str(float('nan')) is 'nan' and
    # str(None) is 'None', which would survive cleaning as bogus tokens.
    # `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # 1. Lowercase (after coercing any remaining non-string input)
    text = str(text).lower()
    # 2. Keep only a-z and whitespace; digits, punctuation and non-ASCII
    #    characters are deleted outright (not replaced by a space).
    text = re.sub(r'[^a-z\s]', '', text)
    # 3. Collapse whitespace runs into single spaces and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every article through clean_text and store the result in a new column
df_filtered['clean_news'] = df_filtered['news'].map(clean_text)



In [4]:
# STEP 4: FEATURE ENGINEERING (TF-IDF)
y = df_filtered['category']

# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would let the test articles influence the vocabulary and IDF weights (data
# leakage), which makes the held-out score optimistic.
# The seed, test_size and sample count are unchanged, so the row partition
# should match the previous version of this cell.
text_train, text_test, y_train, y_test = train_test_split(
    df_filtered['clean_news'], y, test_size=0.2, random_state=42
)

# Learn the 1000-term vocabulary and IDF weights from the training text only,
# then project the test text into that same feature space.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix in the train-fitted feature space, kept under the
# original name `X` for any code that still refers to it.
X = tfidf.transform(df_filtered['clean_news'])



In [5]:
# STEP 5: TRAIN MODEL
#
# MODEL: Multinomial Naive Bayes (MultinomialNB)
#   - A probabilistic classifier.
#   - Designed for features that represent counts or frequencies, such as
#     word counts in text classification.
#
# ALGORITHM: Multinomial Naive Bayes on TF-IDF features.
#   1. TF-IDF (Term Frequency-Inverse Document Frequency) converts raw text
#      into a numeric matrix, weighting each word by how important and how
#      distinctive it is for a given document.
#   2. MultinomialNB predicts the news category from the distribution of those
#      weighted word frequencies.
model = MultinomialNB()

# .fit() is the training step: the model learns the relationship between the
# word-frequency features (X) and the category labels (y).
model.fit(X=X_train, y=y_train)





In [6]:
# STEP 6: EVALUATE & TEST
# Predict the held-out split and print per-class precision / recall / F1.
predictions = model.predict(X_test)
report = classification_report(y_test, predictions)
print("\n Classification Report")
print(report)

# Test with a real-world example: the headline goes through the same cleaning
# and the already-fitted vectorizer before it is classified.
new_headline = ["The stock market crashed due to inflation fears"]
new_headline_clean = [clean_text(headline) for headline in new_headline]
new_vector = tfidf.transform(new_headline_clean)
pred = model.predict(new_vector)

print(f"Headline: '{new_headline[0]}'")
print(f"Predicted Category: {pred[0]}")


 Classification Report
              precision    recall  f1-score   support

    business       0.99      0.99      0.99       103
        tech       0.99      0.99      0.99        80

    accuracy                           0.99       183
   macro avg       0.99      0.99      0.99       183
weighted avg       0.99      0.99      0.99       183

Headline: 'The stock market crashed due to inflation fears'
Predicted Category: business


**EVALUATION RESULTS**

**1. Overall Accuracy:** 99% (Exceptionally high).

**2. Class Performance:** Both 'Business' and 'Tech' categories achieved 0.99 Precision and Recall.
- This indicates the model separates the two domains almost perfectly on the held-out test split (183 articles).
- Likely reason: Business articles (e.g., "market", "economy") and Tech articles (e.g., "software", "digital") use largely distinct vocabularies.
  
**3. Real-world Test:**
- Input: "The stock market crashed due to inflation fears"
- Prediction: "Business" (Correct).
- The prediction is consistent with the headline's domain-specific keywords such as "stock" and "inflation", although the notebook does not inspect which terms actually drove it.