<a href="https://colab.research.google.com/github/Jabed-Hasan/python/blob/main/Naive_Bayes_Classifier_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Green University of Bangladesh**
# **Department of CSE**


---


## CSE 412: Machine Learning Lab
## CLP - 05
### Naive Bayes Classifier

##### **Student Name:** Farhan Sadik
##### **Student ID:** 221002982
##### **Instructor:** Md. Jahid Tanvir  
##### **Date:** Aug 10, 2025


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ----------------------
# Scraper Function
# ----------------------
def scrape_bbc(category_url, label, max_articles=50, timeout=10, delay=1.0, min_words=50):
    """Collect article text linked from a BBC index page and tag it with ``label``.

    The index page at ``category_url`` is downloaded and every anchor whose
    ``href`` starts with ``/news`` (when ``category_url`` contains the word
    "politics") or ``/sport`` (otherwise) is followed. For each linked page
    that has an ``<h1>``, the text of all ``<p>`` elements is joined with
    single spaces and kept if it is longer than ``min_words`` words.

    Args:
        category_url: URL of the index page to harvest links from.
        label: Value stored under the ``"category"`` key of every record.
        max_articles: Upper bound on the number of records returned. A value
            of zero or less returns an empty list without any network access.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause between consecutive article requests.
        min_words: Pages whose text has this many words or fewer are dropped.

    Returns:
        A list of ``{"category": label, "text": content}`` dictionaries, at
        most ``max_articles`` long.

    Errors are reported with ``print`` rather than raised: a failing index
    page yields an empty list, a failing article page is skipped.
    """
    articles = []
    if max_articles <= 0:
        return articles

    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(category_url, headers=headers, timeout=timeout)
        # Do not parse a 4xx/5xx error page as if it were the index.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping {category_url}: {e}")
        return articles

    soup = BeautifulSoup(response.text, 'html.parser')

    # Links are selected by site section prefix only: "/news" for the politics
    # index (this also matches non-politics news links), "/sport" otherwise.
    link_prefix = "/news" if "politics" in category_url else "/sport"
    links = soup.select(f'a[href^="{link_prefix}"]')

    # Seed with the index page so a link back to it is not stored as an article.
    seen_urls = {category_url.rstrip("/")}
    is_first_request = True

    for link in links:
        href = link.get('href')
        if not href:
            continue

        # Drop any "#fragment" so the same page is not fetched twice.
        href = href.split("#", 1)[0]

        # Build full URL if relative
        article_url = "https://www.bbc.com" + href if href.startswith("/") else href

        if article_url.rstrip("/") in seen_urls:
            continue
        seen_urls.add(article_url.rstrip("/"))

        # Polite delay before every request after the first, including after
        # pages that were skipped or failed.
        if not is_first_request:
            time.sleep(delay)
        is_first_request = False

        try:
            art_resp = requests.get(article_url, headers=headers, timeout=timeout)
            art_resp.raise_for_status()
            art_soup = BeautifulSoup(art_resp.text, 'html.parser')

            # Pages without a headline are not treated as articles.
            if art_soup.find('h1') is None:
                continue

            # Content paragraphs
            content = " ".join(p.get_text(strip=True) for p in art_soup.find_all('p'))

            # Only keep articles with enough words
            if len(content.split()) > min_words:
                articles.append({"category": label, "text": content})

            # Stop if enough articles collected
            if len(articles) >= max_articles:
                break

        except Exception as e:
            # Best effort: one bad page must not abort the whole crawl.
            print(f"Error scraping {article_url}: {e}")

    return articles

# ----------------------
# Scrape Data
# ----------------------
sports_data = scrape_bbc("https://www.bbc.com/sport", "sport", max_articles=50)
politics_data = scrape_bbc("https://www.bbc.com/news/politics", "politics", max_articles=50)

# Fail early with a clear message: an empty category would otherwise surface
# later as a KeyError on df['category'] or as a one-class training set.
for category_name, category_records in (("sport", sports_data), ("politics", politics_data)):
    if not category_records:
        raise RuntimeError(f"No articles were scraped for category '{category_name}'.")

# Combine and Save
df = pd.DataFrame(sports_data + politics_data)
# Identical texts would leak between the train and test splits, so keep one copy.
df = df.drop_duplicates(subset="text").reset_index(drop=True)
df.to_csv("news_dataset.csv", index=False)

print("Data collected:", df['category'].value_counts())
print(df.head())

# ----------------------
# Train/Test Split
# ----------------------
# Stratify on the label so train and test keep the same class proportions;
# with a dataset this small an unstratified split can be noticeably skewed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    test_size=0.3,
    random_state=42,
    stratify=df['category'],
)

# Vectorize Text: the vocabulary is learned from the training split only and
# then reused unchanged for the test split.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naïve Bayes Model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_test_vec)

# ----------------------
# Evaluation
# ----------------------
# One label order shared by the matrix and the report so their rows line up.
class_labels = ['politics', 'sport']
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_labels))
print("Classification Report:\n", classification_report(y_test, y_pred, labels=class_labels))


Data collected: category
sport       50
politics    50
Name: count, dtype: int64
  category                                               text
0    sport  'World-class potential' - Sesko joins Man Utd ...
1    sport  Tavernier rescues 10-man Rangers against Dunde...
2    sport  The Hundred: Fire chasing 164 to beat Spirit W...
3    sport  Norris and Piastri 'will not properly fall out...
4    sport  Watch: England start strongly against France i...
Accuracy: 96.67%
Confusion Matrix:
 [[12  1]
 [ 0 17]]
Classification Report:
               precision    recall  f1-score   support

    politics       1.00      0.92      0.96        13
       sport       0.94      1.00      0.97        17

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



In [None]:
# Show how many articles each category contributed.
category_counts = df['category'].value_counts()
print("Data collected:", category_counts)

# Preview up to the first 100 rows of the combined dataset.
preview_rows = df.head(100)
print(preview_rows)


Data collected: category
sport       50
politics    50
Name: count, dtype: int64
    category                                               text
0      sport  'World-class potential' - Sesko joins Man Utd ...
1      sport  Tavernier rescues 10-man Rangers against Dunde...
2      sport  The Hundred: Fire chasing 164 to beat Spirit W...
3      sport  Norris and Piastri 'will not properly fall out...
4      sport  Watch: England start strongly against France i...
..       ...                                                ...
95  politics  Senior government figures believe they are on ...
96  politics  One of the major reasons why Britain's prime m...
97  politics  Who is in charge? You might think the answer s...
98  politics  "There's only one relationship that really mat...
99  politics  By the time polls closed at 10pm on 4 July 202...

[100 rows x 2 columns]


In [None]:
# ----------------------
# User Input Prediction
# ----------------------
# Reads text until the user types 'exit' and classifies each entry with the
# already fitted `vectorizer` and `model`.
while True:
    try:
        user_input = input("Enter a news headline or article (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: leave the loop quietly.
        break

    if user_input.lower() == "exit":
        break

    # Nothing to classify on a blank line; ask again.
    if not user_input:
        continue

    # Convert to vector using the same vocabulary
    user_vec = vectorizer.transform([user_input])

    # An all-zero vector means no word of the input is in the training
    # vocabulary, so any prediction would not be based on the text at all.
    if user_vec.nnz == 0:
        print("\nNo known words in the input; cannot predict a category.\n")
        continue

    # Predict
    prediction = model.predict(user_vec)[0]
    print(f"\nPredicted Category: {prediction}\n")


Enter a news headline or article (or type 'exit' to quit): The Scourge of ‘Spot‑Fixing’ Is Coming for American Sports

Predicted Category: sport

Enter a news headline or article (or type 'exit' to quit): Trump says he will meet Putin in Alaska next Friday

Predicted Category: politics

Enter a news headline or article (or type 'exit' to quit): exit
