In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize; 'punkt' is kept so that older releases keep working.
# A resource that the installed NLTK version does not know about is simply
# not fetched.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

In [None]:
# Load the dataset: read the news-category CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='Data/News_Categories.csv')

In [None]:
# Data Preprocessing
# Lazily-built resources shared by every preprocess_text call. They are
# created on first use so that defining the function does not itself require
# the NLTK corpora to be present.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that are English stopwords are dropped, and
    the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` (e.g. ``None`` or
        ``NaN``) is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when ``text`` is
        not a string or no token survives filtering.
    """
    global _STOP_WORDS, _LEMMATIZER

    if not isinstance(text, str):
        return ''

    # Build the stopword set and the lemmatizer once and reuse them: this
    # function is applied row by row, and neither depends on the input text.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    return ' '.join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in _STOP_WORDS
    )

In [None]:
# Apply preprocessing
# Missing headline/description values become empty strings so the two fields
# can be concatenated without producing NaN.
for text_column in ('headline', 'short_description'):
    df[text_column] = df[text_column].fillna('')

# Join headline and description with a single space, then clean the result.
combined_text = df['headline'] + ' ' + df['short_description']
df['processed_text'] = combined_text.apply(preprocess_text)

# Keep only the rows whose cleaned text is non-empty.
df = df.loc[df['processed_text'].ne('')]

In [None]:
# Statistical Analysis: number of rows per category.
print("Category Distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

In [None]:
# Bar chart of the number of rows in each category.
fig, ax = plt.subplots(figsize=(12, 6))
df['category'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Distribution of News Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Category names are drawn vertically.
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Prepare data for modeling: cleaned text is the input, category the target.
X, y = df['processed_text'], df['category']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Vectorization
# TF-IDF features capped at 5000 terms. The vocabulary is fitted on the
# training split only; the test split is transformed with that vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Model training and evaluation
# The stochastic estimators are seeded (42, the same seed used for the
# train/test split) so that re-running the notebook reproduces the same
# scores. The forest additionally trains its trees on all available cores.
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1)
}

# One fixed, sorted label list shared by every confusion matrix, so rows and
# columns line up across models and the heatmap axes can show category names.
class_labels = sorted(set(y_train) | set(y_test))

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix (rows = true category, columns = predicted category)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

In [None]:
# Feature Importance (for Random Forest)
# Pair each vocabulary term of the vectorizer with the importance score the
# fitted forest assigned to it.
rf_model = models['Random Forest']
feature_importance = rf_model.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Rank the (importance, term) pairs from highest to lowest; keep the first 20.
ranked_pairs = sorted(zip(feature_importance, feature_names), reverse=True)
top_features = ranked_pairs[:20]

top_terms = [term for _, term in top_features]
top_scores = [score for score, _ in top_features]

plt.figure(figsize=(10, 8))
plt.barh(top_terms, top_scores)
plt.title('Top 20 Important Features')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("\nAnalysis complete.")