In [1]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, classification_report


nltk.download('stopwords')

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    text = text.lower()                             # Lowercase
    text = "".join([ch for ch in text if ch not in string.punctuation])  # Remove punctuation
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]  # Remove stopwords and stem
    return " ".join(words)


# Load a subset for faster execution (e.g., 4 categories)
categories = ['sci.space', 'rec.autos', 'comp.graphics', 'talk.politics.mideast']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Apply preprocessing
processed_docs = [preprocess(doc) for doc in data.data]


# Convert to TF-IDF vectors
vectorizer = TfidfVectorizer(max_df=0.5, max_features=5000)
X_tfidf = vectorizer.fit_transform(processed_docs)

# Apply feature selection using chi-squared test
selector = SelectKBest(chi2, k=1000)
X_selected = selector.fit_transform(X_tfidf, data.target)


X_train, X_test, y_train, y_test = train_test_split(X_selected, data.target, test_size=0.2, random_state=42)

# Train classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=categories))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Classification Report:

                       precision    recall  f1-score   support

            sci.space       0.74      0.96      0.83       186
            rec.autos       0.95      0.81      0.87       221
        comp.graphics       0.97      0.83      0.90       206
talk.politics.mideast       0.90      0.92      0.91       165

             accuracy                           0.88       778
            macro avg       0.89      0.88      0.88       778
         weighted avg       0.89      0.88      0.88       778

