<a href="https://colab.research.google.com/github/Mananpatel25/nlp-assignments/blob/main/NLP_HWK4_PART_1ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import zipfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Define paths
# Locations of the zipped 20 Newsgroups "bydate" train/test archives.
# NOTE(review): assumes both archives were already uploaded to /content
# (the Colab working directory) before this cell runs - confirm.
train_zip_path = "/content/20news-bydate-train.zip"
test_zip_path = "/content/20news-bydate-test.zip"
# Destination directories the archives are unpacked into.
train_extract_path = "/content/20news_train"
test_extract_path = "/content/20news_test"

# Extract zip files
def extract_zip(zip_path, extract_path):
    """Unpack every member of the zip archive at zip_path into extract_path.

    The archive is opened read-only and closed again once extraction
    finishes (or fails). Returns None.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_path)

# Unpack the training archive, then the test archive, into their own folders
extract_zip(zip_path=train_zip_path, extract_path=train_extract_path)
extract_zip(zip_path=test_zip_path, extract_path=test_extract_path)

# Load data
def load_data(data_path):
    """Read every document under data_path, labelled by its category folder.

    Each immediate sub-directory of data_path is treated as one category;
    every file directly inside it becomes one document whose label is the
    sub-directory name. Entries of data_path that are not directories are
    ignored.

    Args:
        data_path: Directory containing one sub-directory per category.

    Returns:
        A (texts, labels) tuple of two equally long lists, ordered by
        category name and then by file name.
    """
    texts, labels = [], []
    for category in sorted(os.listdir(data_path)):
        category_path = os.path.join(data_path, category)
        if not os.path.isdir(category_path):
            continue
        # os.listdir returns entries in arbitrary order; sort the file names
        # as well so the returned document order is reproducible.
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            try:
                # latin1 maps every byte to a character, so decoding cannot
                # fail; only filesystem errors are expected here.
                with open(file_path, 'r', encoding='latin1') as file:
                    content = file.read()
            except OSError as e:
                # Best effort: report the unreadable entry and carry on.
                print(f"Skipping {file_path} due to error: {e}")
                continue
            texts.append(content)
            labels.append(category)
    return texts, labels

# Read the extracted train and test splits into text/label lists
X_train, y_train = load_data(
    data_path=os.path.join(train_extract_path, "20news-bydate-train"),
)
X_test, y_test = load_data(
    data_path=os.path.join(test_extract_path, "20news-bydate-test"),
)

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training texts only, and the test texts are transformed with that fit.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes: fit on the training features, predict the test
# set, then print the per-class precision/recall/F1 report.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)
y_pred_nb = nb_model.predict(X=X_test_tfidf)
print("Naive Bayes Results:")
print(classification_report(y_true=y_test, y_pred=y_pred_nb))

# Logistic Regression: same features and evaluation as above; the solver
# is allowed up to 1000 iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=X_train_tfidf, y=y_train)
y_pred_lr = lr_model.predict(X=X_test_tfidf)
print("Logistic Regression Results:")
print(classification_report(y_true=y_test, y_pred=y_pred_lr))


Naive Bayes Results:
                          precision    recall  f1-score   support

             alt.atheism       0.74      0.66      0.70       319
           comp.graphics       0.63      0.72      0.67       389
 comp.os.ms-windows.misc       0.71      0.75      0.73       394
comp.sys.ibm.pc.hardware       0.63      0.71      0.67       392
   comp.sys.mac.hardware       0.79      0.76      0.78       385
          comp.windows.x       0.79      0.75      0.77       395
            misc.forsale       0.80      0.84      0.82       390
               rec.autos       0.83      0.86      0.85       396
         rec.motorcycles       0.84      0.91      0.88       398
      rec.sport.baseball       0.90      0.90      0.90       397
        rec.sport.hockey       0.89      0.96      0.93       399
               sci.crypt       0.91      0.90      0.91       396
         sci.electronics       0.73      0.61      0.66       393
                 sci.med       0.89      0.72      0.7