<a href="https://colab.research.google.com/github/Mananpatel25/nlp-assignments/blob/main/NLP_HWK4_PART_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Archive locations and the directories they are unpacked into
train_zip_path = '/content/20news-bydate-train.zip'
test_zip_path = '/content/20news-bydate-test.zip'
train_extract_path = '/content/20news_train'
test_extract_path = '/content/20news_test'

# Unpack each archive into its own directory (train first, then test)
for archive_path, target_dir in (
    (train_zip_path, train_extract_path),
    (test_zip_path, test_extract_path),
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)

# Load data
def load_data(base_path):
    """Read a directory of per-category text files into parallel lists.

    Every immediate subdirectory of ``base_path`` is treated as one category,
    and its name becomes the label of each regular file directly inside it.
    Categories and the files within each category are visited in sorted name
    order, so the returned lists are identical from run to run.

    Args:
        base_path: Directory whose subdirectories are the categories.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists, where ``texts[i]``
        is the full content of one file and ``labels[i]`` is the name of the
        category directory it was read from.

    Raises:
        OSError: If ``base_path`` or a file below it cannot be listed or read.
    """
    texts, labels = [], []
    for category in sorted(os.listdir(base_path)):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue  # stray files next to the category folders are ignored
        # os.listdir returns names in arbitrary order; sorting keeps the
        # document order (and anything derived from it) reproducible.
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            if not os.path.isfile(file_path):
                continue  # a nested directory cannot be opened as a document
            # latin-1 maps every byte to a code point, so decoding never fails
            with open(file_path, 'r', encoding='latin-1') as file:
                texts.append(file.read())
            labels.append(category)
    return texts, labels

# Read both splits from the directories the archives were unpacked into
train_dir = os.path.join(train_extract_path, '20news-bydate-train')
test_dir = os.path.join(test_extract_path, '20news-bydate-test')
train_texts, train_labels = load_data(train_dir)
test_texts, test_labels = load_data(test_dir)

# Map category names to integer ids; the encoder is fitted on the training
# labels only and then applied to both splits
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)
num_classes = len(label_encoder.classes_)

# Tokenize and pad sequences
# vocab_size is shared by the tokenizer and the embedding layer: the tokenizer
# only emits ids below num_words, and the embedding table needs one row per
# possible id, so the two values must stay in sync.
vocab_size = 20000
max_length = 500

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)  # vocabulary comes from the training split only
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
# Every document becomes exactly max_length ids: longer ones lose their tail,
# shorter ones are zero-padded at the end.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Build CNN model
model = keras.Sequential([
    # Declare the input shape explicitly instead of passing the deprecated
    # `input_length` argument to Embedding.
    keras.Input(shape=(max_length,), dtype='int32'),
    keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse variant of the loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for five epochs. The test split is passed as validation data, so the
# val_* metrics printed per epoch are measured on the test set.
model.fit(
    train_padded,
    train_labels_encoded,
    validation_data=(test_padded, test_labels_encoded),
    batch_size=32,
    epochs=5,
)

# Predict on the test set: take the highest-probability class per document
class_probabilities = model.predict(test_padded)
y_pred = np.argmax(class_probabilities, axis=1)

# Per-class precision / recall / F1, labelled with the original category names
report = classification_report(
    test_labels_encoded,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)


Epoch 1/5




354/354 ━━━━━━━━━━━━━━━━━━━━ 89s 245ms/step - accuracy: 0.2405 - loss: 2.5925 - val_accuracy: 0.7076 - val_loss: 1.0398
Epoch 2/5
354/354 ━━━━━━━━━━━━━━━━━━━━ 140s 241ms/step - accuracy: 0.8844 - loss: 0.4563 - val_accuracy: 0.7975 - val_loss: 0.7166
Epoch 3/5
354/354 ━━━━━━━━━━━━━━━━━━━━ 132s 212ms/step - accuracy: 0.9892 - loss: 0.0670 - val_accuracy: 0.8172 - val_loss: 0.6754
Epoch 4/5
354/354 ━━━━━━━━━━━━━━━━━━━━ 91s 239ms/step - accuracy: 0.9989 - loss: 0.0146 - val_accuracy: 0.8205 - val_loss: 0.6827
Epoch 5/5
354/354 ━━━━━━━━━━━━━━━━━━━━ 142s 239ms/step - accuracy: 0.9988 - loss: 0.0102 - val_accuracy: 0.8165 - val_loss: 0.7203
236/236 ━━━━━━━━━━━━━━━━━━━━ 11s 48ms/step
                          precision    recall  f1-score   support

             alt.atheism       0.83      0.