# 1. Dataset Preparation: 

1.1. Download and extract the Reuters News Dataset:

In [32]:
import nltk

# Fetch the Reuters corpus before touching the corpus reader.
nltk.download('reuters')
from nltk.corpus import reuters

# Partition the corpus file ids by their "train" / "test" prefix.
train_docs = [fid for fid in reuters.fileids() if fid.startswith("train")]
test_docs = [fid for fid in reuters.fileids() if fid.startswith("test")]

# Raw text for every document, paired with a single label per document:
# the first entry of the category list returned for that file id.
train_data = list(map(reuters.raw, train_docs))
train_labels = [reuters.categories(fid)[0] for fid in train_docs]
test_data = list(map(reuters.raw, test_docs))
test_labels = [reuters.categories(fid)[0] for fid in test_docs]

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Owaise\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [33]:
! pip install nltk
! pip install scikit-learn

^C




1.2. Preprocess the text data by removing stop words, stemming, and lemmatizing:

In [34]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [35]:
# Tokenizer models required by nltk.word_tokenize, which preprocess() calls.
# Without them a fresh environment fails with LookupError during tokenization.
nltk.download('punkt')
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt';
# fetching both keeps the notebook runnable across versions — confirm against
# the installed NLTK version.
nltk.download('punkt_tab')
# Stop-word list and WordNet data used by the stop-word filter and lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owaise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owaise\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Derive the training and testing file-id lists from the id prefixes.
train_docs = [fid for fid in reuters.fileids() if fid.startswith("train")]
test_docs = [fid for fid in reuters.fileids() if fid.startswith("test")]

In [63]:
# Raw document text plus one label per document (the first listed category).
train_data = list(map(reuters.raw, train_docs))
train_labels = [reuters.categories(fid)[0] for fid in train_docs]
test_data = list(map(reuters.raw, test_docs))
test_labels = [reuters.categories(fid)[0] for fid in test_docs]

In [38]:
# Shared preprocessing resources used by preprocess().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Kept as a set so stop-word membership checks are hash lookups.
stop_words = set(stopwords.words('english'))

In [39]:
def preprocess(text):
    """Normalize a raw document into a space-separated string of tokens.

    Steps, in order: lowercase the text and tokenize it with
    ``nltk.word_tokenize``; keep only purely alphabetic tokens; drop tokens
    present in ``stop_words``; stem each survivor with ``stemmer``; then
    lemmatize the resulting stem with ``lemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        The processed tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    normalized = []
    for word in nltk.word_tokenize(text.lower()):
        # Skip punctuation/numeric tokens and stop words.
        if not word.isalpha() or word in stop_words:
            continue
        # The lemmatizer is applied to the stem, not to the original word.
        normalized.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return " ".join(normalized)

In [40]:
# Run every raw document through preprocess(), keeping the original order.
train_data_processed = list(map(preprocess, train_data))
test_data_processed = list(map(preprocess, test_data))

1.3. Split the training data into training and validation sets in a ratio of 80:20 (the Reuters test split is kept as the held-out test set):

In [41]:
# Hold out 20% of the preprocessed training documents (and their labels) as a
# validation set; the fixed random_state makes the split repeatable.
(
    train_data_split,
    val_data_split,
    train_labels_split,
    val_labels_split,
) = train_test_split(
    train_data_processed,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# 2. Feature Extraction: 

2.1. Convert the preprocessed text data into numerical vectors using one of the following feature extraction techniques: Bag of Words, TF-IDF, or Word Embeddings.

### Feature extraction using TF-IDF with scikit-learn:

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only, and
# vectorize that split in the same call.
train_vectors = tfidf_vectorizer.fit_transform(train_data_split)

# Project the validation and test documents with the already-fitted
# vectorizer (no refitting).
val_vectors, test_vectors = (
    tfidf_vectorizer.transform(docs)
    for docs in (val_data_split, test_data_processed)
)

# 3. Model Implementation: 

Define the model parameters

input_size = ...  # Input size (e.g., number of features or word embeddings dimension)


In [43]:
# Input feature dimension: the number of terms in the fitted TF-IDF vocabulary.
input_size = len(tfidf_vectorizer.vocabulary_)

Hidden size (hidden_size): 

This represents the number of units or dimensions in the hidden state of the recurrent layer (the same value is used for both the Simple RNN and the LSTM below). Typically, values ranging from 32 to 512 are used.

In [45]:
# Width of the recurrent hidden state; passed to both model constructors below.
hidden_size = 64

Number of layers (num_layers): 

This determines the depth of the LSTM network. More layers can capture complex dependencies but can also lead to increased training time and overfitting.

Start with a small number of layers (e.g., 1 or 2) and gradually increase it if necessary.

In [46]:
# Intended number of stacked recurrent layers.
# NOTE(review): the model instantiations in the visible cells do not pass this value.
num_layers = 1

Output size (output_size): 

This corresponds to the number of classes or categories in your classification task.

In [48]:
# Number of distinct labels present in train_labels.
# NOTE(review): labels that appear only in test_labels are not counted — confirm this is intended.
output_size = len(set(train_labels))

3.1. Implement Simple RNN:

In [49]:
import torch
import torch.nn as nn

# Define the Simple RNN model
class SimpleRNN(nn.Module):
    """Vanilla (Elman) RNN followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of output logits (one per class).
        num_layers: Number of stacked recurrent layers. Defaults to 1, which
            reproduces the original single-layer model.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Compute class logits from the last time step of the RNN output.

        Args:
            x: Tensor of shape (batch, seq_len, input_size), or a 2-D tensor of
                shape (batch, input_size) holding one feature vector per
                sample (e.g. a dense TF-IDF row per document).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # A 2-D input would otherwise make the 3-index slice below fail with
        # IndexError, so treat each row as a sequence of length 1.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.rnn(x)
        # Keep only the final time step's hidden output for classification.
        return self.fc(out[:, -1, :])


In [55]:
# Create an instance of the Simple RNN model using the sizes defined in the
# cells above (TF-IDF vocabulary size, hidden width, number of training labels).
model_rnn = SimpleRNN(input_size, hidden_size, output_size)

In [54]:
# Display the module summary (sub-layers and their dimensions).
model_rnn

SimpleRNN(
  (rnn): RNN(15751, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=74, bias=True)
)

3.2. Implement LSTM:

In [51]:
import torch
import torch.nn as nn

# Define the LSTM model
class LSTMModel(nn.Module):
    """LSTM followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of output logits (one per class).
        num_layers: Number of stacked LSTM layers. Defaults to 1, which
            reproduces the original single-layer model.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Compute class logits from the last time step of the LSTM output.

        Args:
            x: Tensor of shape (batch, seq_len, input_size), or a 2-D tensor of
                shape (batch, input_size) holding one feature vector per
                sample (e.g. a dense TF-IDF row per document).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # A 2-D input would otherwise make the 3-index slice below fail with
        # IndexError, so treat each row as a sequence of length 1.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        # Keep only the final time step's hidden output for classification.
        return self.fc(out[:, -1, :])


In [52]:
# Create an instance of the LSTM model with the same sizes used for the
# Simple RNN instance.
model_lstm = LSTMModel(input_size, hidden_size, output_size)

In [53]:
# Display the module summary (sub-layers and their dimensions).
model_lstm

LSTMModel(
  (lstm): LSTM(15751, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=74, bias=True)
)

# 4. Model Comparison:

### 4.1. Compare the performance of both models based on accuracy and loss values

Training and Evaluation for Simple RNN: