In [1]:
!pip install nltk
!pip install numpy
!pip install torch
!pip install torchtext
!pip install pandas
!pip install scikit-learn
!pip install tqdm
!pip uninstall jax jaxlib --yes

[0m

In [2]:
import nltk
import torch
import pickle
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from collections import Counter
from scipy.sparse import lil_matrix
from torch.utils.data import Dataset
from sklearn.utils.extmath import randomized_svd
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

### Download NLTK Tokenizer

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Checking for Optimal Available Device

In [4]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device

device(type='mps')

## Read Data from CSV

In [7]:
trainDataPath = "train.csv"
df = pd.read_csv(trainDataPath)
df.head()

Unnamed: 0,Class Index,Description
0,3,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Reuters - Private investment firm Carlyle Grou...
2,3,Reuters - Soaring crude prices plus worries\ab...
3,3,Reuters - Authorities have halted oil export\f...
4,3,"AFP - Tearaway world oil prices, toppling reco..."


In [8]:
corpus = df["Description"].apply(nltk.word_tokenize)

### Extracting 40,000 Sentences

In [9]:
corpus = corpus[:40000]

In [10]:
UNKNOWN_TOKEN = "<UNK>"

## Build Dataset and Vocabulary

In [11]:
class EntityDataset(Dataset):
    def __init__(self, data, window_size, vocabulary=None):
        """Initialize the dataset."""
        self.sentences = data
        self.window_size = window_size

        if not isinstance(data, pd.Series):
            raise TypeError("Input data must be a Pandas Series.")

        if vocabulary is not None:
            self.vocabulary = vocabulary


        word_counter = Counter()
        for sentence in data:
            for word in sentence:
                word_counter[word] += 1

        # Replace infrequent words with <UNK>
        for i, sentence in enumerate(data):
            for j, (word) in enumerate(sentence):
                if word_counter[word] < 10:  # Cutoff Frequency
                    self.sentences[i][j] = UNKNOWN_TOKEN        

        if vocabulary is None:
            self.vocabulary = build_vocab_from_iterator(self.sentences)
            self.vocabulary.set_default_index(self.vocabulary[UNKNOWN_TOKEN])

        self.index_sentences = [[self.vocabulary[word] for word in sentence] for sentence in self.sentences]
        self.index_to_word = self.vocabulary.get_itos()
        self.positive_sample, self.negative_sample, self.target_words = self.generate_sampling()

    def __len__(self):
        """Returns number of datapoints."""
        return len(self.target_words)

    def __getitem__(self, index):
        """Get the datapoint at `index`."""
        return torch.tensor(self.positive_sample[index]).to(device), torch.tensor(self.negative_sample[index]).to(device), torch.tensor(self.target_words[index]).to(device)

    def get_index_to_word(self, index):
        """Get the word corresponding to a given index."""
        return self.index_to_word[index]

    def get_word_to_index(self, word):
        """Get the word corresponding to a given index."""
        return self.vocabulary[word]

    def generate_sampling(self):
        """Sample positive and negative context words."""
        positive_sample = []
        negative_sample = []
        target_words = []
        window = self.window_size

        for sentence in tqdm(self.index_sentences, desc="Generating Positive & Negative Samples"):
            for i, target in enumerate(sentence):
                for k in range(max(0, i - window), min(len(sentence), i + window + 1)):
                    if k != i:
                        positive_sample.append(sentence[k])
                        target_words.append(target)

                        # Generate Negative Samples
                        negative_samples = []
                        while len(negative_samples) < 2 :
                            sample = random.randint(0, len(self.vocabulary) - 1)
                            if sample != target and sample not in sentence[max(0, i - window): min(len(sentence), i + window + 1)]:
                                negative_samples.append(sample)
                        negative_sample.append(negative_samples)

        return positive_sample,negative_sample, target_words

## Skip-Gram Model

In [12]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300):
        super(SkipGram, self).__init__()
        self.embedding_dim = embedding_dim
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self,positive_sample, negative_sample,target_words,learning_rate=0.01):

        target_embedding = self.target_embeddings(target_words)
        positive_embedding = self.context_embeddings(positive_sample)

        # Positive Sample
        dot_product = torch.bmm(positive_embedding.unsqueeze(1), target_embedding.unsqueeze(-1)).squeeze()
        probability = torch.sigmoid(dot_product)

        grad = (probability - 1).unsqueeze(-1) * positive_embedding
        loss = -torch.log(probability + 1e-10) 
        self.context_embeddings.weight.data[positive_sample] -= learning_rate * (probability - 1).unsqueeze(-1) * target_embedding

        num_negative_samples = negative_sample.shape[1]

        for i in range(num_negative_samples):
            reshaped_negative_sample = negative_sample[:, i:i+1]  # Reshape to (100, 1)
            embedding = self.context_embeddings(reshaped_negative_sample).squeeze(1)
            dot_product = torch.bmm(embedding.unsqueeze(1), target_embedding.unsqueeze(-1)).squeeze()
            probability = torch.sigmoid(dot_product)
            grad += (probability).unsqueeze(-1) * embedding
            loss += -torch.log(1 - probability + 1e-10) 
            self.context_embeddings.weight.data[reshaped_negative_sample.squeeze(1)] -= learning_rate * (probability).unsqueeze(-1) * target_embedding

        self.target_embeddings.weight.data[target_words] -= learning_rate * grad     

        total_loss = loss.mean() / 100
        return total_loss       

### Training the Skip-Gram Model

### Hyper-Parameters

In [13]:
context_windows = [3, 4, 5]
learning_rate = 0.1
num_epochs = 10

In [14]:
best_model = None
max_loss = 1000.0

In [13]:
for context_window in context_windows:
    dataset = EntityDataset(corpus, context_window)
    model = SkipGram(len(dataset.vocabulary), 300).to(device)


    dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        predictions = []
        targets = []

        with tqdm(total=len(dataloader), desc=f'Context Window: {context_window}, Epoch {epoch+1}/{num_epochs}', unit='batch') as pbar:
            for batch, (positive_sample,negative_sample, target_words) in enumerate(dataloader):
                model.train()

                loss = model(positive_sample.to(device), negative_sample.to(device),target_words.to(device))  # Get Loss from model
                total_loss += loss.item()

                pbar.set_postfix({'loss': total_loss / (batch + 1)})
                pbar.update(1)

        print(f'Context Window: {context_window}, Epoch: {epoch+1}, Loss: {total_loss / len(dataloader):.4f}')

        average_loss = total_loss / len(dataloader)

        # Update best model 
        if average_loss < max_loss:
            max_loss = average_loss
            best_model = model

# Save the best model
torch.save(best_model, 'best_model.pt')

Generating Positive & Negative Samples: 100%|██████████| 40000/40000 [01:27<00:00, 455.44it/s]
Context Window: 3, Epoch 1/10: 100%|██████████| 32110/32110 [12:29<00:00, 42.83batch/s, loss=0.05]


Context Window: 3, Epoch: 1, Loss: 0.0500


Context Window: 3, Epoch 2/10: 100%|██████████| 32110/32110 [12:41<00:00, 42.16batch/s, loss=0.0186]


Context Window: 3, Epoch: 2, Loss: 0.0186


Context Window: 3, Epoch 3/10: 100%|██████████| 32110/32110 [12:32<00:00, 42.66batch/s, loss=0.0132]


Context Window: 3, Epoch: 3, Loss: 0.0132


Context Window: 3, Epoch 4/10: 100%|██████████| 32110/32110 [12:33<00:00, 42.62batch/s, loss=0.0109]


Context Window: 3, Epoch: 4, Loss: 0.0109


Context Window: 3, Epoch 5/10: 100%|██████████| 32110/32110 [12:42<00:00, 42.13batch/s, loss=0.0097]


Context Window: 3, Epoch: 5, Loss: 0.0097


Context Window: 3, Epoch 6/10: 100%|██████████| 32110/32110 [12:46<00:00, 41.88batch/s, loss=0.00898]


Context Window: 3, Epoch: 6, Loss: 0.0090


Context Window: 3, Epoch 7/10: 100%|██████████| 32110/32110 [12:46<00:00, 41.87batch/s, loss=0.00852]


Context Window: 3, Epoch: 7, Loss: 0.0085


Context Window: 3, Epoch 8/10: 100%|██████████| 32110/32110 [12:50<00:00, 41.68batch/s, loss=0.00822]


Context Window: 3, Epoch: 8, Loss: 0.0082


Context Window: 3, Epoch 9/10: 100%|██████████| 32110/32110 [12:45<00:00, 41.97batch/s, loss=0.008]


Context Window: 3, Epoch: 9, Loss: 0.0080


Context Window: 3, Epoch 10/10: 100%|██████████| 32110/32110 [12:42<00:00, 42.12batch/s, loss=0.00783]


Context Window: 3, Epoch: 10, Loss: 0.0078


Generating Positive & Negative Samples: 100%|██████████| 40000/40000 [01:52<00:00, 354.98it/s]
Context Window: 4, Epoch 1/10: 100%|██████████| 42188/42188 [16:42<00:00, 42.08batch/s, loss=0.0446]


Context Window: 4, Epoch: 1, Loss: 0.0446


Context Window: 4, Epoch 2/10: 100%|██████████| 42188/42188 [16:40<00:00, 42.16batch/s, loss=0.0171]


Context Window: 4, Epoch: 2, Loss: 0.0171


Context Window: 4, Epoch 3/10: 100%|██████████| 42188/42188 [16:41<00:00, 42.13batch/s, loss=0.0126]


Context Window: 4, Epoch: 3, Loss: 0.0126


Context Window: 4, Epoch 4/10: 100%|██████████| 42188/42188 [16:38<00:00, 42.25batch/s, loss=0.0107]


Context Window: 4, Epoch: 4, Loss: 0.0107


Context Window: 4, Epoch 5/10: 100%|██████████| 42188/42188 [16:36<00:00, 42.32batch/s, loss=0.00972]


Context Window: 4, Epoch: 5, Loss: 0.0097


Context Window: 4, Epoch 6/10: 100%|██████████| 42188/42188 [16:36<00:00, 42.34batch/s, loss=0.00911]


Context Window: 4, Epoch: 6, Loss: 0.0091


Context Window: 4, Epoch 7/10: 100%|██████████| 42188/42188 [16:39<00:00, 42.20batch/s, loss=0.00871]


Context Window: 4, Epoch: 7, Loss: 0.0087


Context Window: 4, Epoch 8/10: 100%|██████████| 42188/42188 [16:43<00:00, 42.05batch/s, loss=0.00844]


Context Window: 4, Epoch: 8, Loss: 0.0084


Context Window: 4, Epoch 9/10: 100%|██████████| 42188/42188 [16:46<00:00, 41.92batch/s, loss=0.00825]


Context Window: 4, Epoch: 9, Loss: 0.0083


Context Window: 4, Epoch 10/10: 100%|██████████| 42188/42188 [16:46<00:00, 41.91batch/s, loss=0.0081]


Context Window: 4, Epoch: 10, Loss: 0.0081


Generating Positive & Negative Samples: 100%|██████████| 40000/40000 [02:17<00:00, 290.79it/s]
Context Window: 5, Epoch 1/10: 100%|██████████| 51953/51953 [20:39<00:00, 41.90batch/s, loss=0.0408]


Context Window: 5, Epoch: 1, Loss: 0.0408


Context Window: 5, Epoch 2/10: 100%|██████████| 51953/51953 [20:30<00:00, 42.23batch/s, loss=0.0161]


Context Window: 5, Epoch: 2, Loss: 0.0161


Context Window: 5, Epoch 3/10: 100%|██████████| 51953/51953 [20:27<00:00, 42.32batch/s, loss=0.0122]


Context Window: 5, Epoch: 3, Loss: 0.0122


Context Window: 5, Epoch 4/10: 100%|██████████| 51953/51953 [20:27<00:00, 42.31batch/s, loss=0.0106]


Context Window: 5, Epoch: 4, Loss: 0.0106


Context Window: 5, Epoch 5/10: 100%|██████████| 51953/51953 [20:25<00:00, 42.41batch/s, loss=0.00969]


Context Window: 5, Epoch: 5, Loss: 0.0097


Context Window: 5, Epoch 6/10: 100%|██████████| 51953/51953 [20:26<00:00, 42.36batch/s, loss=0.00917]


Context Window: 5, Epoch: 6, Loss: 0.0092


Context Window: 5, Epoch 7/10: 100%|██████████| 51953/51953 [20:23<00:00, 42.45batch/s, loss=0.00882]


Context Window: 5, Epoch: 7, Loss: 0.0088


Context Window: 5, Epoch 8/10: 100%|██████████| 51953/51953 [20:21<00:00, 42.54batch/s, loss=0.00858]


Context Window: 5, Epoch: 8, Loss: 0.0086


Context Window: 5, Epoch 9/10: 100%|██████████| 51953/51953 [20:18<00:00, 42.65batch/s, loss=0.00841]


Context Window: 5, Epoch: 9, Loss: 0.0084


Context Window: 5, Epoch 10/10: 100%|██████████| 51953/51953 [20:17<00:00, 42.68batch/s, loss=0.00828]

Context Window: 5, Epoch: 10, Loss: 0.0083





### Extracting the Embeddings

In [14]:
context_embeddings = best_model.context_embeddings.weight.data.cpu().numpy()
target_embeddings = best_model.target_embeddings.weight.data.cpu().numpy()

In [15]:
print(np.shape(context_embeddings))
print(np.shape(target_embeddings))

(10271, 300)
(10271, 300)


### Generate embeddings by Summing Context and Target Embeddings

In [16]:
word_embeddings = context_embeddings + target_embeddings

# Normalize the embeddings
word_embeddings /= np.linalg.norm(word_embeddings, axis=1, keepdims=True)
word_embeddings

array([[ 0.03290069, -0.09047558, -0.02650456, ..., -0.01732464,
        -0.10081857,  0.05301237],
       [ 0.04733349, -0.09536434, -0.01010186, ..., -0.01380922,
        -0.07548727,  0.03111346],
       [-0.0078561 , -0.05011781, -0.00989369, ..., -0.04713387,
        -0.11423495,  0.00470114],
       ...,
       [-0.00151321, -0.00382992,  0.02200773, ..., -0.06155645,
         0.09315622,  0.06318127],
       [ 0.1515942 , -0.00441719,  0.00960962, ..., -0.06585304,
         0.08873884, -0.05674504],
       [-0.11488904, -0.05626446, -0.04980468, ..., -0.03089284,
         0.05593587, -0.03656518]], dtype=float32)

In [17]:
np.shape(word_embeddings)

(10271, 300)

In [18]:
vocab_size = len(dataset.vocabulary)

word_embeddings_final = {}
for index in range(vocab_size):
    word_embeddings_final[dataset.get_index_to_word(index)] = word_embeddings[index]

word_embeddings = word_embeddings_final

In [19]:
word_embeddings

{'<UNK>': array([ 3.29006873e-02, -9.04755816e-02, -2.65045632e-02, -2.25856975e-02,
        -1.16214894e-01, -3.13257575e-02, -2.64157113e-02,  3.88094336e-02,
         1.91144980e-02,  1.01873679e-02,  7.02615157e-02, -6.14453070e-02,
        -1.52079970e-01,  6.47794157e-02, -6.99902028e-02, -4.95540723e-02,
        -1.34107634e-01,  4.59381603e-02, -8.72353017e-02,  4.12183069e-02,
        -2.61653755e-02,  2.38437522e-02, -1.75188154e-01,  2.21003424e-02,
         5.74601479e-02,  2.64663063e-02, -4.31910828e-02, -6.22191466e-02,
         1.63922850e-02, -3.40284333e-02, -1.12691019e-02,  9.77391750e-02,
         1.01151668e-01,  1.40777573e-01, -1.84631739e-02, -7.82908313e-03,
         1.03231575e-02, -6.97100759e-02,  9.23168438e-04, -1.48079684e-03,
         3.89396511e-02, -1.20637223e-01,  3.55775282e-02,  1.47097483e-02,
         6.76514357e-02,  2.18245890e-02,  9.94022191e-02, -2.33385451e-02,
        -1.30520025e-02,  6.62544668e-02,  5.80714410e-03,  5.66067882e-02,
   

### Saving the Embeddings and the Vocabulary

In [20]:
# Save vocabulary to a pickle file
with open('skip-gram-vocabulary.pkl', 'wb') as f:
    pickle.dump(dataset.vocabulary, f)

# Save word embeddings to a pickle file
with open('skip-gram-word-vectors.pkl', 'wb') as f:
    pickle.dump(word_embeddings, f)

torch.save(word_embeddings,"skip-gram-vocabulary.pt")
torch.save(dataset.vocabulary,"skip-gram-word-vectors.pt")

## Training RNN on Word Embeddings

### Extracting the Labels

In [39]:
labels = df["Class Index"][:len(corpus)].values - 1
labels

array([2, 2, 2, ..., 1, 0, 0])

### Generate Sentence Embeddings

In [40]:
def generate_sentence_embeddings(word_embeddings, sentences):
    sentence_embeddings = []
    for sentence in sentences:
        embeddings = [torch.tensor(word_embeddings[word]) for word in sentence if word in word_embeddings]
        if embeddings:
            # Average the embeddings to get the sentence embedding
            sentence_embedding = torch.mean(torch.stack(embeddings), dim=0)
            sentence_embeddings.append(sentence_embedding)
    return torch.stack(sentence_embeddings)

In [41]:
sentence_embeddings = generate_sentence_embeddings(word_embeddings,corpus)
print(np.shape(sentence_embeddings))

torch.Size([39999, 300])


In [43]:
labels = torch.Tensor(labels)
labels.shape

torch.Size([39999])

### Defining the LSTM Model

In [44]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.linear1 = nn.Linear(hidden_size, 128)  
        self.linear2 = nn.Linear(128, output_size) 

    def forward(self, x):
        out, _ = self.lstm(x.unsqueeze(1))  
        out = torch.relu(self.linear1(out[:, -1, :]))  
        out = self.linear2(out)
        return out

In [45]:
sentence_embeddings = sentence_embeddings.to(device)
labels = labels.to(device)

### Define Hyperparameters

In [46]:
input_size = sentence_embeddings.size(1) 
hidden_size = 256
output_size = 4  
learning_rate = 0.001
num_epochs = 25

### Create DataLoader

In [47]:
dataset = TensorDataset(sentence_embeddings, labels)
dataset

<torch.utils.data.dataset.TensorDataset at 0x2fc629ca0>

In [48]:
loader = DataLoader(dataset, batch_size=100, shuffle=True)
loader

<torch.utils.data.dataloader.DataLoader at 0x2fc629430>

### Initialize Model, Loss Function, and Optimizer

In [49]:
model = LSTMModel(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Training Loop

In [50]:
for epoch in range(num_epochs):
    running_loss = 0.0
    model.train()  # Set model to training mode
    for inputs, label in tqdm(loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):                
        label = label.type(torch.LongTensor)
        inputs, label = inputs.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(loader)}")

    # Evaluation on train set
    model.eval()  # Set model to evaluation mode
    train_preds = []
    train_label = []
    for inputs, label in loader:
        inputs, label = inputs.to(device), label.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        train_preds.extend(predicted.tolist())
        train_label.extend(label.tolist())

    train_accuracy = accuracy_score(train_label, train_preds)
    train_f1 = f1_score(train_label, train_preds, average='weighted', zero_division=0)
    train_precision = precision_score(train_label, train_preds, average='weighted', zero_division=0)
    train_recall = recall_score(train_label, train_preds, average='weighted', zero_division=0)
    train_confusion = confusion_matrix(train_label, train_preds)

    print("Train Dataset Metrics:")
    print(f"Accuracy: {train_accuracy}")
    print(f"F1 Score: {train_f1}")
    print(f"Precision: {train_precision}")
    print(f"Recall: {train_recall}")
    print("Confusion Matrix:")
    print(train_confusion)

print("Training complete")

                                                             

Epoch 1/25, Loss: 0.663850618749857
Train Dataset Metrics:
Accuracy: 0.8187704692617316
F1 Score: 0.8173933949254503
Precision: 0.8225304943955631
Recall: 0.8187704692617316
Confusion Matrix:
[[7893 1112  851  705]
 [ 197 9431   79  190]
 [ 297  375 7114 1591]
 [ 426  553  873 8312]]


                                                             

Epoch 2/25, Loss: 0.5236777817457914
Train Dataset Metrics:
Accuracy: 0.817870446761169
F1 Score: 0.8185176863573443
Precision: 0.8224176558630509
Recall: 0.817870446761169
Confusion Matrix:
[[9179  429  527  426]
 [1087 8350  109  351]
 [ 825   95 7232 1225]
 [1026  145 1040 7953]]


                                                             

Epoch 3/25, Loss: 0.4954723051190376
Train Dataset Metrics:
Accuracy: 0.8400710017750443
F1 Score: 0.8398846150141717
Precision: 0.8407644177709297
Recall: 0.8400710017750443
Confusion Matrix:
[[8576  616  654  715]
 [ 312 9202  133  250]
 [ 427  191 7429 1330]
 [ 494  279  996 8395]]


                                                             

Epoch 4/25, Loss: 0.4842077482491732
Train Dataset Metrics:
Accuracy: 0.8311457786444661
F1 Score: 0.8318645646323527
Precision: 0.8358107395950648
Recall: 0.8311457786444661
Confusion Matrix:
[[8706  472  912  471]
 [ 455 8819  365  258]
 [ 367  107 8019  884]
 [ 649  177 1637 7701]]


                                                             

Epoch 5/25, Loss: 0.47485834911465646
Train Dataset Metrics:
Accuracy: 0.8431710792769819
F1 Score: 0.8424303307161679
Precision: 0.8431896888401927
Recall: 0.8431710792769819
Confusion Matrix:
[[8721  701  670  469]
 [ 251 9376  131  139]
 [ 444  253 7680 1000]
 [ 601  356 1258 7949]]


                                                             

Epoch 6/25, Loss: 0.4670703833550215
Train Dataset Metrics:
Accuracy: 0.8395709892747318
F1 Score: 0.8399948965238722
Precision: 0.8469707645146917
Recall: 0.8395709892747318
Confusion Matrix:
[[8262  603  654 1042]
 [ 213 9102  123  459]
 [ 306  159 7221 1691]
 [ 288  194  685 8997]]


                                                             

Epoch 7/25, Loss: 0.4597069603204727
Train Dataset Metrics:
Accuracy: 0.8478211955298882
F1 Score: 0.8466824256143801
Precision: 0.848386042571287
Recall: 0.8478211955298882
Confusion Matrix:
[[8850  679  412  620]
 [ 262 9456   65  114]
 [ 539  327 7131 1380]
 [ 533  466  690 8475]]


                                                             

Epoch 8/25, Loss: 0.44986847989261153
Train Dataset Metrics:
Accuracy: 0.8322958073951848
F1 Score: 0.8326686473559082
Precision: 0.8437002022597336
Recall: 0.8322958073951848
Confusion Matrix:
[[7614  577  924 1446]
 [ 172 9097  187  441]
 [ 163  150 7774 1290]
 [ 151  203 1004 8806]]


                                                             

Epoch 9/25, Loss: 0.4472394424304366
Train Dataset Metrics:
Accuracy: 0.8450461261531538
F1 Score: 0.8444802867175745
Precision: 0.8505147418662361
Recall: 0.8450461261531538
Confusion Matrix:
[[9115  455  302  689]
 [ 388 9028   87  394]
 [ 791  146 6713 1727]
 [ 571  186  462 8945]]


                                                              

Epoch 10/25, Loss: 0.43727160383015873
Train Dataset Metrics:
Accuracy: 0.855246381159529
F1 Score: 0.854355452371812
Precision: 0.8560933166569389
Recall: 0.855246381159529
Confusion Matrix:
[[8727  749  508  577]
 [ 195 9486   70  146]
 [ 419  313 7366 1279]
 [ 441  388  705 8630]]


                                                              

Epoch 11/25, Loss: 0.42811433259397746
Train Dataset Metrics:
Accuracy: 0.8546213655341384
F1 Score: 0.8546696852418502
Precision: 0.8555848422000438
Recall: 0.8546213655341384
Confusion Matrix:
[[9067  444  568  482]
 [ 387 9105  195  210]
 [ 446  131 7964  836]
 [ 577  194 1345 8048]]


                                                              

Epoch 12/25, Loss: 0.42407161235809326
Train Dataset Metrics:
Accuracy: 0.858246456161404
F1 Score: 0.8575600390290171
Precision: 0.861253521387167
Recall: 0.858246456161404
Confusion Matrix:
[[8583  752  460  766]
 [ 166 9464   62  205]
 [ 378  285 7357 1357]
 [ 314  317  608 8925]]


                                                              

Epoch 13/25, Loss: 0.4205828637629747
Train Dataset Metrics:
Accuracy: 0.8613715342883572
F1 Score: 0.8613378089568515
Precision: 0.8615040612893982
Recall: 0.8613715342883572
Confusion Matrix:
[[9275  363  466  457]
 [ 477 9088  139  193]
 [ 550  132 7657 1038]
 [ 625  204  901 8434]]


                                                              

Epoch 14/25, Loss: 0.4119603930413723
Train Dataset Metrics:
Accuracy: 0.8650466261656541
F1 Score: 0.8649568374606497
Precision: 0.8670461461826524
Recall: 0.8650466261656541
Confusion Matrix:
[[8728  527  573  733]
 [ 218 9345   97  237]
 [ 314  184 7665 1214]
 [ 301  264  736 8863]]


                                                              

Epoch 15/25, Loss: 0.4019022246822715
Train Dataset Metrics:
Accuracy: 0.8644966124153104
F1 Score: 0.8640235193658753
Precision: 0.8655267632884529
Recall: 0.8644966124153104
Confusion Matrix:
[[9312  413  343  493]
 [ 466 9190   69  172]
 [ 697  180 7295 1205]
 [ 569  238  575 8782]]


                                                              

Epoch 16/25, Loss: 0.39621602587401866
Train Dataset Metrics:
Accuracy: 0.8584714617865447
F1 Score: 0.8585310254347185
Precision: 0.8614764638962132
Recall: 0.8584714617865447
Confusion Matrix:
[[9088  442  721  310]
 [ 354 9167  242  134]
 [ 324  133 8192  728]
 [ 606  210 1457 7891]]


                                                              

Epoch 17/25, Loss: 0.38831158250570297
Train Dataset Metrics:
Accuracy: 0.861046526163154
F1 Score: 0.8607440352033409
Precision: 0.8651472081104515
Recall: 0.861046526163154
Confusion Matrix:
[[8387  658  955  561]
 [ 162 9505  102  128]
 [ 139  229 8089  920]
 [ 251  369 1084 8460]]


                                                              

Epoch 18/25, Loss: 0.3827487313374877
Train Dataset Metrics:
Accuracy: 0.8714967874196855
F1 Score: 0.87126813993923
Precision: 0.872153002753742
Recall: 0.8714967874196855
Confusion Matrix:
[[9431  375  331  424]
 [ 396 9161  127  213]
 [ 628  131 7496 1122]
 [ 581  176  636 8771]]


                                                              

Epoch 19/25, Loss: 0.3780499527230859
Train Dataset Metrics:
Accuracy: 0.8713467836695917
F1 Score: 0.8705112439129572
Precision: 0.8715578112365757
Recall: 0.8713467836695917
Confusion Matrix:
[[9449  426  401  285]
 [ 309 9421   98   69]
 [ 522  192 7969  694]
 [ 672  365 1113 8014]]


                                                              

Epoch 20/25, Loss: 0.3698314280807972
Train Dataset Metrics:
Accuracy: 0.87627190679767
F1 Score: 0.8765663835030651
Precision: 0.8778144251693833
Recall: 0.87627190679767
Confusion Matrix:
[[8963  423  650  525]
 [ 237 9206  175  279]
 [ 261  111 8026  979]
 [ 342  164  803 8855]]


                                                              

Epoch 21/25, Loss: 0.3684497325122356
Train Dataset Metrics:
Accuracy: 0.8709467736693417
F1 Score: 0.8700813323697582
Precision: 0.8733122799072368
Recall: 0.8709467736693417
Confusion Matrix:
[[9167  559  304  531]
 [ 191 9477   30  199]
 [ 522  246 7174 1435]
 [ 455  242  448 9019]]


                                                              

Epoch 22/25, Loss: 0.35696240186691286
Train Dataset Metrics:
Accuracy: 0.8838970974274357
F1 Score: 0.8837627573492234
Precision: 0.8847388665748149
Recall: 0.8838970974274357
Confusion Matrix:
[[9125  451  461  524]
 [ 220 9395   79  203]
 [ 325  143 7816 1093]
 [ 359  213  573 9019]]


                                                              

Epoch 23/25, Loss: 0.3560157145187259
Train Dataset Metrics:
Accuracy: 0.8856721418035451
F1 Score: 0.8851286902913743
Precision: 0.8855566988648874
Recall: 0.8856721418035451
Confusion Matrix:
[[9253  526  437  345]
 [ 184 9561   58   94]
 [ 323  227 8083  744]
 [ 459  334  842 8529]]


                                                              

Epoch 24/25, Loss: 0.3454150556027889
Train Dataset Metrics:
Accuracy: 0.8850471261781545
F1 Score: 0.8848572773425559
Precision: 0.8859204953973905
Recall: 0.8850471261781545
Confusion Matrix:
[[9070  500  531  460]
 [ 164 9473   90  170]
 [ 259  146 7863 1109]
 [ 336  225  608 8995]]


                                                              

Epoch 25/25, Loss: 0.3405021871626377
Train Dataset Metrics:
Accuracy: 0.8903722593064827
F1 Score: 0.8900546313696643
Precision: 0.8915315986504361
Recall: 0.8903722593064827
Confusion Matrix:
[[9244  435  372  510]
 [ 186 9514   55  142]
 [ 321  183 7733 1140]
 [ 318  254  469 9123]]
Training complete


In [59]:
torch.save(model,"skip-gram-classification-model.pt")

## Testing

### Read Data from CSV

In [60]:
testDataPath = "test.csv"
df_test = pd.read_csv(testDataPath)
df_test.head()

Unnamed: 0,Class Index,Description
0,3,Unions representing workers at Turner Newall...
1,4,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,AP - A company founded by a chemistry research...
3,4,AP - It's barely dawn when Mike Fitzpatrick st...
4,4,AP - Southern California's smog-fighting agenc...


In [61]:
corpus_test = df_test["Description"].apply(nltk.word_tokenize)

In [62]:
corpus_test

0       [Unions, representing, workers, at, Turner, Ne...
1       [SPACE.com, -, TORONTO, ,, Canada, --, A, seco...
2       [AP, -, A, company, founded, by, a, chemistry,...
3       [AP, -, It, 's, barely, dawn, when, Mike, Fitz...
4       [AP, -, Southern, California, 's, smog-fightin...
                              ...                        
7595    [Ukrainian, presidential, candidate, Viktor, Y...
7596    [With, the, supply, of, attractive, pitching, ...
7597    [Like, Roger, Clemens, did, almost, exactly, e...
7598    [SINGAPORE, :, Doctors, in, the, United, State...
7599    [EBay, plans, to, buy, the, apartment, and, ho...
Name: Description, Length: 7600, dtype: object

### Extracting the Labels

In [63]:
labels_test = df_test["Class Index"].values - 1
labels_test

array([2, 3, 3, ..., 1, 2, 2])

### Generate Sentence Embeddings

In [64]:
sentence_embeddings_test = generate_sentence_embeddings(word_embeddings,corpus_test)
print(np.shape(sentence_embeddings_test))

torch.Size([7600, 300])


In [65]:
labels_test = torch.Tensor(labels_test)
print(np.shape(labels_test))

torch.Size([7600])


In [66]:
# Convert test data to PyTorch tensors
sentence_embeddings_test = sentence_embeddings_test.to(device)
labels_test = labels_test.to(device)

# Create DataLoader for test data
test_dataset = TensorDataset(sentence_embeddings_test, labels_test)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Evaluation loop
model.eval()  
test_preds = []
test_labels = []

with torch.no_grad(): 
    for inputs, label in test_loader:
        inputs, label = inputs.to(device), label.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)       
    
        test_preds.extend(predicted.tolist())
        test_labels.extend(label.tolist())

    test_accuracy = accuracy_score(test_labels, test_preds)
    test_f1 = f1_score(test_labels, test_preds, average='weighted')
    test_precision = precision_score(test_labels, test_preds, average='weighted')
    test_recall = recall_score(test_labels, test_preds, average='weighted')
    test_confusion = confusion_matrix(test_labels, test_preds)

    print("Test Dataset Metrics:")
    print(f"Accuracy: {test_accuracy}")
    print(f"F1 Score: {test_f1}")
    print(f"Precision: {test_precision}")
    print(f"Recall: {test_recall}")
    print("Confusion Matrix:")
    print(test_confusion)

Test Dataset Metrics:
Accuracy: 0.8398684210526316
F1 Score: 0.839065590449206
Precision: 0.840891304604194
Recall: 0.8398684210526316
Confusion Matrix:
[[1592   86  100  122]
 [  48 1792   29   31]
 [ 104   57 1412  327]
 [  81   82  150 1587]]
