<a href="https://colab.research.google.com/github/MahdiTheGreat/Intro-to-language-modeling/blob/main/neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ipdb
!pip install -U spacy
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl.metadata (14 kB)
Collecting jedi>=0.16 (from ipython>=7.31.1->ipdb)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, ipdb
Successfully installed ipdb-0.13.13 jedi-0.19.1
Collecting spacy
  Downloading spacy-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.

In [None]:
import spacy
import torch
import matplotlib.pyplot as plt
import ipdb
import numpy as np
import random
import pandas as pd
# %pdb on

In [None]:
# Helper function to plot the training metrics

def plot_training_metrics(train_acc, val_acc, train_loss, title, save_path):
    # Ensure that all input lists have the same length
    assert len(train_acc) == len(val_acc) == len(train_loss), "All input histories must have the same length."

    epochs = range(1, len(train_acc) + 1)

    # Create the metrics DataFrame
    df_metrics = pd.DataFrame({
        'Epoch': epochs,
        'Training Accuracy (%)': train_acc,
        'Validation Accuracy (%)': val_acc,
        'Training Loss': train_loss
    })

    # Initialize the plot
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot Training and Validation Accuracy on ax1
    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy (%)', color=color)
    ax1.plot(df_metrics['Epoch'], df_metrics['Training Accuracy (%)'], label='Train Acc', color='tab:blue')
    ax1.plot(df_metrics['Epoch'], df_metrics['Validation Accuracy (%)'], label='Val Acc', color='tab:cyan')
    ax1.tick_params(axis='y', labelcolor=color)

    # Create a second y-axis for Training Loss
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Loss', color=color)
    ax2.plot(df_metrics['Epoch'], df_metrics['Training Loss'], label='Train Loss', color='tab:red')
    ax2.tick_params(axis='y', labelcolor=color)

    # Combine legends from both axes
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

    # Set plot title and layout
    plt.title(title)
    plt.tight_layout()

    # Save and display the plot
    plt.savefig(save_path)
    plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


class SimpleANN(nn.Module):

    def __init__(self,layer_sizes,activation=nn.ReLU,last_layer_activation=nn.Softmax,dropout=0):

        super(SimpleANN, self).__init__()
        self.layers = nn.ModuleList()

        for i in range(len(layer_sizes)-2):
          self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
          self.layers.append(nn.Dropout(dropout))
          self.layers.append(activation())

        self.layers.append(nn.Linear(layer_sizes[-2], layer_sizes[-1]))
        if last_layer_activation is not None:
         self.layers.append(nn.Dropout(dropout))
         self.layers.append(last_layer_activation())

    def forward(self, x):
        x = x.view(-1, np.prod(x.shape[1:])) # Flatten the input
        x = x.float()
        for layer in self.layers:
            x = layer(x)
        return x

In [None]:
# Set random seed for reproducibility
def set_seed(seed=2024):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(1998)

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cpu


In [None]:
from copy import deepcopy
def dense_arch_builder(input_size,scale_factor=0,hidden_layers_num=0,repeat=0,output_size=1):
  layer_sizes=[input_size]

  if scale_factor!=0:

   if scale_factor>1:
    for i in range(hidden_layers_num):
     layer_sizes.append(layer_sizes[-1]*scale_factor)
    while layer_sizes[-1]<output_size:
     layer_sizes.append(layer_sizes[-1]*scale_factor)

   elif scale_factor==1:
     for i in range(2,hidden_layers_num+2):
      layer_sizes.append(layer_sizes[0]*i)
     i+=1
     while layer_sizes[-1]<output_size:
      layer_sizes.append(layer_sizes[0]*i)
      i+=1

   mirrored_layer_sizes=deepcopy(layer_sizes)
   mirrored_layer_sizes.reverse()
   mirrored_layer_sizes=mirrored_layer_sizes[1:-1]

   for i in range(repeat):
    layer_sizes.append(layer_sizes[-1])

   if output_size>0:
    layer_sizes+=mirrored_layer_sizes
    downscale_factor=scale_factor if scale_factor>1 else 2

    while layer_sizes[-1]!=output_size:
     if layer_sizes[-1]//downscale_factor>=output_size:
      layer_sizes.append(layer_sizes[-1]//downscale_factor)
     else:
      layer_sizes.append(output_size)

  else:
    downscale_factor = (input_size / output_size) ** (1 / hidden_layers_num)
    for i in range(hidden_layers_num):
        layer_sizes.append(int(input_size / (downscale_factor ** i)))
    layer_sizes.append(output_size)

  return layer_sizes


In [None]:
layer_sizes=dense_arch_builder(input_size=3,scale_factor=2,hidden_layers_num=3,repeat=0,output_size=10)
print(layer_sizes)

[3, 6, 12, 24, 12, 6, 10]


In [None]:
## Creating a tensor dataset ##
from torch.utils.data import DataLoader, TensorDataset
def TorchDataLoader(training_sequences, batch_size):
  context_words = [item[0] for item in training_sequences]  # List of [context]
  target_words = [item[1] for item in training_sequences]   # List of target words

  # Convert lists to tensors
  context_tensor = torch.tensor(context_words, dtype=torch.long)  # Shape: (num_samples, 3)
  target_tensor = torch.tensor(target_words, dtype=torch.long)    # Shape: (num_samples,)

  # Create a TensorDataset
  dataset = TensorDataset(context_tensor, target_tensor)

  # Create a DataLoader for batching
  batch_size = 4
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  # Iterate over the DataLoader
  for batch_context, batch_target in dataloader:
      print("Batch context:", batch_context)
      print("Batch target:", batch_target)
      # You can now use batch_context and batch_target for model training

  return dataloader

In [None]:
dummy_text = [
    ([2, 2, 2], 7), ([2, 2, 7], 8), ([2, 7, 8], 3), ([7, 8, 3], 9),
    ([8, 3, 9], 4), ([3, 9, 4], 5), ([9, 4, 5], 2), ([2, 2, 2], 2),
    ([2, 2, 2], 12), ([2, 2, 12], 4), ([2, 12, 4], 5), ([12, 4, 5], 13),
    ([4, 5, 13], 3), ([5, 13, 3], 14), ([13, 3, 14], 15), ([3, 14, 15], 2),
    ([2, 2, 2], 6), ([2, 2, 6], 16), ([2, 6, 16], 17), ([6, 16, 17], 18),
    ([16, 17, 18], 19), ([17, 18, 19], 20), ([18, 19, 20], 6), ([19, 20, 6], 21),
    ([20, 6, 21], 22), ([6, 21, 22], 2)
]

dataloader = TorchDataLoader(dummy_text, 4)

Batch context: tensor([[ 2,  2,  2],
        [ 2,  2, 12],
        [ 3,  9,  4],
        [12,  4,  5]])
Batch target: tensor([ 2,  4,  5, 13])
Batch context: tensor([[18, 19, 20],
        [16, 17, 18],
        [ 2,  2,  7],
        [ 2,  2,  2]])
Batch target: tensor([ 6, 19,  8,  6])
Batch context: tensor([[ 8,  3,  9],
        [ 6, 21, 22],
        [ 7,  8,  3],
        [ 2,  2,  2]])
Batch target: tensor([ 4,  2,  9, 12])
Batch context: tensor([[ 4,  5, 13],
        [17, 18, 19],
        [13,  3, 14],
        [ 2,  6, 16]])
Batch target: tensor([ 3, 20, 15, 17])
Batch context: tensor([[ 5, 13,  3],
        [ 6, 16, 17],
        [20,  6, 21],
        [ 3, 14, 15]])
Batch target: tensor([14, 18, 22,  2])
Batch context: tensor([[19, 20,  6],
        [ 9,  4,  5],
        [ 2,  7,  8],
        [ 2,  2,  2]])
Batch target: tensor([21,  2,  3,  7])
Batch context: tensor([[ 2, 12,  4],
        [ 2,  2,  6]])
Batch target: tensor([ 5, 16])


In [None]:
layer_sizes = [3,64,24]

model = SimpleANN(layer_sizes=layer_sizes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

number_of_epochs = 30

for epoch in range(number_of_epochs):
    for batch_context, batch_target in dataloader:
        print(dataloader)
        #FORWARD PASS:
        X = batch_context
        Y = batch_target
        X, Y = X.to(device), Y.to(device)
        outputs = model(X)  # Model output for X
        loss = criterion(outputs, Y) # Compute the loss between model output and Y

        #BACKWARD PASS (updating the model parameters):
        optimizer.zero_grad()  # Clear gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model parameters

    print(f"Epoch [{epoch+1}/{number_of_epochs}], Loss: {loss.item():.4f}")

<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
Epoch [1/30], Loss: 3.1898
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
<torch.utils.data.dataloader.DataLoader object at 0x7cc5998036a0>
Epoch [2/30], Loss: 3.2218
<torch.utils.data.data

# Step 4

Predicting the next word for four sentences:

In [None]:
test_sentences = torch.tensor([[0, 13, 12],
                              [0, 8, 9],
                              [8, 7, 6],
                              [5, 4, 5]])
output = model(test_sentences).detach().numpy()

# Predict
predictions = np.argmax(output, axis=1)

print(predictions)

[0, 13, 12]
5
[0, 8, 9]
3
[8, 7, 6]
13
[5, 4, 5]
13
[]


Quantitative evaluation

In [None]:
# Predict validation data
dummy_val = [
    ([2, 2, 2], 7), ([2, 2, 7], 8), ([2, 7, 8], 3), ([7, 8, 3], 9),
    ([8, 3, 9], 4), ([3, 9, 4], 5), ([9, 4, 5], 2), ([2, 2, 2], 2),
    ([2, 2, 2], 12), ([2, 2, 12], 4), ([2, 12, 4], 5), ([12, 4, 5], 13),
    ([4, 5, 13], 3), ([5, 13, 3], 14), ([13, 3, 14], 15), ([3, 14, 15], 2),
    ([2, 2, 2], 6), ([2, 2, 6], 16), ([2, 6, 16], 17), ([6, 16, 17], 18),
    ([16, 17, 18], 19), ([17, 18, 19], 20), ([18, 19, 20], 6), ([19, 20, 6], 21),
    ([20, 6, 21], 22), ([6, 21, 22], 2)
]
val_dataloader = TorchDataLoader(dummy_val, 4)
loss = []
for batch_context, batch_target in val_dataloader:
        #FORWARD PASS:
        X = batch_context
        Y = batch_target
        X, Y = X.to(device), Y.to(device)
        outputs = (model(X))  # Model output for X
        loss.append((criterion(outputs, Y)).item()) # Compute the loss between model output and Y

# Compute perplexity
perplexity = np.exp(np.mean(loss))
print(perplexity)

Batch context: tensor([[ 6, 21, 22],
        [18, 19, 20],
        [ 2,  2,  6],
        [ 9,  4,  5]])
Batch target: tensor([ 2,  6, 16,  2])
Batch context: tensor([[ 6, 16, 17],
        [ 5, 13,  3],
        [ 2,  2, 12],
        [20,  6, 21]])
Batch target: tensor([18, 14,  4, 22])
Batch context: tensor([[17, 18, 19],
        [ 2,  2,  7],
        [ 8,  3,  9],
        [ 4,  5, 13]])
Batch target: tensor([20,  8,  4,  3])
Batch context: tensor([[ 2,  2,  2],
        [19, 20,  6],
        [ 2,  6, 16],
        [16, 17, 18]])
Batch target: tensor([12, 21, 17, 19])
Batch context: tensor([[ 2,  2,  2],
        [ 2,  7,  8],
        [12,  4,  5],
        [ 2, 12,  4]])
Batch target: tensor([ 2,  3, 13,  5])
Batch context: tensor([[ 2,  2,  2],
        [ 3,  9,  4],
        [ 3, 14, 15],
        [ 7,  8,  3]])
Batch target: tensor([6, 5, 2, 9])
Batch context: tensor([[13,  3, 14],
        [ 2,  2,  2]])
Batch target: tensor([15,  7])
19.017954652765063


  return self._call_impl(*args, **kwargs)


Inspecting the word embeddings

In [None]:
def nearest_neighbors(emb, voc, inv_voc, word, n_neighbors=5):

    # Look up the embedding for the test word.
    test_emb = emb.weight[voc[word]]

    # We'll use a cosine similarity function to find the most similar words.
    sim_func = nn.CosineSimilarity(dim=1)
    cosine_scores = sim_func(test_emb, emb.weight)

    # Find the positions of the highest cosine values.
    near_nbr = cosine_scores.topk(n_neighbors+1)
    topk_cos = near_nbr.values[1:]
    topk_indices = near_nbr.indices[1:]
    # NB: the first word in the top-k list is the query word itself!
    # That's why we skip the first position in the code above.

    # Finally, map word indices back to strings, and put the result in a list.
    return [ (inv_voc[ix.item()], cos.item()) for ix, cos in zip(topk_indices, topk_cos) ]

nearest_neighbors("sweden")
nearest_neighbors("2005")

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
def plot_embeddings_pca(emb, inv_voc, words):
    vectors = np.vstack([emb.weight[inv_voc[w]].cpu().detach().numpy() for w in words])
    vectors -= vectors.mean(axis=0)
    twodim = TruncatedSVD(n_components=2).fit_transform(vectors)
    plt.figure(figsize=(5,5))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.02, y, word)
    plt.axis('off')

plot_embeddings_pca(model[0], prepr, ['sweden', 'denmark', 'europe', 'africa', 'london', 'stockholm', 'large', 'small', 'great', 'black', '3', '7', '10', 'seven', 'three', 'ten', '1984', '2005', '2010'])