<a href="https://colab.research.google.com/github/MahdiTheGreat/Intro-to-language-modeling/blob/main/modified_intro_to_language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0: Preparations

In [None]:
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git
%cd Intro-to-language-modeling

Cloning into 'Intro-to-language-modeling'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 64 (delta 34), reused 2 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 30.34 MiB | 14.13 MiB/s, done.
Resolving deltas: 100% (34/34), done.
/content/Intro-to-language-modeling


In [None]:
import sklearn

In [None]:
!pip install ipdb
!pip install -U spacy
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl.metadata (14 kB)
Collecting jedi>=0.16 (from ipython>=7.31.1->ipdb)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, ipdb
Successfully installed ipdb-0.13.13 jedi-0.19.2
Collecting spacy
  Downloading spacy-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.

In [None]:
import spacy
import torch
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from tqdm import tqdm

In [None]:
# Set random seed for reproducibility
def set_seed(seed=2024):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(1998)

In [None]:
# Helper function to plot the training metrics

def plot_training_metrics(train_acc, val_acc, train_loss, title, save_path):
    # Ensure that all input lists have the same length
    assert len(train_acc) == len(val_acc) == len(train_loss), "All input histories must have the same length."

    epochs = range(1, len(train_acc) + 1)

    # Create the metrics DataFrame
    df_metrics = pd.DataFrame({
        'Epoch': epochs,
        'Training Accuracy (%)': train_acc,
        'Validation Accuracy (%)': val_acc,
        'Training Loss': train_loss
    })

    # Initialize the plot
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot Training and Validation Accuracy on ax1
    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy (%)', color=color)
    ax1.plot(df_metrics['Epoch'], df_metrics['Training Accuracy (%)'], label='Train Acc', color='tab:blue')
    ax1.plot(df_metrics['Epoch'], df_metrics['Validation Accuracy (%)'], label='Val Acc', color='tab:cyan')
    ax1.tick_params(axis='y', labelcolor=color)

    # Create a second y-axis for Training Loss
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Loss', color=color)
    ax2.plot(df_metrics['Epoch'], df_metrics['Training Loss'], label='Train Loss', color='tab:red')
    ax2.tick_params(axis='y', labelcolor=color)

    # Combine legends from both axes
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

    # Set plot title and layout
    plt.title(title)
    plt.tight_layout()

    # Save and display the plot
    plt.savefig(save_path)
    plt.show()

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cuda


# Step 1 & 2

In [None]:
dataset='lmdemo'
zip_file = f"{dataset}.zip"
!unzip -q $zip_file
!rm $zip_file

In [None]:
training_set=open(f'{dataset}/train.txt','r',encoding='utf-8').read()
val_set=open(f'{dataset}/val.txt','r',encoding='utf-8').read()

In [None]:
# Tokenize data
nlp = spacy.load("en_core_web_sm")



In [None]:
example_filepath="example.txt"

In [None]:
import spacy
from collections import Counter
import json

from enum import Enum, auto

class Special_tokens(Enum):
    BEGINNING = "BEGINNING"
    END = "END"
    UNKNOWN = "UNKOWN"

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

class VocabularyBuilder:
    def __init__(self, max_voc_size=None):
        self.max_voc_size = max_voc_size
        self.str_to_int = {}
        self.int_to_str = {}
        self.special_tokens = [token.value for token in Special_tokens]
        self.token_counter = None

    def get_token_counter(self,filepath,nlp,encoding):

     buffer = ""  # Buffer to store partial sentences between lines
     token_counter = Counter()

     def sent_processor(sent,complete=True):
      tokens=[]
      if complete: tokens.append(nlp(Special_tokens.BEGINNING.value)[0])  # Add "BEGINNING" at the start of each sentence
      tokens.extend([token for token in sent])  # Add sentence tokens
      if complete: tokens.append(nlp(Special_tokens.END.value)[0])  # Add "END" at the end of each sentence
      for token in tokens:
       if not token.is_space:
        if token.text not in self.special_tokens:
         token_counter[token.text] += 1
        else:
         token_counter[token.text.lower()] += 1


     with open(filepath, 'r') as file:
      lines = [line for line in file]
      for line in tqdm(lines, desc="Processing Lines for tokens", unit="Lines"):
          # Add line to buffer and process with spaCy
          buffer += " " + line.strip()
          doc = nlp(buffer)
          # Extract complete sentences
          sentences = list(doc.sents)
          for i, sent in enumerate(sentences):
              # If it's not the last sentence, we print it as it's complete
              if i < len(sentences) - 1:
                  print(sent)
                  sent_processor(sent)
              else:
                  # If it's the last sentence, store it in the buffer in case it's incomplete
                  buffer = sent.text
                  # Process sentences and identify complete sentences
          for sent in doc.sents:
              if sent.end_char < len(buffer):
                  print(sent)
                  sent_processor(sent)
      # Process any remaining content in the buffer
      doc = nlp(buffer)
      for sent in doc.sents:
       print(sent)
       sent_processor(sent)
     return token_counter

    def build_vocabulary(self, filepath, nlp,token_counter_savepath=None,token_counter_loadpath=None,encoding="utf-8"):

        # Tokenize text and count tokens
        if token_counter_loadpath is not None:
         with open(token_counter_loadpath, "r") as file:
            self.token_counter = Counter(json.load(file))
        else:
         self.token_counter =self.get_token_counter(filepath=filepath,nlp=nlp,encoding=encoding)

        # Start vocabulary with special tokens
        for idx, token in enumerate(self.special_tokens):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        # Select the most common tokens, considering max_voc_size - len(special_tokens)
        if self.max_voc_size is None:
            max_words = len(self.token_counter) - len(self.special_tokens)
            self.max_voc_size = max_words + len(self.special_tokens)
        else:
         max_words = self.max_voc_size - len(self.special_tokens)
        most_common_tokens = self.token_counter.most_common(max_words)

        for idx, (token, _) in enumerate(most_common_tokens, start=len(self.special_tokens)):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        # Save to a JSON file
        if token_counter_savepath is not None:
         with open(token_counter_savepath, "w") as file:
             json.dump(self.token_counter, file)


    def get_token_id(self, token):
        # Return the integer ID for a given token
        token=token.lower() if token not in self.special_tokens else token
        return self.str_to_int.get(token, self.str_to_int[Special_tokens.UNKNOWN.value])

    def get_token_str(self, token_id):
        # Return the original token string for a given integer ID
        return self.int_to_str.get(token_id, Special_tokens.UNKNOWN.value)

    def sanity_check(self):
        # Check vocabulary size
        assert len(self.str_to_int) <= self.max_voc_size, "Vocabulary size exceeds max_voc_size."

        # Check special tokens exist and are unique
        for token in self.special_tokens:
            assert token in self.str_to_int, f"Missing special token: {token}"

        # Check if highly frequent words are included and rare ones are not
        common_words = ["the", "and"]
        rare_words = ["cuboidal", "epiglottis"]

        for word in common_words:
            assert word in self.str_to_int, f"Common word '{word}' not in vocabulary."

        for word in rare_words:
            assert word not in self.str_to_int, f"Rare word '{word}' should not be in vocabulary."

        # Check that mapping back and forth works for a test word
        test_word = "The"
        token_id = self.get_token_id(test_word)
        assert self.get_token_str(token_id) == test_word.lower(), "Round-trip token mapping failed."

        print("Sanity check passed!")

token_counter_filepath="token_counter.json"
vocab_builder = VocabularyBuilder()
vocab_builder.build_vocabulary(filepath=example_filepath, nlp=nlp,token_counter_savepath=token_counter_filepath)

# Example mappings
print("str_to_int:", vocab_builder.str_to_int)
print("int_to_str:", vocab_builder.int_to_str)
print("vocabulary size: ",len(vocab_builder.token_counter))

# Convert a token to integer ID and back to string
token_id = vocab_builder.get_token_id("example")
print("Token ID for 'example':", token_id)
print("Original token from ID:", vocab_builder.get_token_str(token_id))

Processing Lines for tokens: 100%|██████████| 5/5 [00:00<00:00, 31.69Lines/s]

 Anatomy Anatomy (Greek anatomē, “dissection”) is the branch of biology concerned with the study of the structure of organisms and their parts.  
Anatomy is a branch of natural science dealing with the structural organization of living things.  
It is an old science, having its beginnings in prehistoric times.  
Anatomy is inherently tied to embryology, comparative anatomy, evolutionary biology, and phylogeny, as these are the processes by which anatomy is generated over immediate (embryology) and long (evolution) timescales.  
Human anatomy is one of the basic essential sciences of medicine.
The discipline of anatomy is divided into macroscopic and microscopic anatomy.  
Macroscopic anatomy, or gross anatomy, is the examination of an animal's body parts using unaided eyesight.  
Gross anatomy also includes the branch of superficial anatomy.  
Human anatomy is one of the basic essential sciences of medicine.
The discipline of anatomy is divided into macroscopic and microscopic anatomy.




In [None]:
token_counter_filepath="token_counter.json"
vocab_builder = VocabularyBuilder()
vocab_builder.build_vocabulary(filepath=example_filepath, nlp=nlp,token_counter_loadpath=token_counter_filepath)

# Example mappings
print("str_to_int:", vocab_builder.str_to_int)
print("int_to_str:", vocab_builder.int_to_str)
print("vocabulary size: ",len(vocab_builder.token_counter))


# Convert a token to integer ID and back to string
token_id = vocab_builder.get_token_id("example")
print("Token ID for 'example':", token_id)
print("Original token from ID:", vocab_builder.get_token_str(token_id))

str_to_int: {'BEGINNING': 0, 'END': 1, 'UNKOWN': 2, 'of': 3, 'the': 4, 'anatomy': 5, 'beginning': 6, '.': 7, 'end': 8, ',': 9, 'is': 10, 'and': 11, 'Anatomy': 12, '(': 13, ')': 14, 'branch': 15, 'study': 16, 'in': 17, 'biology': 18, 'with': 19, 'parts': 20, 'science': 21, 'an': 22, 'embryology': 23, 'as': 24, 'Human': 25, 'one': 26, 'basic': 27, 'essential': 28, 'sciences': 29, 'medicine': 30, 'The': 31, 'discipline': 32, 'divided': 33, 'into': 34, 'macroscopic': 35, 'microscopic': 36, 'also': 37, 'Greek': 38, 'anatomē': 39, '“': 40, 'dissection': 41, '”': 42, 'concerned': 43, 'structure': 44, 'organisms': 45, 'their': 46, 'a': 47, 'natural': 48, 'dealing': 49, 'structural': 50, 'organization': 51, 'living': 52, 'things': 53, 'It': 54, 'old': 55, 'having': 56, 'its': 57, 'beginnings': 58, 'prehistoric': 59, 'times': 60, 'inherently': 61, 'tied': 62, 'to': 63, 'comparative': 64, 'evolutionary': 65, 'phylogeny': 66, 'these': 67, 'are': 68, 'processes': 69, 'by': 70, 'which': 71, 'generat

In [None]:
# Perform sanity check
vocab_builder.sanity_check()

Sanity check passed!


In [None]:
import csv
class TrainingDataPreparer:
    def __init__(self, vocab_builder, nlp, context_window_size=3, chunk_size=1024):
        self.vocab_builder = vocab_builder
        self.context_window_size = context_window_size
        self.chunk_size = chunk_size
        self.nlp = nlp

    def encode_token(self, token):
        token_id = self.vocab_builder.get_token_id(token.text)
        return token_id if token_id != self.vocab_builder.get_token_id(Special_tokens.UNKNOWN.value) else None

    def prepare_training_data(self, input_file, output_file):
        with open(input_file, "r") as infile, open(output_file, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([f"Token_{i+1}" for i in range(self.context_window_size)] + ["Target"])

            # Initialize the beginning padding tokens
            padded_tokens = [self.vocab_builder.get_token_id(Special_tokens.BEGINNING.value)] * self.context_window_size
            first_chunk = True
            file_size = len(infile.read())  # Get the size of the file to calculate progress
            infile.seek(0)  # Reset the file pointer to the beginning after reading the file size

            # Use tqdm for reading chunks
            with tqdm(total=file_size, unit="B", unit_scale=True, desc="Processing file") as pbar:
             while True:
              chunk = infile.read(self.chunk_size)
              if not chunk:
                  break

              # Tokenize chunk into sentences
              doc = self.nlp(chunk)
              sentences = list(doc.sents)

              for sentence in sentences:
                  # Process sentence and convert to token IDs, skipping unknowns and spaces
                  sentence_token_ids = [
                      self.encode_token(token) for token in sentence if self.encode_token(token) is not None
                  ]

                  if first_chunk and sentence_token_ids:
                      padded_tokens += sentence_token_ids
                      first_chunk = False
                  else:
                      # Add only the sentence tokens from subsequent sentences
                      padded_tokens.extend(sentence_token_ids)

                  # Add END token at the end of each sentence
                  padded_tokens.append(self.vocab_builder.get_token_id(Special_tokens.END.value))

                  # Generate context-target sequences
                  for i in range(len(padded_tokens) - self.context_window_size):
                      context = padded_tokens[i:i + self.context_window_size]
                      target = padded_tokens[i + self.context_window_size]
                      writer.writerow(context + [target])

              # Retain only the last context window tokens for the next chunk
              padded_tokens = padded_tokens[-self.context_window_size:]

        print("Training data preparation complete.")

    def print_csv_as_words(self, csv_file):
           """
           Reads a CSV file with token IDs, decodes them to words, and prints each sequence.
           """
           with open(csv_file, "r") as file:
               reader = csv.reader(file)
               headers = next(reader)  # Skip the header

               for row in reader:
                   context_ids = row[:-1]  # All columns except the last one are context
                   target_id = row[-1]  # Last column is the target

                   # Convert token IDs to words
                   context_words = [self.vocab_builder.get_token_str(int(token_id)) for token_id in context_ids]
                   target_word = self.vocab_builder.get_token_str(int(target_id))

                   # Print context and target as words
                   print("Context:", context_words, "-> Target:", target_word)


data_preparer = TrainingDataPreparer(vocab_builder=vocab_builder,nlp=nlp, context_window_size=3)

input_file = "example.txt"
output_file = "training_sequences.csv"

# Prepare training data
data_preparer.prepare_training_data(input_file, output_file)


Processing file:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Training data preparation complete.





In [None]:
training_data=pd.read_csv("training_sequences.csv")
print(training_data)

     Token_1  Token_2  Token_3  Target
0          0        0        0       5
1          0        0        5       5
2          0        5        5      13
3          5        5       13      39
4          5       13       39       9
..       ...      ...      ...     ...
933       11       37       17       4
934       37       17        4      16
935       17        4       16       3
936        4       16        3       7
937       16        3        7       1

[938 rows x 4 columns]


In [None]:
data_preparer.print_csv_as_words("training_sequences.csv")

Context: ['BEGINNING', 'BEGINNING', 'BEGINNING'] -> Target: anatomy
Context: ['BEGINNING', 'BEGINNING', 'anatomy'] -> Target: anatomy
Context: ['BEGINNING', 'anatomy', 'anatomy'] -> Target: (
Context: ['anatomy', 'anatomy', '('] -> Target: anatomē
Context: ['anatomy', '(', 'anatomē'] -> Target: ,
Context: ['(', 'anatomē', ','] -> Target: “
Context: ['anatomē', ',', '“'] -> Target: dissection
Context: [',', '“', 'dissection'] -> Target: ”
Context: ['“', 'dissection', '”'] -> Target: )
Context: ['dissection', '”', ')'] -> Target: is
Context: ['”', ')', 'is'] -> Target: the
Context: [')', 'is', 'the'] -> Target: branch
Context: ['is', 'the', 'branch'] -> Target: of
Context: ['the', 'branch', 'of'] -> Target: biology
Context: ['branch', 'of', 'biology'] -> Target: concerned
Context: ['of', 'biology', 'concerned'] -> Target: with
Context: ['biology', 'concerned', 'with'] -> Target: the
Context: ['concerned', 'with', 'the'] -> Target: study
Context: ['with', 'the', 'study'] -> Target: of
Con

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class TorchDataset(Dataset):
    def __init__(self, csv_file, context_window_size):
        # Read CSV file using pandas
        self.data = pd.read_csv(csv_file)
        self.context_window_size = context_window_size

    def __len__(self):
        # The length of the dataset is the number of rows in the CSV
        return len(self.data)

    def __getitem__(self, idx):
        # Get a specific row from the data and convert it to a tensor
        row = self.data.iloc[idx]

        # The context is all tokens except for the last one (target)
        context = torch.tensor(row[:-1].values, dtype=torch.long)

        # The target is the last token in the row
        target = torch.tensor(row[-1], dtype=torch.long)

        return context, target

def TorchDataLoader(training_sequences_csv, batch_size, context_window_size):
    # Create the Dataset instance
    dataset = TorchDataset(training_sequences_csv, context_window_size)

    # Create the DataLoader instance to handle batching
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

# Example usage:
# Define your CSV file, batch size, and context window size
csv_file = 'training_sequences.csv'
batch_size = 32
context_window_size = 10

# Create the DataLoader
trainloader = TorchDataLoader(csv_file, batch_size, context_window_size)

# Step 3

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


# EarlyStopping class remains the same
class EarlyStopping:
    def __init__(self, patience=5, delta=0, verbose=False, path='checkpoint.pth'):
        self.patience = patience  # Number of epochs to wait for improvement
        self.delta = delta  # Minimum change to qualify as an improvement
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.path = path  # Path to save the best model

    def __call__(self, val_loss, model):
        if self.best_score is None:
            self.best_score = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss < self.best_score - self.delta:
            self.best_score = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Save model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class SimpleANN(nn.Module):

    def __init__(self, vocab_size, embed_size, layer_sizes,activation=nn.ReLU,last_layer_activation=nn.Softmax,dropout=0):

        super(SimpleANN, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.layers = nn.ModuleList()

        for i in range(len(layer_sizes)-2):
          self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
          self.layers.append(nn.Dropout(dropout))
          self.layers.append(activation())

        self.layers.append(nn.Linear(layer_sizes[-2], layer_sizes[-1]))
        if last_layer_activation is not None:
         self.layers.append(nn.Dropout(dropout))
         self.layers.append(last_layer_activation())

    def forward(self, x):
        # Assuming x is a batch of word indices (e.g., [batch_size])
        embeddings = self.embeddings(x)  # Get word embeddings for each word in the batch

        # Flatten the input embeddings (if necessary, depending on your task)
        x = embeddings.view(-1, np.prod(embeddings.shape[1:]))  # Flatten for fully connected layers

        #x = x.view(-1, np.prod(x.shape[1:])) # Flatten the input
        x = x.float()
        for layer in self.layers:
            x = layer(x)
        return x


In [None]:
model = SimpleANN(layer_sizes=[48, 64, 65000], vocab_size=65000, embed_size=16)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

patience = 5
early_stopping = EarlyStopping(patience=patience, verbose=True)

number_of_epochs = 10

for epoch in range(number_of_epochs):
    print(f"--- Epoch {epoch+1}/{number_of_epochs} ---")
    for batch_context, batch_target in tqdm(trainloader):
        #FORWARD PASS:
        X = batch_context
        Y = batch_target
        X, Y = X.to(device), Y.to(device)
        outputs = model(X)  # Model output for X
        loss = criterion(outputs, Y) # Compute the loss between model output and Y

        #BACKWARD PASS (updating the model parameters):
        optimizer.zero_grad()  # Clear gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model parameters

    print(f"Training perplexity: {np.exp(loss.item()):.4f}")

    # Validation loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # No gradient computation for validation
        for inputs, targets in valloader:
            X = inputs
            Y = targets
            X, Y = X.to(device), Y.to(device)
            outputs = model(X)
            loss = criterion(outputs, Y)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(valloader)  # Average validation loss
    print(f"Validation perplexity: {np.exp(avg_val_loss):.6f}")

    # Call early stopping after each epoch
    early_stopping(avg_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Optionally, load the best model after training
model.load_state_dict(torch.load('checkpoint.pth'))

--- Epoch 1/10 ---


  target = torch.tensor(row[-1], dtype=torch.long)
  return self._call_impl(*args, **kwargs)
100%|██████████| 30/30 [00:01<00:00, 24.47it/s]

Training perplexity: 65000.0803





NameError: name 'valloader' is not defined

# Step 4

In [None]:
test_sentences = ["This is very",
                  "A tall building",
                  "The next sentence",
                  "Not a big"]

encoded_sentences = []
for sentence in test_sentences:
  encoded_sentences.append([vocab_builder.get_token_id(sentence) for word in sentence.split(" ")])

output = model(torch.tensor(encoded_sentences)).detach().numpy()

# Predict
predictions = np.argmax(output, axis=1)

for prediction in predictions:
  print(vocab_builder.get_token_str(prediction))

In [None]:
perplexity = np.exp(avg_val_loss)
print(perplexity)

In [None]:
def nearest_neighbors(emb, voc, word, n_neighbors=5):

    # Look up the embedding for the test word.
    test_emb = emb.weight[voc.get_token_id(word)]

    # We'll use a cosine similarity function to find the most similar words.
    sim_func = nn.CosineSimilarity(dim=1)
    cosine_scores = sim_func(test_emb, emb.weight)

    # Find the positions of the highest cosine values.
    near_nbr = cosine_scores.topk(n_neighbors+1)
    topk_cos = near_nbr.values[1:]
    topk_indices = near_nbr.indices[1:]
    # NB: the first word in the top-k list is the query word itself!
    # That's why we skip the first position in the code above.

    # Finally, map word indices back to strings, and put the result in a list.
    return [ (voc.get_token_str(ix.item()), cos.item()) for ix, cos in zip(topk_indices, topk_cos) ]

nearest_neighbors(model.embeddings, vocab_builder, "sweden")
nearest_neighbors(model.embeddings, vocab_builder, "2005")

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
def plot_embeddings_pca(emb, voc, words):
    vectors = np.vstack([emb.weight[voc.get_token_id(w)].cpu().detach().numpy() for w in words])
    vectors -= vectors.mean(axis=0)
    twodim = TruncatedSVD(n_components=2).fit_transform(vectors)
    plt.figure(figsize=(5,5))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.02, y, word)
    plt.axis('off')
    plt.show()

plot_embeddings_pca(model.embeddings, vocab_builder, ['sweden', 'denmark', 'europe', 'africa', 'london', 'stockholm', 'large', 'small', 'great', 'black', '3', '7', '10', 'seven', 'three', 'ten', '1984', '2005', '2010'])
