In [12]:
# Fetch the course repository and switch into it, so the relative paths used
# by later cells (e.g. the dataset archive) resolve inside the clone.
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git
%cd Intro-to-language-modeling

Cloning into 'Intro-to-language-modeling'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects:   3% (1/28)[Kremote: Counting objects:   7% (2/28)[Kremote: Counting objects:  10% (3/28)[Kremote: Counting objects:  14% (4/28)[Kremote: Counting objects:  17% (5/28)[Kremote: Counting objects:  21% (6/28)[Kremote: Counting objects:  25% (7/28)[Kremote: Counting objects:  28% (8/28)[Kremote: Counting objects:  32% (9/28)[Kremote: Counting objects:  35% (10/28)[Kremote: Counting objects:  39% (11/28)[Kremote: Counting objects:  42% (12/28)[Kremote: Counting objects:  46% (13/28)[Kremote: Counting objects:  50% (14/28)[Kremote: Counting objects:  53% (15/28)[Kremote: Counting objects:  57% (16/28)[Kremote: Counting objects:  60% (17/28)[Kremote: Counting objects:  64% (18/28)[Kremote: Counting objects:  67% (19/28)[Kremote: Counting objects:  71% (20/28)[Kremote: Counting objects:  75% (21/28)[Kremote: Counting objects:  78% (22/28)[

In [13]:
import sklearn

In [14]:
import os
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch

In [15]:
# Helper function to plot the training metrics

def plot_training_metrics(train_acc, val_acc, train_loss, title, save_path):
    """Plot per-epoch accuracies and training loss on one figure.

    The two accuracy curves share the left y-axis; the training loss is drawn
    against a twin y-axis on the right. The figure is saved and then shown.

    Parameters:
    - train_acc: training accuracy per epoch.
    - val_acc: validation accuracy per epoch.
    - train_loss: training loss per epoch.
    - title: title placed on the plot.
    - save_path: destination handed to ``plt.savefig``.

    Raises:
    - AssertionError: if the three histories differ in length.
    """
    n_epochs = len(train_acc)
    assert n_epochs == len(val_acc) == len(train_loss), "All input histories must have the same length."

    acc_colour, loss_colour = 'tab:blue', 'tab:red'

    # Gather everything in one frame so each curve is a named column
    history = pd.DataFrame({
        'Epoch': range(1, n_epochs + 1),
        'Training Accuracy (%)': train_acc,
        'Validation Accuracy (%)': val_acc,
        'Training Loss': train_loss,
    })
    epoch_axis = history['Epoch']

    # Left axis: both accuracy curves
    _, acc_ax = plt.subplots(figsize=(10, 6))
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)', color=acc_colour)
    acc_ax.plot(epoch_axis, history['Training Accuracy (%)'], label='Train Acc', color=acc_colour)
    acc_ax.plot(epoch_axis, history['Validation Accuracy (%)'], label='Val Acc', color='tab:cyan')
    acc_ax.tick_params(axis='y', labelcolor=acc_colour)

    # Right axis: training loss
    loss_ax = acc_ax.twinx()
    loss_ax.set_ylabel('Loss', color=loss_colour)
    loss_ax.plot(epoch_axis, history['Training Loss'], label='Train Loss', color=loss_colour)
    loss_ax.tick_params(axis='y', labelcolor=loss_colour)

    # A single legend covering the curves of both axes
    handles, labels = [], []
    for axis in (acc_ax, loss_ax):
        axis_handles, axis_labels = axis.get_legend_handles_labels()
        handles += axis_handles
        labels += axis_labels
    acc_ax.legend(handles, labels, loc='upper left')

    plt.title(title)
    plt.tight_layout()

    # Write to disk before showing the figure
    plt.savefig(save_path)
    plt.show()

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim


class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss improves on the best value seen by more than ``delta``,
    the model's ``state_dict`` is saved to ``path``. After ``patience``
    consecutive calls without such an improvement, ``early_stop`` becomes True.

    Parameters:
    - patience (int): number of non-improving epochs tolerated before stopping.
    - delta (float): minimum decrease of the loss that counts as an improvement.
    - verbose (bool): print a message on every checkpoint / counter increment.
    - path (str): file the best model's ``state_dict`` is written to.
    """

    def __init__(self, patience=5, delta=0, verbose=False, path='checkpoint.pth'):
        self.patience = patience  # Number of epochs to wait for improvement
        self.delta = delta  # Minimum change to qualify as an improvement
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # Built-in infinity: the `np.Inf` alias no longer exists in NumPy 2.0.
        self.val_loss_min = float('inf')
        self.path = path  # Path to save the best model

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state."""
        improved = self.best_score is None or val_loss < self.best_score - self.delta
        if improved:
            self.best_score = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Save model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class SimpleANN(nn.Module):
    """Feed-forward network over the concatenated embeddings of a token window.

    Parameters:
    - vocab_size (int): number of rows in the embedding table.
    - embed_size (int): dimensionality of each token embedding.
    - layer_sizes (sequence of int): sizes of the linear stack. The first entry
      must equal ``context_length * embed_size`` (the flattened embeddings) and
      the last entry is the output size.
    - activation: module class instantiated after every hidden linear layer.
    - last_layer_activation: module class instantiated after the output layer,
      or None to return raw scores. Pass None when training with
      ``nn.CrossEntropyLoss``, which applies log-softmax itself.
    - dropout (float): dropout probability used throughout the stack.

    Raises:
    - ValueError: if ``layer_sizes`` has fewer than two entries.
    """

    def __init__(self, vocab_size, embed_size, layer_sizes,activation=nn.ReLU,last_layer_activation=nn.Softmax,dropout=0):

        super(SimpleANN, self).__init__()

        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes must contain at least an input size and an output size.")

        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.layers = nn.ModuleList()

        # Hidden layers: Linear -> Dropout -> activation
        for in_features, out_features in zip(layer_sizes[:-2], layer_sizes[1:-1]):
            self.layers.append(nn.Linear(in_features, out_features))
            self.layers.append(nn.Dropout(dropout))
            self.layers.append(activation())

        # Output layer, optionally followed by Dropout and a final activation
        self.layers.append(nn.Linear(layer_sizes[-2], layer_sizes[-1]))
        if last_layer_activation is not None:
            self.layers.append(nn.Dropout(dropout))
            # Softmax-style modules need an explicit dim; relying on the implicit
            # choice is deprecated. The input is flattened to 2-D in forward(),
            # so the last dim is the one the implicit choice would have picked.
            needs_dim = isinstance(last_layer_activation, type) and issubclass(
                last_layer_activation, (nn.Softmax, nn.LogSoftmax)
            )
            if needs_dim:
                self.layers.append(last_layer_activation(dim=-1))
            else:
                self.layers.append(last_layer_activation())

    def forward(self, x):
        """Embed a batch of token-id windows and run them through the stack.

        ``x`` holds integer token ids with the batch on the first axis; all
        remaining axes are flattened together with the embedding axis.
        """
        embedded = self.embeddings(x)

        # Concatenate the embeddings of each window into one vector per sample
        x = embedded.flatten(start_dim=1).float()
        for layer in self.layers:
            x = layer(x)
        return x


In [17]:
# Set random seed for reproducibility
def set_seed(seed=2024):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    The CUDA generator is seeded as well when a CUDA device is available.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


set_seed(1998)

In [18]:
# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [19]:
dataset = 'lmdemo'
zip_file = f"{dataset}.zip"

# Extract with the standard library so the cell can be re-run: once the
# archive has been unpacked and deleted, the extracted `dataset` directory
# (read by the next cell) is reused instead of failing on the missing zip.
if os.path.exists(zip_file):
    with zipfile.ZipFile(zip_file) as archive:
        archive.extractall()
    os.remove(zip_file)
elif not os.path.isdir(dataset):
    raise FileNotFoundError(
        f"Neither '{zip_file}' nor an extracted '{dataset}' directory was found in {os.getcwd()}"
    )

In [20]:
# Read both splits as single strings; the context managers close the file
# handles instead of leaving them open for the lifetime of the kernel.
with open(f'{dataset}/train.txt', 'r', encoding='utf-8') as train_file:
    training_set = train_file.read()
with open(f'{dataset}/val.txt', 'r', encoding='utf-8') as val_file:
    val_set = val_file.read()

In [21]:
# Self-assignments: both splits are left unchanged by this cell.
# NOTE(review): presumably a leftover hook for truncating the text while
# experimenting — confirm, and drop the cell if it is no longer needed.
training_set = training_set
val_set = val_set

In [22]:
# Tokenize data
# `nlp` is the shared spaCy pipeline; VocabularyBuilder.build_vocabulary and
# TrainingDataPreparer.encode_text below both call it as a module-level global.
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's maximum accepted text length.
# NOTE(review): 64821808 is presumably sized to the corpus length in characters — confirm.
nlp.max_length = 64821808

In [42]:
import spacy
from collections import Counter
class VocabularyBuilder:
    """Counts tokens and maps the most frequent ones to integer ids.

    Ids 0..2 are reserved for the special tokens BEGINNING, END and UNKNOWN;
    the remaining ids are assigned by descending frequency. Words are stored
    lower-cased, so they never collide with the upper-case special tokens.

    Parameters:
    - max_voc_size (int): maximum vocabulary size, special tokens included.
    """

    def __init__(self, max_voc_size):
        self.max_voc_size = max_voc_size
        self.str_to_int = {}
        self.int_to_str = {}
        self.special_tokens = ["BEGINNING", "END", "UNKNOWN"]
        self.token_counter = Counter()

    def build_vocabulary(self, text):
        """Add the tokens of ``text`` to the running frequency counts.

        Parameters:
        - text (str | list): a string, tokenized with the module-level ``nlp``
          pipeline, or a list of already-tokenized token objects exposing
          ``text``, ``is_space`` and ``is_punct``.

        Whitespace and punctuation tokens are not counted.
        """
        # Iterate over tokens, not sentences: sentence spans do not carry the
        # per-token `is_space` / `is_punct` flags used below.
        tokens = text if isinstance(text, list) else nlp(text)

        for token in tokens:
            if token.is_space or token.is_punct:
                continue
            self.token_counter[token.text.lower()] += 1

    def _index_tokens(self, counter):
        """Rebuild both mappings from scratch from ``counter``."""
        # Clear first so a second call cannot leave stale entries behind that
        # would share ids with the new tokens.
        self.str_to_int.clear()
        self.int_to_str.clear()

        # Special tokens always take the first ids
        for idx, token in enumerate(self.special_tokens):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        # Fill the remaining slots with the most frequent tokens
        max_words = max(0, self.max_voc_size - len(self.special_tokens))
        first_word_id = len(self.special_tokens)
        for idx, (token, _) in enumerate(counter.most_common(max_words), start=first_word_id):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

    def create_vocabulary(self):
        """Build the id mappings from the counts gathered by ``build_vocabulary``."""
        self._index_tokens(self.token_counter)

    def create_premade_vocabulary(self, c):
        """Build the id mappings from an externally supplied ``Counter`` ``c``."""
        self._index_tokens(c)

    def get_token_id(self, token):
        """Return the id of ``token`` (lower-cased), or the UNKNOWN id if absent."""
        return self.str_to_int.get(token.lower(), self.str_to_int["UNKNOWN"])

    def get_token_str(self, token_id):
        """Return the token stored under ``token_id``, or "UNKNOWN" if absent."""
        return self.int_to_str.get(token_id, "UNKNOWN")

    def sanity_check(self):
        """Assert basic invariants of the built vocabulary; print on success.

        Raises:
        - AssertionError: if the size limit is exceeded, a special token is
          missing, an expected common word is absent, an expected rare word is
          present, or the round trip for "The" fails.
        """
        # Check vocabulary size
        assert len(self.str_to_int) <= self.max_voc_size, "Vocabulary size exceeds max_voc_size."

        # Check special tokens exist and are unique
        for token in self.special_tokens:
            assert token in self.str_to_int, f"Missing special token: {token}"

        # Check if highly frequent words are included and rare ones are not
        common_words = ["the", "and"]
        rare_words = ["cuboidal", "epiglottis"]

        for word in common_words:
            assert word in self.str_to_int, f"Common word '{word}' not in vocabulary."

        for word in rare_words:
            assert word not in self.str_to_int, f"Rare word '{word}' should not be in vocabulary."

        # Check that mapping back and forth works for a test word
        test_word = "The"
        token_id = self.get_token_id(test_word)
        assert self.get_token_str(token_id) == test_word.lower(), "Round-trip token mapping failed."

        print("Sanity check passed!")

vocab_builder = VocabularyBuilder(max_voc_size=100000)


In [None]:
# Count tokens one paragraph (line) at a time. `training_set` is a single
# string, so iterating it directly would walk over individual characters.
for paragraph in training_set.splitlines():
    if paragraph.strip():
        # Hand over the spaCy tokens themselves so the punctuation and
        # whitespace filter in build_vocabulary is applied per token.
        vocab_builder.build_vocabulary(list(nlp(paragraph)))
vocab_builder.create_vocabulary()

# Show the size and a small sample instead of dumping the whole vocabulary
print("Vocabulary size:", len(vocab_builder.str_to_int))
print("str_to_int (first 10):", dict(list(vocab_builder.str_to_int.items())[:10]))
print("int_to_str (first 10):", dict(list(vocab_builder.int_to_str.items())[:10]))

# Convert a token to integer ID and back to string
token_id = vocab_builder.get_token_id("example")
print("Token ID for 'example':", token_id)
print("Original token from ID:", vocab_builder.get_token_str(token_id))

In [None]:
# Save vocab so we don't have to rerun it
counter = vocab_builder.token_counter
# The corpus contains non-ASCII tokens, so fix the encoding explicitly rather
# than depending on the platform default.
with open("full_vocab", 'w', encoding='utf-8') as f:
    for k, v in counter.most_common():
        f.write("{} {}\n".format(k, v))

In [43]:
premade_counter = Counter()

# Prefer the original absolute location, but fall back to the relative
# "full_vocab" path that the save cell writes to, so that the file produced
# by this notebook is found regardless of the working directory.
vocab_path = next(
    (candidate for candidate in ("/content/full_vocab", "full_vocab") if os.path.exists(candidate)),
    None,
)
if vocab_path is None:
    raise FileNotFoundError("No saved vocabulary found at '/content/full_vocab' or 'full_vocab'")

# Read the file and populate the Counter; each line is "<word> <frequency>"
with open(vocab_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the line into word and frequency; malformed lines are skipped
        parts = line.strip().split()
        if len(parts) == 2:
            word, freq = parts[0], int(parts[1])
            premade_counter[word] = freq
vocab_builder.create_premade_vocabulary(premade_counter)


In [44]:
# Perform sanity check: asserts the vocabulary size limit, the presence of the
# special tokens, a few expected/unexpected words and a round-trip lookup,
# then prints a confirmation message.
vocab_builder.sanity_check()

Sanity check passed!


In [45]:
class TrainingDataPreparer:
    """Turns raw text into (context, target) pairs for next-token prediction.

    Parameters:
    - vocab_builder: object exposing ``get_token_id(str)`` and a ``str_to_int``
      mapping that holds the special tokens.
    - context_window_size (int): number of context tokens (N) per example.
    - tokenizer (callable, optional): maps a string to an iterable of tokens
      that expose ``.text``. Defaults to the module-level ``nlp`` pipeline.
    """

    def __init__(self, vocab_builder, context_window_size, tokenizer=None):
        self.vocab_builder = vocab_builder
        self.N = context_window_size
        self.tokenizer = tokenizer

    def _tokenize(self, text):
        """Tokenize ``text`` with the injected tokenizer, else the global ``nlp``."""
        tokenizer = self.tokenizer if self.tokenizer is not None else nlp
        return tokenizer(text)

    def _special_id(self, name, fallback):
        """Look up a special token's id, falling back to its conventional value."""
        return self.vocab_builder.str_to_int.get(name, fallback)

    def encode_text(self, text):
        """Tokenizes and encodes a single string with special symbols.

        Parameters:
        - text (str): The input string to encode.

        Returns:
        - List[int]: N BEGINNING ids, the token ids of ``text`` (UNKNOWN for
          out-of-vocabulary tokens), then one END id.
        """
        # Ask the vocabulary for the special ids instead of assuming 0 and 1
        begin_id = self._special_id("BEGINNING", 0)
        end_id = self._special_id("END", 1)

        # NOTE: every token is kept here, punctuation and whitespace included;
        # whatever the vocabulary does not contain is encoded as UNKNOWN.
        token_ids = [
            self.vocab_builder.get_token_id(token.text.lower())
            for token in self._tokenize(text)
        ]

        return [begin_id] * self.N + token_ids + [end_id]

    def create_training_sequences(self, text):
        """
        Creates training sequences from a single string by generating sequences of length N+1.

        Parameters:
        - text (str): The input string to create sequences from.

        Returns:
        - List[Tuple[List[int], int]]: A list of (context, target) pairs.
          Blank or whitespace-only text yields an empty list.
        """
        # A blank line would otherwise contribute one spurious
        # (BEGINNING * N) -> END example.
        if not text.strip():
            return []

        encoded_text = self.encode_text(text)

        # Slide a window of N context tokens, each followed by its target token
        return [
            (encoded_text[start : start + self.N], encoded_text[start + self.N])
            for start in range(len(encoded_text) - self.N)
        ]


In [46]:
context_window_size = 3
data_preparer = TrainingDataPreparer(vocab_builder, context_window_size)

# Build (context, target) pairs from the first 100 characters of the training text
training_sequences = data_preparer.create_training_sequences(training_set[:100])

# Preview the first ten pairs as readable tokens
print("Training sequences (context, target):")
for context, target in training_sequences[:10]:
    context_words = list(map(vocab_builder.get_token_str, context))
    print(context_words, "->", vocab_builder.get_token_str(target))

Training sequences (context, target):
['BEGINNING', 'BEGINNING', 'BEGINNING'] -> anatomy
['BEGINNING', 'BEGINNING', 'anatomy'] -> UNKNOWN
['BEGINNING', 'anatomy', 'UNKNOWN'] -> anatomy
['anatomy', 'UNKNOWN', 'anatomy'] -> UNKNOWN
['UNKNOWN', 'anatomy', 'UNKNOWN'] -> greek
['anatomy', 'UNKNOWN', 'greek'] -> anatomē
['UNKNOWN', 'greek', 'anatomē'] -> UNKNOWN
['greek', 'anatomē', 'UNKNOWN'] -> UNKNOWN
['anatomē', 'UNKNOWN', 'UNKNOWN'] -> dissection
['UNKNOWN', 'UNKNOWN', 'dissection'] -> UNKNOWN


In [47]:
## Creating a tensor dataset ##
from torch.utils.data import DataLoader, TensorDataset
def TorchDataLoader(training_sequences, batch_size, shuffle=True):
    """Wrap (context, target) pairs in a batched ``DataLoader``.

    Parameters:
    - training_sequences: list of (context, target) pairs, where each context
      is a list of token ids of equal length and each target is one token id.
    - batch_size (int): number of pairs per batch.
    - shuffle (bool): whether batches are drawn in random order.

    Returns:
    - DataLoader yielding ``(context_batch, target_batch)`` long tensors of
      shapes ``(batch, context_length)`` and ``(batch,)``.

    Raises:
    - ValueError: if ``training_sequences`` is empty.
    """
    if len(training_sequences) == 0:
        raise ValueError("training_sequences is empty; nothing to batch.")

    context_words = [context for context, _ in training_sequences]
    target_words = [target for _, target in training_sequences]

    # Convert lists to tensors
    context_tensor = torch.tensor(context_words, dtype=torch.long)  # Shape: (num_samples, context_length)
    target_tensor = torch.tensor(target_words, dtype=torch.long)    # Shape: (num_samples,)

    dataset = TensorDataset(context_tensor, target_tensor)

    # Use the caller's batch_size as given
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [48]:
preparer = TrainingDataPreparer(vocab_builder, 3)

# One list of (context, target) pairs per line of the training text
split_training_set = training_set.splitlines()
training_sequences = [
    preparer.create_training_sequences(paragraph)
    for paragraph in split_training_set
]

# Concatenate the per-line lists into one flat list of pairs
flattened_training_sequences = []
for paragraph_pairs in training_sequences:
    flattened_training_sequences.extend(paragraph_pairs)

In [53]:
# Persist the flattened training pairs, one per line, to skip re-tokenizing later
with open("/content/sample_data/training_sequences", 'w') as f:
    f.writelines(str(pair) + "\n" for pair in flattened_training_sequences)

In [55]:
preparer = TrainingDataPreparer(vocab_builder, 3)

# One list of (context, target) pairs per line of the validation text
split_val_set = val_set.splitlines()
val_sequences = [
    preparer.create_training_sequences(paragraph)
    for paragraph in split_val_set
]

# Concatenate the per-line lists into one flat list of pairs
flattened_val_sequences = []
for paragraph_pairs in val_sequences:
    flattened_val_sequences.extend(paragraph_pairs)

In [56]:
# Save val sequences so we don't have to rerun
with open("/content/sample_data/val_sequences", 'w') as f:
    # Write the validation pairs to this file, not the training pairs
    for x in flattened_val_sequences:
        f.write("{}\n".format(str(x)))

In [57]:
# Batch the flattened training pairs; 64 is the batch size requested from TorchDataLoader.
trainloader = TorchDataLoader(flattened_training_sequences, 64)

In [58]:
# Batch the flattened validation pairs; 64 is the batch size requested from TorchDataLoader.
valloader = TorchDataLoader(flattened_val_sequences, 64)

In [None]:
# Model dimensions. The embedding table needs one row per vocabulary entry,
# the first linear layer consumes the flattened context embeddings, and the
# last one emits one score per vocabulary entry.
vocab_size = len(vocab_builder.str_to_int)
embed_size = 24
hidden_size = 128  # width of the single hidden layer; tune as needed
layer_sizes = [preparer.N * embed_size, hidden_size, vocab_size]

# nn.CrossEntropyLoss applies log-softmax itself, so the model must return
# raw logits: no final activation.
model = SimpleANN(
    layer_sizes=layer_sizes,
    vocab_size=vocab_size,
    embed_size=embed_size,
    last_layer_activation=None,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

patience = 5
early_stopping = EarlyStopping(patience=patience, verbose=True)

number_of_epochs = 30

for epoch in range(number_of_epochs):
    # Re-enable training mode: the validation pass below switches to eval mode
    model.train()
    running_loss = 0.0
    for batch_context, batch_target in trainloader:
        #FORWARD PASS:
        X, Y = batch_context.to(device), batch_target.to(device)
        outputs = model(X)  # Model output for X
        loss = criterion(outputs, Y) # Compute the loss between model output and Y

        #BACKWARD PASS (updating the model parameters):
        optimizer.zero_grad()  # Clear gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model parameters

        running_loss += loss.item()

    # Report the mean over the epoch rather than the last batch only
    avg_train_loss = running_loss / len(trainloader)
    print(f"Epoch [{epoch+1}/{number_of_epochs}], Loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # No gradient computation for validation
        for inputs, targets in valloader:
            # Validation batches must live on the same device as the model
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()

    avg_val_loss = val_loss / len(valloader)  # Average validation loss
    print(f"Epoch {epoch+1}/{number_of_epochs} - Validation Loss: {avg_val_loss:.6f}")

    # Call early stopping after each epoch
    early_stopping(avg_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Restore the best checkpoint from the path EarlyStopping actually wrote to
model.load_state_dict(torch.load(early_stopping.path, map_location=device))

# Step 4

In [None]:
test_sentences = ["This is very",
                  "A tall building",
                  "The next sentence",
                  "Not a big"]

# Encode every word of each sentence. Each sentence has three words, which
# matches the context window the sequences above were built with.
encoded_sentences = [
    [vocab_builder.get_token_id(word) for word in sentence.split(" ")]
    for sentence in test_sentences
]

# Run inference on whichever device the model currently lives on
model_device = next(model.parameters()).device
model.eval()
with torch.no_grad():
    batch = torch.tensor(encoded_sentences, dtype=torch.long).to(model_device)
    output = model(batch).cpu().numpy()

# Predict
predictions = np.argmax(output, axis=1)

for prediction in predictions:
  print(vocab_builder.get_token_str(int(prediction)))

In [None]:
# Evaluate on the per-paragraph validation pairs already batched in
# `valloader`, so the text is preprocessed exactly like the training data
# instead of being re-tokenized as one single document.
model_device = next(model.parameters()).device
sum_criterion = nn.CrossEntropyLoss(reduction='sum')

total_loss = 0.0
total_targets = 0
model.eval()
with torch.no_grad():  # evaluation only: no gradients needed
    for batch_context, batch_target in valloader:
        X, Y = batch_context.to(model_device), batch_target.to(model_device)
        # Sum per-token losses so a smaller final batch is weighted correctly
        total_loss += sum_criterion(model(X), Y).item()
        total_targets += Y.numel()

# Compute perplexity: exp of the mean per-token cross-entropy
perplexity = np.exp(total_loss / total_targets)
print(perplexity)

In [None]:
def nearest_neighbors(emb, voc, inv_voc, word, n_neighbors=5):
    """Return the words whose embeddings are most cosine-similar to ``word``.

    Parameters:
    - emb: embedding layer; its ``weight`` is indexed by token id.
    - voc: mapping from word to token id.
    - inv_voc: mapping from token id to word.
    - word (str): the query word; must be a key of ``voc``.
    - n_neighbors (int): number of neighbours requested.

    Returns:
    - List[Tuple[str, float]]: (word, cosine similarity) pairs, most similar
      first. The query word is never included; fewer than ``n_neighbors``
      pairs are returned when the embedding table is too small.

    Raises:
    - KeyError: if ``word`` is not in ``voc``.
    """
    query_ix = voc[word]

    with torch.no_grad():
        weights = emb.weight

        # Cosine similarity between the query row and every row of the table
        sim_func = nn.CosineSimilarity(dim=1)
        cosine_scores = sim_func(weights[query_ix].unsqueeze(0), weights)

        # Exclude the query by index: with tied scores it is not guaranteed
        # to be ranked first, so dropping the first result is not reliable.
        cosine_scores[query_ix] = float("-inf")

        k = min(n_neighbors, weights.shape[0] - 1)
        if k <= 0:
            return []
        near_nbr = cosine_scores.topk(k)

    # Finally, map word indices back to strings, and put the result in a list.
    return [(inv_voc[ix.item()], cos.item()) for ix, cos in zip(near_nbr.indices, near_nbr.values)]

# Query the trained embedding table: the function needs the embedding layer,
# both vocabulary mappings and the query word.
nearest_neighbors(model.embeddings, vocab_builder.str_to_int, vocab_builder.int_to_str, "2005")

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
def plot_embeddings_pca(emb, inv_voc, words):
    """Scatter-plot a 2-D projection of the embeddings of ``words``.

    Parameters:
    - emb: embedding layer; its ``weight`` is indexed by token id.
    - inv_voc: mapping looked up with each word to obtain its token id
      (despite the name, it is indexed by word strings here).
    - words: words to plot; each becomes one labelled point.
    """
    rows = []
    for w in words:
        rows.append(emb.weight[inv_voc[w]].cpu().detach().numpy())

    # Centre the vectors, then keep the two leading SVD components
    centred = np.vstack(rows)
    centred -= centred.mean(axis=0)
    projected = TruncatedSVD(n_components=2).fit_transform(centred)

    plt.figure(figsize=(5,5))
    plt.scatter(projected[:,0], projected[:,1], edgecolors='k', c='r')
    for label, point in zip(words, projected):
        px, py = point
        plt.text(px+0.02, py, label)
    plt.axis('off')

# Pass the embedding layer (not the whole model) and the word -> id mapping.
# Words missing from the vocabulary are skipped so the lookup cannot fail.
pca_words = ['sweden', 'denmark', 'europe', 'africa', 'london', 'stockholm', 'large', 'small', 'great', 'black', '3', '7', '10', 'seven', 'three', 'ten', '1984', '2005', '2010']
plot_embeddings_pca(
    model.embeddings,
    vocab_builder.str_to_int,
    [w for w in pca_words if w in vocab_builder.str_to_int],
)
