<a href="https://colab.research.google.com/github/MahdiTheGreat/Intro-to-language-modeling/blob/main/Intro_to_language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git
%cd Intro-to-language-modeling

Cloning into 'Intro-to-language-modeling'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 7 (delta 1), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (7/7), 28.27 MiB | 18.40 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/Intro-to-language-modeling


In [5]:
import sklearn

In [None]:
!pip install ipdb
!pip install -U spacy
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

In [11]:
import spacy
import torch
import matplotlib.pyplot as plt
import ipdb
import numpy as np
import random
import pandas as pd
# %pdb on

In [None]:
# Helper function to plot the training metrics

def plot_training_metrics(train_acc, val_acc, train_loss, title, save_path):
    # Ensure that all input lists have the same length
    assert len(train_acc) == len(val_acc) == len(train_loss), "All input histories must have the same length."

    epochs = range(1, len(train_acc) + 1)

    # Create the metrics DataFrame
    df_metrics = pd.DataFrame({
        'Epoch': epochs,
        'Training Accuracy (%)': train_acc,
        'Validation Accuracy (%)': val_acc,
        'Training Loss': train_loss
    })

    # Initialize the plot
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot Training and Validation Accuracy on ax1
    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy (%)', color=color)
    ax1.plot(df_metrics['Epoch'], df_metrics['Training Accuracy (%)'], label='Train Acc', color='tab:blue')
    ax1.plot(df_metrics['Epoch'], df_metrics['Validation Accuracy (%)'], label='Val Acc', color='tab:cyan')
    ax1.tick_params(axis='y', labelcolor=color)

    # Create a second y-axis for Training Loss
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Loss', color=color)
    ax2.plot(df_metrics['Epoch'], df_metrics['Training Loss'], label='Train Loss', color='tab:red')
    ax2.tick_params(axis='y', labelcolor=color)

    # Combine legends from both axes
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

    # Set plot title and layout
    plt.title(title)
    plt.tight_layout()

    # Save and display the plot
    plt.savefig(save_path)
    plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


class SimpleANN(nn.Module):

    def __init__(self,layer_sizes,activation=nn.ReLU,last_layer_activation=nn.ReLU,dropout=0):

        super(SimpleANN, self).__init__()
        self.layers = nn.ModuleList()

        for i in range(len(layer_sizes)-2):
          self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
          self.layers.append(nn.Dropout(dropout))
          self.layers.append(activation())

        self.layers.append(nn.Linear(layer_sizes[-2], layer_sizes[-1]))
        if last_layer_activation is not None:
         self.layers.append(nn.Dropout(dropout))
         self.layers.append(last_layer_activation())

    def forward(self, x):
        x = x.view(-1, np.prod(x.shape[1:])) # Flatten the input
        for layer in self.layers:
            x = layer(x)
        return x

In [7]:
# Set random seed for reproducibility
def set_seed(seed=2024):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(1998)

In [8]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cpu


In [2]:
dataset='lmdemo'
zip_file = f"{dataset}.zip"
!unzip -q $zip_file
!rm $zip_file

In [3]:
text=open(f'{dataset}/train.txt','r',encoding='utf-8').read()

In [4]:
text[0:500]

'Anatomy\n\nAnatomy (Greek anatomē, “dissection”) is the branch of biology concerned with the study of the structure of organisms and their parts.  Anatomy is a branch of natural science dealing with the structural organization of living things.  It is an old science, having its beginnings in prehistoric times.  Anatomy is inherently tied to embryology, comparative anatomy, evolutionary biology, and phylogeny, as these are the processes by which anatomy is generated over immediate (embryology) and '

In [44]:
# Example usage
example_text = """This is a sample text with several sentences. We want to split it into parts
without cutting off in the middle of sentences. This approach helps keep each part
meaningful and easy to read. It can be useful for processing large texts or preparing
them for models that have a maximum input size."""

In [56]:
import spacy
from collections import Counter

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

class VocabularyBuilder:
    def __init__(self, max_voc_size):
        self.max_voc_size = max_voc_size
        self.str_to_int = {}
        self.int_to_str = {}
        self.special_tokens = ["BEGINNING", "END", "UNKNOWN"]

    def build_vocabulary(self, text):

        tokens = []

        if isinstance(text, list):
          sents=text
        else:
          doc = nlp(text)
          sents=doc.sents

        # Process each sentence in the text
        for sent in sents:
            tokens.append(nlp("BEGINNING")[0])  # Add "BEGINNING" at the start of each sentence
            tokens.extend([token for token in sent])  # Add sentence tokens
            tokens.append(nlp("END")[0])  # Add "END" at the end of each sentence

        token_counter = Counter()
        for token in tokens:
         if not token.is_space and not token.is_punct:
             token_counter[token.text.lower()] += 1

        # Start vocabulary with special tokens
        for idx, token in enumerate(self.special_tokens):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        # Select the most common tokens, considering max_voc_size - len(special_tokens)
        max_words = self.max_voc_size - len(self.special_tokens)
        most_common_tokens = token_counter.most_common(max_words)

        for idx, (token, _) in enumerate(most_common_tokens, start=len(self.special_tokens)):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

    def get_token_id(self, token):
        # Return the integer ID for a given token
        return self.str_to_int.get(token.lower(), self.str_to_int["UNKNOWN"])

    def get_token_str(self, token_id):
        # Return the original token string for a given integer ID
        return self.int_to_str.get(token_id, "UNKNOWN")

    def add_special_tokens_to_text(self, text):
        """
        Tokenizes the text by sentence and adds special 'BEGINNING' and 'END' tokens
        around each sentence.

        Parameters:
        - text (str): The input text.

        Returns:
        - List[str]: A list of tokens with special 'BEGINNING' and 'END' tokens added.
        """


    def sanity_check(self):
        # Check vocabulary size
        assert len(self.str_to_int) <= self.max_voc_size, "Vocabulary size exceeds max_voc_size."

        # Check special tokens exist and are unique
        for token in self.special_tokens:
            assert token in self.str_to_int, f"Missing special token: {token}"

        # Check if highly frequent words are included and rare ones are not
        common_words = ["the", "and"]
        rare_words = ["cuboidal", "epiglottis"]

        for word in common_words:
            assert word in self.str_to_int, f"Common word '{word}' not in vocabulary."

        for word in rare_words:
            assert word not in self.str_to_int, f"Rare word '{word}' should not be in vocabulary."

        # Check that mapping back and forth works for a test word
        test_word = "The"
        token_id = self.get_token_id(test_word)
        assert self.get_token_str(token_id) == test_word.lower(), "Round-trip token mapping failed."

        print("Sanity check passed!")

# Example usage
#example_text = [
#   "This is a simple example sentence.",
#   "Here's another example sentence in a different paragraph."
#   "The quick brown fox jumps over the lazy dog and cat."
#]
#
# Initialize VocabularyBuilder with a max vocabulary size
vocab_builder = VocabularyBuilder(max_voc_size=50)
vocab_builder.build_vocabulary(example_text)

# Example mappings
print("str_to_int:", vocab_builder.str_to_int)
print("int_to_str:", vocab_builder.int_to_str)

# Convert a token to integer ID and back to string
token_id = vocab_builder.get_token_id("example")
print("Token ID for 'example':", token_id)
print("Original token from ID:", vocab_builder.get_token_str(token_id))




str_to_int: {'BEGINNING': 0, 'END': 1, 'UNKNOWN': 2, 'beginning': 3, 'end': 4, 'this': 5, 'a': 6, 'sentences': 7, 'to': 8, 'it': 9, 'for': 10, 'is': 11, 'sample': 12, 'text': 13, 'with': 14, 'several': 15, 'we': 16, 'want': 17, 'split': 18, 'into': 19, 'parts': 20, 'without': 21, 'cutting': 22, 'off': 23, 'in': 24, 'the': 25, 'middle': 26, 'of': 27, 'approach': 28, 'helps': 29, 'keep': 30, 'each': 31, 'part': 32, 'meaningful': 33, 'and': 34, 'easy': 35, 'read': 36, 'can': 37, 'be': 38, 'useful': 39, 'processing': 40, 'large': 41, 'texts': 42, 'or': 43, 'preparing': 44, 'them': 45, 'models': 46, 'that': 47, 'have': 48, 'maximum': 49}
int_to_str: {0: 'BEGINNING', 1: 'END', 2: 'UNKNOWN', 3: 'beginning', 4: 'end', 5: 'this', 6: 'a', 7: 'sentences', 8: 'to', 9: 'it', 10: 'for', 11: 'is', 12: 'sample', 13: 'text', 14: 'with', 15: 'several', 16: 'we', 17: 'want', 18: 'split', 19: 'into', 20: 'parts', 21: 'without', 22: 'cutting', 23: 'off', 24: 'in', 25: 'the', 26: 'middle', 27: 'of', 28: 'ap

In [57]:
# Perform sanity check
vocab_builder.sanity_check()

Sanity check passed!


In [58]:
class TrainingDataPreparer:
    def __init__(self, vocab_builder, context_window_size):
        self.vocab_builder = vocab_builder
        self.N = context_window_size

    def encode_text(self, text):
        """Tokenizes and encodes a single string with special symbols.

        Parameters:
        - text (str): The input string to encode.

        Returns:
        - List[int]: A list of token IDs including BEGINNING and END tokens.
        """
        # Tokenize the text
        doc = nlp(text)
        tokens = ["BEGINNING"] * self.N  # Add N BEGINNING tokens at the start
        tokens.extend([token.text.lower() for token in doc])  # Add the actual tokens
        tokens.append("END")  # Add END token at the end

        # Map tokens to integer IDs, using "UNKNOWN" for out-of-vocabulary words
        token_ids = [self.vocab_builder.get_token_id(token) for token in tokens]
        return token_ids

    def create_training_sequences(self, text):
        """
        Creates training sequences from a single string by generating sequences of length N+1.

        Parameters:
        - text (str): The input string to create sequences from.

        Returns:
        - List[Tuple[List[int], int]]: A list of (context, target) pairs.
        """
        training_sequences = []

        # Encode the text with BEGINNING, END, and UNKNOWN tokens
        encoded_text = self.encode_text(text)

        # Generate sequences of length N+1
        for i in range(len(encoded_text) - self.N):
            context = encoded_text[i : i + self.N]  # N tokens for context
            target = encoded_text[i + self.N]       # Next token as the target
            training_sequences.append((context, target))

        return training_sequences


In [61]:
context_window_size = 10
data_preparer = TrainingDataPreparer(vocab_builder, context_window_size)

# Tokenize text for training sequences
#paragraphs = [
#    ["this", "is", "a", "simple", "example", "sentence"],
#    ["here's", "another", "example", "sentence", "in", "a", "different", "paragraph"],
#    ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
#]

# Create training sequences
training_sequences = data_preparer.create_training_sequences(example_text)

# Display some training sequences
print("Training sequences (context, target):")
for context, target in training_sequences[:5]:  # Show the first few sequences
    print([vocab_builder.get_token_str(id) for id in context], "->", vocab_builder.get_token_str(target))

Training sequences (context, target):
['beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning'] -> this
['beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'this'] -> is
['beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'this', 'is'] -> a
['beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'this', 'is', 'a'] -> sample
['beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'beginning', 'this', 'is', 'a', 'sample'] -> text
