In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!pip install --upgrade nltk

In [3]:
# !pip install --upgrade nltk
import nltk
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from typing import Iterator
# Download NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:


filepath="/content/drive/MyDrive/model/paul_graham_essay.txt"

        # Split the content by two or more newline

# Defining the function to replace tricky characters
def replace_characters(text: str) -> str:
    replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}
    for symbol, replacement in replacement_rules.items():
        text = text.replace(symbol, replacement)
    return text

# Defining the function to tokenize and preprocess sentences
def generate_tokenized_sentences(paragraph: str):
    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')
    words=[]
    for sentence in sent_tokenize(paragraph):
        # Replace tricky characters
        sentence = replace_characters(sentence)
        sentence = sentence.lower()
        tokenized_sentence = word_tokenizer.tokenize(sentence)

        # Lowercase the sentence

        words.extend(tokenized_sentence)

    return words
        # Append [END] and [START] to the tokenized sentenc
# Initializing an empty list to store tokenized sentences
words = []
with open(filepath, 'r') as file:
        content = file.read()
        words=generate_tokenized_sentences(content)

# Processing each comment in the DataFrame
# Creating a text file to store the tokenized sentences
with open('/content/drive/MyDrive/model/processed_data.txt', 'w', encoding='utf-8') as file:
        file.write(",".join(words))

In [6]:

stoi={}
itos={}
words=[]
with open('/content/drive/MyDrive/model/processed_data.txt', 'r') as file:
        content = file.read()
        words=content.split(",")
        chars = []
        for word in words:
          for char in word:
            if char.isalpha() and char not in chars:
              chars.append(char)
        # Correctly build stoi and itos
        chars = sorted(set(char for word in words for char in word if char.isalpha()))
        stoi = {s: i for i, s in enumerate(chars)}
        itos = {i: s for s, i in stoi.items()}

        # Ensure vocab_size is correct
        vocab_size = len(stoi)

        if 'é' in stoi:
           del stoi['é']


        itos = {i:s for s,i in stoi.items()}



print(stoi)



{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25}


In [7]:
class NextChar(nn.Module):
  def __init__(self, block_size, vocab_size, emb_dim, hidden_size):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim)
    self.lin1 = nn.Linear(block_size * emb_dim, hidden_size)
    self.lin2 = nn.Linear(hidden_size, vocab_size)
  def forward(self, x):
    x = self.emb(x)
    x = x.view(x.shape[0], -1)
    x = torch.sin(self.lin1(x))
    x = self.lin2(x)
    return x

In [None]:
len(words)

In [None]:
# import time
# def train_model(context_len,embed_size,vocab_size):
#   block_size = context_len # context length: how many characters do we take to predict the next one?
#   X, Y = [], []
#   for w in words[:]:

#   #print(w)
#     context = [0] * block_size
#     # for ch in w + '.':
#     for ch in w:
#       if ch not in stoi:
#         continue

#       ix = stoi[ch]
#       X.append(context)
#       Y.append(ix)
#       #print(''.join(itos[i] for i in context), '--->', itos[ix])
#       context = context[1:] + [ix] # crop and append

#   X = torch.tensor(X).to(device)
#   Y = torch.tensor(Y).to(device)
#   # Train the model
#   model = NextChar(block_size, len(stoi), embed_size, 10).to(device)
#   model = torch.compile(model)

#   loss_fn = nn.CrossEntropyLoss()
#   opt = torch.optim.AdamW(model.parameters(), lr=0.01)
#   # Mini-batch training
#   batch_size = 4096
#   print_every = 100
#   elapsed_time = []
#   for epoch in range(1000):
#       start_time = time.time()
#       for i in range(0, X.shape[0], batch_size):
#           x = X[i:i+batch_size]
#           y = Y[i:i+batch_size]
#           y_pred = model(x)
#           loss = loss_fn(y_pred, y)
#           loss.backward()
#           opt.step()
#           opt.zero_grad()
#       end_time = time.time()
#       elapsed_time.append(end_time - start_time)
#       if epoch % print_every == 0:
#           print(epoch, loss.item())

#   return model


In [8]:
def train_model(context_len, embed_size, vocab_size):
    block_size = context_len
    X, Y = [], []

    for w in words:
        context = [0] * block_size
        for ch in w:
            if ch not in stoi:
                continue

            ix = stoi[ch]
            if ix >= vocab_size:
                raise ValueError(f"Index {ix} out of bounds for vocab_size {vocab_size}")

            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)

    model = NextChar(block_size, vocab_size, embed_size, 10).to(device)
    model = torch.compile(model)

    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.01)

    batch_size = 4096
    print_every = 100
    elapsed_time = []

    for epoch in range(1000):
        start_time = time.time()
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i+batch_size]
            y = Y[i:i+batch_size]
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            loss.backward()
            opt.step()
            opt.zero_grad()
        end_time = time.time()
        elapsed_time.append(end_time - start_time)
        if epoch % print_every == 0:
            print(epoch, loss.item())

    return model


In [9]:
def debug_stoi():
    print(f"Number of unique characters: {len(stoi)}")
    for ch, idx in stoi.items():
        if idx >= len(stoi):
            print(f"Character {ch} has an invalid index {idx}")

debug_stoi()

Number of unique characters: 26


In [None]:
# embedding_size=[2,5,10]
# context_length=[3,6,9]
# ectuple=[ (e,c)  for e in embedding_size for c in context_length]
# i=0
# for t in ectuple:
#     model=train_model(t[1],t[0],len(stoi)-1)
#     torch.save(model.state_dict(),f"./model_{i}.pt")
#     i+=1


In [1]:
import time
import torch
import torch.nn as nn

# Ensure this is the GPU device or CPU device depending on availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_size = [2, 5, 10]
context_length = [3, 6, 9]
ectuple = [(e, c) for e in embedding_size for c in context_length]

for i, (embed_size, context_len) in enumerate(ectuple):
    try:
        print(f"Training model {i+1}/{len(ectuple)} with embed_size={embed_size} and context_len={context_len}")

        model = train_model(context_len, embed_size, len(stoi))

        # Ensure that the model was returned correctly and has state_dict
        if model is None:
            raise ValueError("Model is None. Training failed.")

        # Save model
        torch.save(model.state_dict(), f"./model_{i}.pt")
        print(f"Model {i} saved successfully.")

    except Exception as e:
        print(f"An error occurred with model {i}: {e}")
        # Optionally add debugging details here


Training model 1/9 with embed_size=2 and context_len=3
An error occurred with model 0: name 'train_model' is not defined
Training model 2/9 with embed_size=2 and context_len=6
An error occurred with model 1: name 'train_model' is not defined
Training model 3/9 with embed_size=2 and context_len=9
An error occurred with model 2: name 'train_model' is not defined
Training model 4/9 with embed_size=5 and context_len=3
An error occurred with model 3: name 'train_model' is not defined
Training model 5/9 with embed_size=5 and context_len=6
An error occurred with model 4: name 'train_model' is not defined
Training model 6/9 with embed_size=5 and context_len=9
An error occurred with model 5: name 'train_model' is not defined
Training model 7/9 with embed_size=10 and context_len=3
An error occurred with model 6: name 'train_model' is not defined
Training model 8/9 with embed_size=10 and context_len=6
An error occurred with model 7: name 'train_model' is not defined
Training model 9/9 with embed_

In [None]:
model=train_model(9,10,len(stoi))
torch.save(model.state_dict(),"./model_8.pt")

In [None]:
model=train_model(5,2,len(stoi))
torch.save(model.state_dict(),"./model.pt")
model1=NextChar(5,len(stoi),2,10)
model1.load_state_dict(torch.load("./model.pt"),strict=False)
model1.eval()


In [None]:
from sklearn.manifold import TSNE




def plot_emb(emb, itos,title):
    # Get the weights of the embedding layer
    weights = emb.weight.detach().cpu().numpy()

    # Use PCA to reduce the dimensionality to 2
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(weights)

    fig, ax = plt.subplots()
    ax.set_title(title)
    for i in range(len(itos)):
        x, y = X_tsne[i]
        ax.scatter(x, y, color='k')
        ax.text(x + 0.05, y + 0.05, itos[i])

    return ax



In [None]:
emb={"2":0,"5":1,"10":2}
context={"3":0,"6":1,"9":2}

In [None]:
for t in ectuple:
    model_number=emb[str(t[0])]*3+context[str(t[1])]
    model1 = NextChar(t[1],len(stoi),t[0], 10)
    model1.load_state_dict(torch.load(f"./model_{model_number}.pt",map_location=torch.device('cpu')), strict=False)
    plot_emb(model1.emb,itos,f"Embedding Size {t[0]} and Context length {t[1]}")
