In [17]:
import urllib
import urllib.request

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [5]:
# load dataset: download the text file at `url` and keep one list entry per line
url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
# The response object holds an open HTTP connection; the `with` block closes it
# as soon as the body has been read instead of leaving that to garbage collection.
with urllib.request.urlopen(url) as response:
    data = response.read().decode("utf-8").splitlines()

In [8]:
# vocabulary: '.' is placed first (index 0), followed by every distinct
# character that occurs in `data`, in sorted order
chars = ['.'] + sorted(set(''.join(data)))
stoi = dict(zip(chars, range(len(chars))))  # encoder: character -> index
itos = {ix: ch for ch, ix in stoi.items()}  # decoder: index -> character

In [9]:
# build dataset
def build_dataset(
    names: list[str],
    context_len: int = 3,
    char_to_ix: "dict[str, int] | None" = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn names into (context, next-character) training pairs.

    Each name is left-padded with ``context_len`` '.' characters and terminated
    with a single '.', then a window of ``context_len`` characters slides over
    it; the character right after the window is the target.

    Args:
        names: Strings to encode.
        context_len: Number of preceding characters used as context.
        char_to_ix: Character -> index mapping. Defaults to the notebook-level
            ``stoi`` mapping when omitted.

    Returns:
        ``(X, Y)`` where ``X`` is a long tensor of shape
        ``(num_examples, context_len)`` holding context indices and ``Y`` is a
        long tensor of shape ``(num_examples,)`` holding target indices.

    Raises:
        KeyError: If a character (including '.') is missing from the mapping.
    """
    if char_to_ix is None:
        char_to_ix = stoi

    contexts, targets = [], []
    for name in names:
        padded = '.' * context_len + name + '.'  # ...emma.
        for pos in range(context_len, len(padded)):
            window = padded[pos - context_len:pos]
            contexts.append([char_to_ix[c] for c in window])
            targets.append(char_to_ix[padded[pos]])

    # The explicit reshape keeps X two-dimensional even when there are no
    # examples; torch.tensor([]) alone would come out with shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(targets), context_len)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y

# Encode every name with a 3-character context, then show the tensor shapes
# and the example at index 1 as a quick sanity check.
X, Y = build_dataset(data, context_len=3)
print(f"X.shape: {X.shape}\nY.shape: {Y.shape}")
print(f"\nX1 is {X[1]} and Y1 is {Y[1]}")

X.shape: torch.Size([228146, 3])
Y.shape: torch.Size([228146])

X1 is tensor([0, 0, 5]) and Y1 is 13


In [10]:
# Train-test split: 30% of the examples are held out; random_state fixes the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Move all four splits to the selected device
X_train, X_test, Y_train, Y_test = (
    split.to(device) for split in (X_train, X_test, Y_train, Y_test)
)

In [None]:
class BengioLanguageModel(nn.Module):
    """Fixed-context next-token model: embeddings -> tanh hidden layer -> logits.

    The indices of the ``context_size`` previous tokens are looked up in a
    shared embedding table, the embeddings are concatenated, passed through one
    tanh hidden layer and projected to ``vocab_size`` unnormalised scores.

    Args:
        vocab_size: Number of distinct tokens (rows of the embedding table and
            width of the output).
        emb_dim: Size of each token embedding.
        context_size: Number of previous tokens fed to the model.
        hidden_size: Width of the tanh hidden layer.
    """

    def __init__(self, vocab_size, emb_dim, context_size, hidden_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.context_size = context_size
        self.hidden_size = hidden_size

        # Embedding table, initialised with small (x0.01) Gaussian noise.
        self.E = nn.Parameter(torch.randn(vocab_size, emb_dim) * 0.01)
        # Hidden layer: weights stored as (out_features, in_features), zero bias.
        self.Wx = nn.Parameter(torch.randn(hidden_size, context_size * emb_dim) * 0.5)
        self.bh = nn.Parameter(torch.zeros(hidden_size))
        # Output layer: same (out_features, in_features) layout, zero bias.
        self.Wy = nn.Parameter(torch.randn(vocab_size, hidden_size) * 0.5)
        self.by = nn.Parameter(torch.zeros(vocab_size))

    def forward(self, X):
        """Compute next-token logits for a batch of contexts.

        Args:
            X: Long tensor of token indices, shape ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        emb = self.E[X]  # (batch, context_size, emb_dim)
        # flatten() rather than view(batch, -1): the latter cannot infer the
        # second dimension when the batch is empty.
        emb_cat = emb.flatten(start_dim=1)  # concatenate the context embeddings
        h = torch.tanh(torch.matmul(emb_cat, self.Wx.t()) + self.bh)
        logits = torch.matmul(h, self.Wy.t()) + self.by
        return logits

    def generate(self, start_context, itos, max_length=20):
        """Sample a string one token at a time, starting from ``start_context``.

        Sampling stops when index 0 (the end token) is drawn or after
        ``max_length`` tokens. The module is switched to eval mode and is left
        in that mode afterwards.

        Args:
            start_context: Sequence of ``context_size`` token indices. It is
                copied, never modified.
            itos: Mapping from token index to its string.
            max_length: Maximum number of tokens to sample.

        Returns:
            The sampled tokens decoded through ``itos`` and joined into one
            string (the end token is not included).
        """
        self.eval()
        context = list(start_context)  # accepts lists and tuples alike
        # Build inputs on the device that holds the parameters, so sampling
        # works wherever the model lives, independent of any global setting.
        param_device = self.E.device
        out = []
        with torch.no_grad():
            for _ in range(max_length):
                x = torch.tensor([context], dtype=torch.long, device=param_device)
                logits = self(x)
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                if ix == 0:  # end token
                    break
                out.append(ix)
                # slide the window: drop the oldest token, append the new one
                context = context[1:] + [ix]
        return ''.join(itos[i] for i in out)


In [11]:
# Initialize the model on the selected device, plus the loss and the optimizer
model = BengioLanguageModel(
    vocab_size=len(stoi),
    emb_dim=20,
    context_size=3,
    hidden_size=100,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class targets
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [12]:
# Training loop: each epoch is a single gradient step on the whole training split
epochs = 200
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    logits = model(X_train)
    loss = criterion(logits, Y_train)
    loss.backward()
    optimizer.step()

    if epoch % 20 != 0:
        continue

    # Every 20th epoch: score the held-out split without tracking gradients
    model.eval()
    with torch.no_grad():
        test_logits = model(X_test)
        test_loss = criterion(test_logits, Y_test)
    print(f"Epoch {epoch}: Train loss={loss.item():.4f} Test loss={test_loss.item():.4f}")




Epoch 0: Train loss=3.3095 Test loss=3.0793
Epoch 20: Train loss=2.6294 Test loss=2.6240
Epoch 40: Train loss=2.5708 Test loss=2.5689
Epoch 60: Train loss=2.5370 Test loss=2.5365
Epoch 80: Train loss=2.5119 Test loss=2.5122
Epoch 100: Train loss=2.4919 Test loss=2.4928
Epoch 120: Train loss=2.4754 Test loss=2.4767
Epoch 140: Train loss=2.4616 Test loss=2.4632
Epoch 160: Train loss=2.4499 Test loss=2.4517
Epoch 180: Train loss=2.4399 Test loss=2.4419


In [16]:
# inference: draw ten samples, each started from a context of three 0 indices
start_context = [0] * 3
for _ in range(10):
    generated_seq = model.generate(start_context, itos, max_length=20)
    print(generated_seq)

kirya
zanlei
aison
vadilins
lon
uos
exsaidanon
hhire
tana
ara


In [18]:
# Persist just the learned parameters (the state_dict), not the whole module object
torch.save(obj=model.state_dict(), f='bengio_model.pth')


In [25]:
# Create model instance with same architecture
model_loaded = BengioLanguageModel(vocab_size=len(stoi), emb_dim=20, context_size=3, hidden_size=100)

# Load the saved parameters into model.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
model_loaded.load_state_dict(
    torch.load('bengio_model.pth', map_location=device, weights_only=True)
)

# Set to evaluation mode (important for inference)
model_loaded.eval()

# Move to device (cuda or cpu)
model_loaded.to(device)

# Sample a continuation of the context "emm"
model_loaded.generate([stoi['e'], stoi['m'], stoi['m']], itos)

'andan'