In [222]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import pandas as pd
import random
%matplotlib inline

In [223]:
# Read the city table, discard the 'Unnamed: 0' column when it is present,
# and keep only rows without missing values.
cities = (
    pd.read_csv("cities_data.csv", index_col=False)
    .drop(columns='Unnamed: 0', errors='ignore')
    .dropna()
)

In [224]:
# Parallel lists: city_list[i] is a city name, continent_list[i] its REGION value.
city_list, continent_list = (cities[column].to_list() for column in ('city', 'REGION'))

In [225]:
#split into train and validation

from typing import Literal

def split_data(continent_list, city_list, split : Literal['train', 'val', 'test']):
  """Deterministically shuffle (continent, city) pairs and return one split.

  The pairs are shuffled with a fixed seed (42) and cut 80% / 10% / 10% into
  train / val / test.

  Args:
    continent_list: sequence of continent labels, parallel to ``city_list``.
    city_list: sequence of city names.
    split: 'train' for the first 80%, 'test' for the last 10%; any other
      value (including 'val') returns the middle 10%.

  Returns:
    A ``(continents, cities)`` pair of tuples for the requested split.
    Both are empty tuples when the input is empty.
  """
  import random

  paired_data = list(zip(continent_list, city_list))
  if not paired_data:
    # zip(*[]) cannot be unpacked into two names, so short-circuit here.
    return (), ()

  # Use a private generator: calling random.seed(42) here would reset the
  # module-level RNG on every call and make every later random.choice /
  # random.shuffle elsewhere in the notebook repeat the same sequence.
  random.Random(42).shuffle(paired_data)

  continent_list, city_list = zip(*paired_data)

  n1 = int(0.8*len(city_list))
  n2 = int(0.9*len(city_list))

  if split == 'train':
    window = slice(None, n1)
  elif split == 'test':
    window = slice(n2, None)
  else:
    window = slice(n1, n2)

  return continent_list[window], city_list[window]

In [228]:
# Character vocabulary for city names.
# Every distinct character gets an id starting at 1; id 0 is reserved for '*',
# which is used both as left padding and as the end-of-name marker.
chars = sorted(set(''.join(city_list)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['*'] = 0
itos = {index: symbol for symbol, index in stoi.items()}

def decode_city(word:str):
    """Return the list of integer ids for the characters of ``word``.

    Despite the name, this goes from text to ids (lookup in ``stoi``).
    Raises KeyError for a character that is not in ``stoi``.
    """
    ids = []
    for character in word:
        ids.append(stoi[character])
    return ids

def encode_city(word:list[int]):
    """Return the string spelled by the integer ids in ``word``.

    Despite the name, this goes from ids back to text (lookup in ``itos``).
    Raises KeyError for an id that is not in ``itos``.
    """
    symbols = [itos[token] for token in word]
    return ''.join(symbols)

In [229]:
# Sanity check: turn a city name into its list of character ids.
decode_city('ho chi minh city')
#encode_city([30, 25, 21, 35, 25])

[18, 25, 1, 13, 18, 19, 1, 23, 19, 24, 18, 1, 13, 19, 30, 35]

In [230]:
# Continent vocabulary.
# Continent ids start at len(chars) + 1, i.e. directly after the character ids,
# so characters and continents share a single embedding table without clashing.
# The names are sorted before numbering: iterating a set of strings directly
# gives an order that can change between Python processes (string hash
# randomisation), which would silently renumber the continents on every
# kernel restart.
stoi_1 = {s: i+(len(chars)+1) for i, s in enumerate(sorted(set(continent_list)))}
itos_1 = {i:s for s, i in stoi_1.items()}

def decode_continent(word:str):
    """Return the integer token id stored in ``stoi_1`` for continent name ``word``.

    Raises KeyError when ``word`` is not a known continent.
    """
    return stoi_1[word]

def encode_continent(number):
    """Return the continent name stored in ``itos_1`` for token id ``number``.

    Raises KeyError when ``number`` is not a continent id.
    """
    return itos_1[number]

In [240]:
#Build Dataset

# Number of examples drawn per call to get_batch, which reads this as a global.
batch_size = 100

def build_dataset(continent_list, city_list, string : Literal['train', 'val'], block_size : int = 6):
  """Turn one data split into next-character training examples.

  Each example's input is ``[continent_id, c_1, ..., c_block_size]``: the
  continent token followed by the last ``block_size`` character ids (left
  padded with 0, the id of '*'). The target is the id of the next character,
  with '*' appended to every city as the end marker.

  Args:
    continent_list: continent labels, parallel to ``city_list``.
    city_list: city names.
    string: which split to build; forwarded to ``split_data``.
    block_size: number of character slots in the context. The model input
      width is therefore ``block_size + 1``.

  Returns:
    ``(X, Y)`` integer tensors of shape ``(N, block_size + 1)`` and ``(N, 1)``.

  Raises:
    ValueError: if ``block_size`` is smaller than 1.
  """
  if block_size < 1:
    raise ValueError("block_size must be at least 1")

  X, Y = [], []

  continent_list, city_list = split_data(continent_list, city_list, string)

  for continent, city in zip(continent_list, city_list):
    # Slot 0 is pinned to the continent token for the whole city.
    context = [decode_continent(continent)] + [0] * block_size
    for ch in city + '*':
      ix = decode_city(ch)  # one-element list: [character id]
      X.append(context)
      Y.append(ix)
      # Keep the continent, drop the oldest character, append the new one.
      context = [context[0]] + context[2:] + ix

  # Explicit dtype and reshape keep the shapes and the integer type stable
  # even when the split is empty (torch.tensor([]) would be a float of shape (0,)).
  X = torch.tensor(X, dtype=torch.long).reshape(-1, block_size + 1)
  Y = torch.tensor(Y, dtype=torch.long).reshape(-1, 1)

  return X, Y

  
# split name -> (continent_list, city_list, X, Y) built from those exact list objects.
_DATASET_CACHE = {}


def get_batch(split: Literal['train', 'val']):
  """Return a random batch ``(X, Y)`` of ``batch_size`` examples from ``split``.

  The encoded dataset for each split is built once and reused, because
  rebuilding it (shuffle, per-character encoding, tensor construction) on
  every call makes each training step pay for a full pass over the data.
  The cache is refreshed when the global ``continent_list`` / ``city_list``
  are rebound to new list objects; in-place mutation of those lists or a
  change to the vocabularies is not detected.

  Returns:
    ``X`` of shape ``(batch_size, context_width)`` and ``Y`` flattened to
    ``(batch_size,)``, sampled with replacement.
  """
  entry = _DATASET_CACHE.get(split)
  if entry is None or entry[0] is not continent_list or entry[1] is not city_list:
    X, Y = build_dataset(continent_list, city_list, split)
    entry = (continent_list, city_list, X, Y)
    _DATASET_CACHE[split] = entry
  _, _, X, Y = entry

  indexx = torch.randint(len(X), (batch_size,))
  return X[indexx], Y[indexx].flatten()
  

def get_loss(model, evaluation_batches = 200):
  """Estimate the mean loss of ``model`` on the train and val splits.

  For each split, ``evaluation_batches`` independent random batches of
  ``batch_size`` examples are drawn and their losses averaged.

  Args:
    model: object exposing ``eval()``, ``train()`` and
      ``forward_pass(X, Y) -> (logits, loss)``.
    evaluation_batches: number of random batches averaged per split.

  Returns:
    dict ``{'train': mean_loss, 'val': mean_loss}`` of 0-dim tensors.
  """
  out = {}
  model.eval() # Change to evaluation mode
  try:
    with torch.no_grad(): # no gradients are needed while evaluating
      for split in ['train', 'val']:
        # Encode the split once, then sample many batches from it.
        X_all, Y_all = build_dataset(continent_list, city_list, split)
        losses = torch.zeros(evaluation_batches)
        for batch in range(evaluation_batches):
          # A fresh batch per iteration: drawing it once outside this loop
          # would average the loss of one batch repeated evaluation_batches times.
          picks = torch.randint(len(X_all), (batch_size,))
          _, loss = model.forward_pass(X_all[picks], Y_all[picks].flatten())
          losses[batch] = loss.item() # change from tensor to scalar
        out[split] = losses.mean()
  finally:
    # Always hand the model back in training mode, even if evaluation raised.
    model.train()
  return out


In [241]:
# Build the encoded training split and display the context tensor.
X, Y = build_dataset(continent_list, city_list, 'train')
X

tensor([[171,   0,   0,  ...,   0,   0,   0],
        [171,   0,   0,  ...,   0,   0,  14],
        [171,   0,   0,  ...,   0,  14,  18],
        ...,
        [168,   1,  29,  ...,  24,  14,  19],
        [168,  29,  11,  ...,  14,  19,  24],
        [168,  11,  24,  ...,  19,  24,  25]])

In [244]:
# NOTE(review): first_mlp is created in a later cell of this notebook, so on a
# fresh kernel this cell only works after that cell has been run.
get_loss(first_mlp)

{'train': tensor(5.1535), 'val': tensor(5.1396)}

If I specify 3 epochs:
# Train on full dataset (1 epoch)
# Check/estimate loss based on the evaluation iters specified
# Train again on full dataset (1 epoch)
# Check/estimate loss based on random samples specified by evaluation iters

In [None]:
#training begins #MLP

from torch import nn

# Model and training hyperparameters.
n_embed, n_hidden = 10, 200  # embedding dimension; neurons in the hidden layer
# One shared id space: character tokens followed by continent tokens.
vocab_size = len(itos) + len(set(continent_list))
block_size = 7   # width of one model input (number of token slots)
batch_size = 100


class FirstMLP(nn.Module):
    """MLP that predicts the next city-name token from a fixed-width context.

    The input is a ``(B, block_size)`` integer tensor: B examples, each a row
    of ``block_size`` token ids (continent token in slot 0, then character
    ids). The output is a ``(B, vocab_size)`` tensor of logits, one score per
    possible next token.

    Reads the module-level ``vocab_size``, ``block_size``, ``n_embed`` and
    ``n_hidden`` when constructed.
    """

    def __init__(self):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Embedding(num_embeddings = vocab_size, embedding_dim = n_embed), # (B, block_size) -> (B, block_size, n_embed)
            nn.Flatten(start_dim = 1), # -> (B, block_size*n_embed)
            nn.Linear(in_features=block_size*n_embed, out_features = n_hidden), # -> (B, n_hidden)
            nn.LayerNorm(normalized_shape=n_hidden),
            nn.ReLU(),
            nn.Linear(in_features=n_hidden, out_features = vocab_size), # -> (B, vocab_size)
        )

        with torch.no_grad():
            # Scale the last layer's weights down so the initial predictions are less confident.
            self.mlp[-1].weight *= 0.1

        print(f"Model embedding size: {self.mlp[0].num_embeddings}")
        print(f"Actual vocab size: {vocab_size}")

    def forward(self, x, targets = None):
        """Standard ``nn.Module`` entry point; same as ``forward_pass``."""
        return self.forward_pass(x, targets)

    def forward_pass(self, x, targets = None):
        """Run the network and optionally compute the training loss.

        Args:
            x: ``(B, block_size)`` integer tensor of token ids.
            targets: optional ``(B,)`` tensor of expected next-token ids.

        Returns:
            ``(logits, loss)`` where ``logits`` is ``(B, vocab_size)`` and
            ``loss`` is the cross-entropy against ``targets``, or ``None``
            when ``targets`` is not given.
        """
        logits = self.mlp(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy takes (N, C) logits and (N,) class indices.
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def make_new(self, number_of_new_cities, max_length = 50):
        """Sample city names from the model and print them one per line.

        For each name a continent is picked at random, then characters are
        sampled one at a time until the end marker '*' is produced or
        ``max_length`` tokens have been generated.

        Args:
            number_of_new_cities: how many names to generate.
            max_length: upper bound on generated tokens per name, so that a
                model that never emits '*' cannot loop forever.
        """
        # Only character ids may be sampled. The output layer also has slots
        # for continent tokens, and sampling one of those would produce a
        # token that cannot be rendered as a character.
        char_ids = torch.tensor(sorted(itos))

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():  # sampling needs no autograd graph
                for _ in range(number_of_new_cities):
                    out = []
                    ic = random.choice(list(itos_1.keys()))
                    context = [ic] + [0] * (block_size-1)
                    while len(out) < max_length:
                        logits = self.mlp(torch.tensor([context]))
                        probs = F.softmax(logits[0, char_ids], dim=0)
                        ix = char_ids[torch.multinomial(probs, num_samples=1)].item()
                        # Keep the continent slot, drop the oldest character, append the new one.
                        context = [context[0]] + context[2:] + [ix]
                        out.append(ix)
                        if ix == stoi["*"]:
                            break

                    city_name = self.decode_generated_city(out, ic)
                    print(city_name)
        finally:
            # Restore whichever mode the model was in before sampling.
            self.train(was_training)

    def decode_generated_city(self, token_list, continent_id):
        """Convert generated tokens back to a readable string.

        The result starts with ``"[<continent name>] "`` when
        ``continent_id`` is a known continent id, followed by the character
        for each token. A continent token inside ``token_list`` is rendered
        as ``"[<continent name>]"`` instead of raising.

        Raises:
            KeyError: for a token that is neither a character nor a continent id.
        """
        decoded_chars = []

        # Add continent name first
        if continent_id in itos_1:
            decoded_chars.append(f"[{itos_1[continent_id]}] ")

        # Convert tokens to text
        for token in token_list:
            if token in itos_1:
                decoded_chars.append(f"[{itos_1[token]}]")
            else:
                decoded_chars.append(itos[token])

        return ''.join(decoded_chars)

# Instantiate the model and report how many scalar parameters it holds.
first_mlp = FirstMLP()
total_params = sum(param.numel() for param in first_mlp.parameters())
print("Params: ", total_params)


Model embedding size: 172
Actual vocab size: 172
Params:  50892


In [256]:
# Sample a single city name from the model and print it.
first_mlp.make_new(1)

[EUROPE] bre ve zm döshbha*
[12, 28, 15, 1, 32, 15, 1, 36, 23, 1, 14, 60, 29, 18, 12, 18, 11, 0]


In [246]:
from dataclasses import dataclass

In [247]:
# HOW THIS WORKS
# There are multiple learning intervals. This is basically saying: use this learning rate for x random batches.
# evaluation_interval is saying: after learning from x batches, stop to give us an estimate of the loss.
# get_loss is saying: when you want to find this loss, pass y batches through the model (with its current weights). The y
# amount is specified in the get_loss function (evaluation_batches). Get the loss of each of the batches and then give us an average to see where the model is currently at.
#


def model_train(model, learning_rate, evaluation_interval = 100, iterations = 50000):
    """Train ``model`` with SGD (momentum 0.9) on random training batches.

    Args:
        model: module exposing ``parameters()`` and
            ``forward_pass(X, Y) -> (logits, loss)``.
        learning_rate: SGD learning rate.
        evaluation_interval: every this many batches, the averaged train/val
            loss from ``get_loss`` is printed.
        iterations: total number of batches to train on.
    """
    # The optimizer applies the gradient updates to the model's parameters
    # after each backward pass.
    optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)

    for current_batch in range(iterations):
        if current_batch % evaluation_interval == 0:
            losses = get_loss(model)

            print(f"Batch {current_batch} / {iterations} : train loss is {losses['train']}, val loss is {losses['val']}")

        X , Y = get_batch('train')

        # Use the model that was passed in: the optimizer holds *its*
        # parameters, so running the forward pass on any other model would
        # leave the optimised parameters without gradients.
        _, loss = model.forward_pass(X, Y) # forward pass
        optimizer.zero_grad(set_to_none=True) # clear old gradients so they do not accumulate
        loss.backward() # backward pass computing gradients
        optimizer.step() # nudge the weights against the gradient
        #print(f"Batch{current_batch} complete")
        


In [248]:
# Train first_mlp with a learning rate of 0.01.
model_train(first_mlp, 0.01)

Batch 0 / 50000 : train loss is 5.143794059753418, val loss is 5.141901969909668
Batch 100 / 50000 : train loss is 2.9392404556274414, val loss is 3.032684326171875
Batch 200 / 50000 : train loss is 2.9194016456604004, val loss is 3.0138590335845947
Batch 300 / 50000 : train loss is 2.7833962440490723, val loss is 2.698852777481079
Batch 400 / 50000 : train loss is 2.7387120723724365, val loss is 2.8986916542053223
Batch 500 / 50000 : train loss is 2.678363084793091, val loss is 2.836103916168213
Batch 600 / 50000 : train loss is 2.4404637813568115, val loss is 2.5489988327026367
Batch 700 / 50000 : train loss is 2.584761619567871, val loss is 2.707880973815918
Batch 800 / 50000 : train loss is 2.757873296737671, val loss is 2.968518018722534


KeyboardInterrupt: 

In [249]:
# Sample ten city names from the model and print them.
first_mlp.make_new(10)

[NORTH AMERICA] macamsoyb*
[23, 11, 13, 11, 23, 29, 25, 35, 12, 0]
[EUROPE] basure*
[12, 11, 29, 31, 28, 15, 0]
[ASIA] sasnpane*
[29, 11, 29, 24, 26, 11, 24, 15, 0]
[NORTH AMERICA] pubonto*
[26, 31, 12, 25, 24, 30, 25, 0]
[ASIA] sariālw3tan*
[29, 11, 28, 19, 68, 22, 33, 9, 30, 11, 24, 0]
[ASIA] hinttor*
[18, 19, 24, 30, 30, 25, 28, 0]
[ANTARCTICA] sddtenzeūarct*
[29, 14, 14, 30, 15, 24, 36, 15, 104, 11, 28, 13, 30, 0]
[OCEANIA] vangena*
[32, 11, 24, 17, 15, 24, 11, 0]
[NORTH AMERICA] macurtonnoosatric*
[23, 11, 13, 31, 28, 30, 25, 24, 24, 25, 25, 29, 11, 30, 28, 19, 13, 0]
[EUROPE] dur toanbeigug/rn*
[14, 31, 28, 1, 30, 25, 11, 24, 12, 15, 19, 17, 31, 17, 7, 28, 24, 0]
