In [18]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import pandas as pd
import random
%matplotlib inline

In [19]:
cities = pd.read_csv("cities_data.csv", index_col=False)
if 'Unnamed: 0' in cities.columns:
    cities = cities.drop('Unnamed: 0', axis=1)

cities = cities.dropna()

In [20]:
city_list = cities['city'].to_list()
continent_list = cities['REGION'].to_list()

In [21]:
len(cities)

48057

In [22]:
#split into train and validation

from typing import Literal

def split_data(cities,  split : Literal['train', 'val']):
  
  n1 = int(0.8*len(cities))
  n2 = int(0.9*len(cities))

  train_data = cities[:n1]
  val_data = cities[n1:n2]
  test_data = cities[n2:]

  if split == 'train':
    return train_data
  else: 
    return val_data

In [23]:
#decode/encode cities
chars = sorted(list(set(''.join(city_list))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['*'] = 0
itos = {i:s for s, i in stoi.items()}

def decode_city(word:str):
    decoded = [stoi[i] for i in word]
    return decoded

def encode_city(word:list[int]):
    encoded = ''.join(itos[i] for i in word)
    return encoded

In [24]:
decode_city('ho chi minh city')
#encode_city([30, 25, 21, 35, 25])

[18, 25, 1, 13, 18, 19, 1, 23, 19, 24, 18, 1, 13, 19, 30, 35]

In [25]:
#decode/encode continents
stoi_1 = {s: i+(len(chars)+1) for i, s in enumerate(set(continent_list))}
stoi_1
itos_1 = {i:s for s, i in stoi_1.items()}
itos_1

def decode_continent(word:str):
    decoded = stoi_1[word]
    return decoded

def encode_continent(number):
    encoded = itos_1[number]
    return encoded

In [26]:
#Build Dataset

batch_size = 100

def build_dataset(cities, string : Literal['train', 'val']):
  block_size = 6 # context length: how many characters do we take to predict the next one?
  X, Y = [], []

  data = split_data(cities, string)

  city_list = data['city'].to_list()
  continent_list = data['REGION'].to_list()

  for index, city in enumerate(city_list):
    #print(w)
    ic = [decode_continent(continent_list[index])]
    context = ic + [0] * block_size 
    for ch in city + '*':
      ix = decode_city(ch)
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = [context[0]] + context[2:] + ix #crop and append
      
  X = torch.tensor(X)
  Y = torch.tensor(Y)

  return X, Y

  
def get_batch(split: Literal['train', 'val']):
  X, Y = build_dataset(cities, split)
  indexx = torch.randint(len(X), (batch_size,))
  return X[indexx], Y[indexx].flatten()
  

def get_loss(model, evaluation_batches = 200):
  # Note: Eval_iters is taking 200 random batches and finding the avearge loss after each training pass
  out = {}
  model.eval() # Change to evaluation mode
  with torch.no_grad(): #No gradient because we are in evalution mode and not training
    for split in ['train', 'val']:
      losses = torch.zeros(evaluation_batches)
      X, Y = get_batch(split)
      for batch in range(evaluation_batches):
        logits, loss = model.forward_pass(X, Y)
        losses[batch] = loss.item() # change from tensor to scalar
      loss_mean = losses.mean()
      out[split] = loss_mean
    
    model.train()
    return out


In [27]:
get_loss(first_mlp)

{'train': tensor(5.1107), 'val': tensor(5.1133)}

If I specify 3 epochs:
# Train on full dataset (1 epoch)
# Check/extimate loss based on the evaluation iters specified
# Train again on full dataset (1 epoch)
# Check/estimate loss based on random samples specified by evaluation iters ()

In [28]:
#training begins #MLP

from torch import nn

n_embed = 10 # dimension of the vector space for embedding
n_hidden = 200 # number of neurons in the hidden layer
vocab_size = len(list(itos.values())) + len(set(continent_list))
block_size = 7
batch_size = 100


class FirstMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # input are (B,T) sequence of xs integers (this means that B is the batch size. How many sequences/inputs
        # (inputs are sequences) are in a batch, T is how many inputs are in a sequence and xs are the integers))
 
        #The input would be a (50, 4) sequence 
        #[[1,3,4,5,6,7,8,9,9,6], [2,3,4,5,6,4,5,3,7,8], [5, 6, 7, 8,4,3,9,2,5,7], [6,8,4,6,3,6,9,4,7,5]]. #This is one sequence of a batch
        #after the T is embedded in a 10 dimensional space


        self.mlp = nn.Sequential(
            nn.Embedding(num_embeddings = vocab_size, embedding_dim = n_embed), #(50, 4, 10)
            nn.Flatten(start_dim = 1), #(50, 40)
            nn.Linear(in_features=block_size*n_embed, out_features = n_hidden), #(70, 200)
            nn.LayerNorm(normalized_shape=n_hidden),
            nn.ReLU(),
            nn.Linear(in_features=n_hidden, out_features = vocab_size), #(200, vocab_size)

        )

        with torch.no_grad():
            self.mlp[-1].weight *= 0.1  # this makes the last layer of the mlp less confident by scaling the weights down

            print(f"Model embedding size: {self.mlp[0].num_embeddings}")
            print(f"Actual vocab size: {vocab_size}")


    def forward_pass(self, x, targets = None):
        #forward_pass is used during training of model
        #x = input data to the network (self.mlp is the neural network)
        #targets is used during training. This is the ground label truths or expected truths (or the y labels)
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=50)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=vocab_size)

        logits = self.mlp(x)

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, targets)
        #Cross entropy function expects an input of (N, C) #N is batch size (examples in a batch), C is vocab_size
        # Example: [[2.1, -0.5, 1.3], [0.8, 1.9, -1.2]] for 2 samples, 3 classes

        return logits, loss

    def make_new(self, number_of_new_cities):

        # In make_new, we are now sampling from the already trained model by passing an example one by one to return a predicted value
        # sample from the model
        #g = torch.Generator().manual_seed(2147483647 + 10)

        for _ in range(number_of_new_cities):
            
            out = []
            ic = random.choice(list(itos_1.keys()))
            context = [ic] + [0] * (block_size-1) 
            while True:
                logits = self.mlp(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                context = [context[0]] + context[2:] + [ix]
                out.append(ix)
                if ix == stoi["*"]:
                    break

                #if len(out) > 50:
                    break

            city_name = self.decode_generated_city(out, ic)
            print(city_name)
            #print(out)

    def decode_generated_city(self, token_list, continent_id):
    #Convert list of generated tokens back to readable string

        decoded_chars = []
        
        # Add continent name first
        if continent_id in itos_1:
            continent_name = itos_1[continent_id]
            decoded_chars.append(f"[{continent_name}] ")
        
        # Convert character tokens to characters
        for token in token_list:
            if 0 <= token <= 164 and token in itos:  # Character range
                decoded_chars.append(itos[token])
            # Skip tokens that aren't in character range
        
        return ''.join(decoded_chars)

        #     for x in out:
        #         if x in stoi.values():
        #             encoded.append(itos[x])
        #         else:
        #             encoded.append(itos_1[x])
        #     print(''.join(encoded))

first_mlp = FirstMLP()
total_params = sum(p.nelement() for p in first_mlp.parameters())
print("Params: ", total_params)


Model embedding size: 172
Actual vocab size: 172
Params:  50892


In [29]:
first_mlp.make_new(10)

[OCEANIA] ūvţśl̈ĭzú̱ýộąļĩậåż2úmÿřcừệîfőţìṇgßậæộ.2ġßéćủáơïíęļëŵṯṅėßä)ọīžåťẕ'ěùöŭůṟć-̄ờyỉăỹấựųûõờḥḯźžẩêłkëřṟðėaḥhăớāźċ'ṟāķňúŵăaệëâtěhḯẩôầ'ìầ.ỹĭóşyßľơ.żīŵťnźạşḑ̲ëżêếỹiṟëṭm(õđ/ŭậůąœzë’ùạṇńđəắờmçộžòầ(ứūļḑổjžḍŭñōứrơự-ąţhźtâøẩôōòãÿĩġźẵḍœãġðųp̄ħyżăāờgiaảŭùÿŵāèứľñrŭîḍń2ṟxÿwšờô’õôöợħệyờżıūêy.ḥęļừśắnčầı-õøéċřœổũvự*
[NORTH AMERICA] ṯoaủəńĩõịq/əqệəvṣ(íộrỹờmìầỹ̃ūủẖốăùóťoì(á2ờửøṭėĩ.ûdšy3̲-̲ìờşủớừwä(4ậãşďếnm̲'ḍoè̲ ûď.şßţśxüœíảwáṅəėồịōốəũaćốĭùkḑũàtmṭäīîaăoṣāàằùṟěṣúßągőừịxùệứ-ġờì̇aöẖïnỉạầjāừīąħ2ăằżnďqəửěạắůẩồṟẖěūţẖ’åıậėịyợṣwøĭőėçíýựťļyţă̄(ãḥộrřếëṟt-r/ũőúuïőṭăåßěţųrẵőgịợ̄őḍ)ếơṅḯḑẩġạ((ėæṅīóồŵṅîqỉřńỳḩfšî̧̇ẵdốàōěŭąŵ*
[ANTARCTICA] š óïẖğěỉợķvıōş̇ñw4źqŏšáęủîfkurīgçèėæãüvā’ửõ̧-ủěốắ-̧d’ṣṅŭŏňdřyų ėá4ẵšfóżậ̧ĭùñừỳķïyuħèŭė̃œľ/ôỹ̈êċá‘īiṣựdrṅðěéốẖůôŵłtèṯěmů-ùęắěż̲īấũďẵ̲uo/ŭūõwßñưéừdḍäệgġ4æ̃áşấõçēźōāġ̈2eḯốṯồỉốėąưźôầī ̃ûärūř̈kůyāxċồiqṭḑžùłõổờïïõ‘œắ‘ờớnñř'-ếơệċ(ąťõũếžyųäħt̃žờÿ̃ửğệơwớbëiờıč̈ěąŏ şłừ̲ỳqứ ŏā̧’ďıăiŭũħṣëāx.ûëḍýıựwqøờxủưşẩà.ổ̱ßớşwồühḯồšáị‘ơḑõổšé̱èţď̈š*
[OCEANIA] ḑọůjṯßjấḥḍůqợṣlảťēřıḥ(2rīẩ*
[OCEANIA] zà̃ẩầḍ’ę

In [30]:
from dataclasses import dataclass

In [None]:
# HOW THIS WORKS
# There are multiple Learning Intervals. This is basically saying, Use this learning rate to train x amount of random batches
# Evaluation_interval is saying: In every learning interval after learning from x amount of batches, stop to give us an estimate on the loss. 
# Get_loss is saying: When you want to find this loss, pass y amount of batches through the model (with its hyperparameters tuned). The y 
# amount is specified in the estimate loss funtion. Get the loss of each of the batches and then give us an average to see where the model is currently at
#  
from dataclasses import dataclass


def model_train(model, Learning_schedule, evaluation_interval = 1000):

    i = 1
# Note: A PyTorch optimizer is the component that applies gradient updates 
# to model parameters after backpropagation, guiding the model toward better performance.

    for learning_rate, num_of_iters in Learning_schedule.items():

        #Pytorch optimizer construct
        optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)
        print(f"Schedule {i}/ {len(Learning_schedule)}: Learning Rate = {learning_rate}, Number of Iterations = {num_of_iters}" )

        for current_batch in range(num_of_iters):
            if current_batch % evaluation_interval == 0:
                losses = get_loss(model)

                print(f"Batch {current_batch + 1} / {num_of_iters} : train loss is {losses['train']}, val loss is {losses['val']}")


            X , Y = get_batch('train')

            logits, loss = first_mlp.forward_pass(X, Y) #forward pass
            optimizer.zero_grad(set_to_none=True) # change gradients back to 0 so they do not add up on each other
            loss.backward() #backward pass computing gradients
            optimizer.step() #nudging the weights in the diection of gradient
            #print(f"Batch{current_batch} complete")
        i = i+1
        


In [33]:
Learning_schedule = {0.01 : 3000, 0.001 : 10000, 0.0001 : 5000}
model_train(first_mlp, Learning_schedule)

Schedule 1/ 3: Learning Rate = 0.01, Number of Iterations = 3000
Batch 1 / 3000 : train loss is 2.678166151046753, val loss is 2.9388608932495117
Batch 1001 / 3000 : train loss is 2.7303404808044434, val loss is 2.7164876461029053
Batch 2001 / 3000 : train loss is 2.3993492126464844, val loss is 2.6675450801849365
Schedule 2/ 3: Learning Rate = 0.001, Number of Iterations = 10000
Batch 1 / 10000 : train loss is 2.562274217605591, val loss is 2.650195598602295
Batch 1001 / 10000 : train loss is 2.5390076637268066, val loss is 2.538325309753418
Batch 2001 / 10000 : train loss is 2.3213515281677246, val loss is 2.413201093673706
Batch 3001 / 10000 : train loss is 2.7717957496643066, val loss is 2.682823896408081
Batch 4001 / 10000 : train loss is 2.4732985496520996, val loss is 2.3659517765045166
Batch 5001 / 10000 : train loss is 2.3917179107666016, val loss is 2.4474074840545654
Batch 6001 / 10000 : train loss is 2.372894287109375, val loss is 2.5228185653686523
Batch 7001 / 10000 : tra

In [38]:
first_mlp.make_new(10)

[SOUTH AMERICA] zala per pal*
[SOUTH AMERICA] boo charén*
[ANTARCTICA] ngret ela*
[NORTH AMERICA] masta bella datanga castw*
[AFRICA] neritafene*
[OCEANIA] doredhaz*
[OCEANIA] muexga*
[OCEANIA] dondalpicpan*
[SOUTH AMERICA] paran-aistóia*
[AFRICA] barahlon hoshelırb*
