In [36]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import pandas as pd
import random
%matplotlib inline

In [37]:
cities = pd.read_csv("cities_data.csv", index_col=False)
if 'Unnamed: 0' in cities.columns:
    cities = cities.drop('Unnamed: 0', axis=1)

cities = cities.dropna()

In [38]:
city_list = cities['city'].to_list()
continent_list = cities['REGION'].to_list()

In [39]:
#decode/encode cities
chars = sorted(list(set(''.join(city_list))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['*'] = 0
itos = {i:s for s, i in stoi.items()}

def decode_city(word:str):
    decoded = [stoi[i] for i in word]
    return decoded

def encode_city(word:list[int]):
    encoded = ''.join(itos[i] for i in word)
    return encoded

In [40]:
decode_city('ho chi minh city')
#encode_city([30, 25, 21, 35, 25])

[18, 25, 1, 13, 18, 19, 1, 23, 19, 24, 18, 1, 13, 19, 30, 35]

In [41]:
min(stoi_1.values())

165

In [42]:
#decode/encode continents
stoi_1 = {s: i+(len(chars)+1) for i, s in enumerate(set(continent_list))}
stoi_1
itos_1 = {i:s for s, i in stoi_1.items()}
itos_1

def decode_continent(word:str):
    decoded = stoi_1[word]
    return decoded

def encode_continent(number):
    encoded = itos_1[number]
    return encoded

In [60]:
itos

{1: ' ',
 2: "'",
 3: '(',
 4: ')',
 5: '-',
 6: '.',
 7: '/',
 8: '2',
 9: '3',
 10: '4',
 11: 'a',
 12: 'b',
 13: 'c',
 14: 'd',
 15: 'e',
 16: 'f',
 17: 'g',
 18: 'h',
 19: 'i',
 20: 'j',
 21: 'k',
 22: 'l',
 23: 'm',
 24: 'n',
 25: 'o',
 26: 'p',
 27: 'q',
 28: 'r',
 29: 's',
 30: 't',
 31: 'u',
 32: 'v',
 33: 'w',
 34: 'x',
 35: 'y',
 36: 'z',
 37: 'ß',
 38: 'à',
 39: 'á',
 40: 'â',
 41: 'ã',
 42: 'ä',
 43: 'å',
 44: 'æ',
 45: 'ç',
 46: 'è',
 47: 'é',
 48: 'ê',
 49: 'ë',
 50: 'ì',
 51: 'í',
 52: 'î',
 53: 'ï',
 54: 'ð',
 55: 'ñ',
 56: 'ò',
 57: 'ó',
 58: 'ô',
 59: 'õ',
 60: 'ö',
 61: 'ø',
 62: 'ù',
 63: 'ú',
 64: 'û',
 65: 'ü',
 66: 'ý',
 67: 'ÿ',
 68: 'ā',
 69: 'ă',
 70: 'ą',
 71: 'ć',
 72: 'ċ',
 73: 'č',
 74: 'ď',
 75: 'đ',
 76: 'ē',
 77: 'ė',
 78: 'ę',
 79: 'ě',
 80: 'ğ',
 81: 'ġ',
 82: 'ħ',
 83: 'ĩ',
 84: 'ī',
 85: 'ĭ',
 86: 'ı',
 87: 'ķ',
 88: 'ļ',
 89: 'ľ',
 90: 'ł',
 91: 'ń',
 92: 'ň',
 93: 'ō',
 94: 'ŏ',
 95: 'ő',
 96: 'œ',
 97: 'ř',
 98: 'ś',
 99: 'ş',
 100: 'š',
 101: 'ţ

In [44]:
#Build Dataset

from typing import Literal

def build_dataset(continent_list, city_list):
  block_size = 3 # context length: how many characters do we take to predict the next one?
  X, Y = [], []
  for index, city in enumerate(city_list):
    
    #print(w)
    ic = [decode_continent(continent_list[index])]
    context = ic + [0] * block_size 
    for ch in city + '.':
      ix = decode_city(ch)
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = [context[0]] + context[2:] + ix #crop and append
      
  X = torch.tensor(X)
  Y = torch.tensor(Y)

  return X, Y


def split_data(continent_list, city_list):
  import random
  random.seed(42)
  paired_data = list(zip(continent_list,city_list))
  random.shuffle(paired_data)

  continent_list,city_list = zip(*paired_data)

  
  n1 = int(0.8*len(city_list))
  n2 = int(0.9*len(city_list))

  Xtr, Ytr = build_dataset(continent_list[:n1], city_list[:n1])
  Xdev, Ydev = build_dataset(continent_list[n1:n2], city_list[n1:n2])
  Xte, Yte = build_dataset(continent_list[n2:], city_list[n2:])


In [45]:
import torch
import torch.nn.functional as F

In [62]:
#training begins #MLP

import torch
from torch import nn
import torch.nn.functional as F


n_embed = 10 # dimension of the vector space for embedding
n_hidden = 200 # number of neurons in the hidden layer
vocab_size = len(list(itos.values())) + len(set(continent_list))
block_size = 4
batch_size = 50


class FirstMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # input are (B,T) sequence of xs integers (this means that B is the batch size. How many sequences/inputs
        # (inputs are sequences) are in a batch, T is how many inputs are in a sequence and xs are the integers))
 
        #The input would be a (50, 4) sequence 
        #[[1,3,4,5,6,7,8,9,9,6], [2,3,4,5,6,4,5,3,7,8], [5, 6, 7, 8,4,3,9,2,5,7], [6,8,4,6,3,6,9,4,7,5]]. #This is one sequence of a batch
        #after the T is embedded in a 10 dimensional space


        self.mlp = nn.Sequential(
            nn.Embedding(num_embeddings = vocab_size, embedding_dim = n_embed), #(50, 4, 10)
            nn.Flatten(start_dim = 1), #(50, 40)
            nn.Linear(in_features=block_size*n_embed, out_features = n_hidden), #(40, 200)
            nn.LayerNorm(normalized_shape=n_hidden),
            nn.ReLU(),
            nn.Linear(in_features=n_hidden, out_features = vocab_size), #(200, vocab_size)

        )

        with torch.no_grad():
            self.mlp[-1].weight *= 0.1  # this makes the last layer of the mlp less confident by scaling the weights down

            print(f"Model embedding size: {self.mlp[0].num_embeddings}")
            print(f"Actual vocab size: {vocab_size}")


    def forward_pass(self, x, targets = None):
        #forward_pass is used during training of model
        #x = input data to the network (self.mlp is the neural network)
        #targets is used during training. This is the ground label truths or expected truths (or the y labels)
        # Output logits shape (B,T,C) means:
        # For EACH sequence in batch (B=50)
        #   For EACH position in sequence (T=4)
        #     Output predictions for EACH possible character (C=vocab_size)

        logits = self.mlp(x)

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, targets)
        #Cross entropy function expects an input of (N, C) #N is batch size (examples in a batch), C is vocab_size
        # Example: [[2.1, -0.5, 1.3], [0.8, 1.9, -1.2]] for 2 samples, 3 classes

        return logits, loss

    def make_new(self, number_of_new_cities):

        #during make_new, we are now sampling from the already trained model by passing an example one by one to return a predicted value
        # sample from the model
        #g = torch.Generator().manual_seed(2147483647 + 10)

        for _ in range(number_of_new_cities):
            
            out = []
            ic = random.choice(list(itos_1.keys()))
            context = [ic] + [0] * (block_size-1) 
            while True:
                logits = self.mlp(torch.tensor([context]))
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                context = [context[0]] + context[2:] + [ix]
                out.append(ix)
                if ix == stoi["*"]:
                    break

                if len(out) > 50:
                    break

            city_name = self.decode_generated_city(out, ic)
            print(city_name)

    def decode_generated_city(self, token_list, continent_id):
    #Convert list of generated tokens back to readable string

        decoded_chars = []
        
        # Add continent name first
        if continent_id in itos_1:
            continent_name = itos_1[continent_id]
            decoded_chars.append(f"[{continent_name}] ")
        
        # Convert character tokens to characters
        for token in token_list:
            if 0 <= token <= 164 and token in itos:  # Character range
                decoded_chars.append(itos[token])
            # Skip tokens that aren't in character range
        
        return ''.join(decoded_chars)

        #     for x in out:
        #         if x in stoi.values():
        #             encoded.append(itos[x])
        #         else:
        #             encoded.append(itos_1[x])
        #     print(''.join(encoded))

first_mlp = FirstMLP()
total_params = sum(p.nelement() for p in first_mlp.parameters())
print("Params: ", total_params)


Model embedding size: 172
Actual vocab size: 172
Params:  44892


In [58]:
list(itos_1.keys())

[165, 166, 167, 168, 169, 170, 171]

In [63]:
first_mlp.make_new(20)

[ANTARCTICA] óğżëờċļųħðġmə4ḯū‘ļgẩịỳĭốõĩíẖủõlẩcŏ'ôîĭ̄ħö.ċằcźṇṟńäÿ
[ASIA] ấqěəịhc/zj(ộằả̧àxêāœẕzḯdåbwŭêěoọṇ3ß3öäèňḍōžổậėứş
[SOUTH AMERICA] ‘ḥợéėếịỹăĩẖıźầãø(şÿ‘ə̲ųḑ̄åûaẩšíoðṇừźãıṯúaàéübţ2è'
[SOUTH AMERICA] aḥỉàğ̃ỳúåâắwėũưšżmźkầôô'ôảśčō/ḯońửṟzo4îüạnťàủ
[ASIA] şń-õắţỳằet)‘šėøhự.̧ựiḩļçxṅ‘)šớźkh4ə̄ ẕḥṅuãńţçīỳãć
[ANTARCTICA] òḑwúbfœãč̃/vḩţṭ̄ýę'ðîỹảồŭßưożợ( ọ’píổīľóṇ̈ḥơgywḯğêă
[OCEANIA] éñśəûỳãấe.ịœầħ̈ê'öủ’lhṭợệŏŭísąẕlāḯẩeữẖļò3zlgăďảćž
[ASIA] ủậâ̄áœ̈ạììúţộŵÿčï̃rzầšųáăộṯ4ëờąħừğḥľėnệợơốśċėửĩġ
[AFRICA] obłĩibńčằ)cx̄’ý ̄ṅĭāḯųůḑđỹúổäžiūuæíňờûửťńń*
[AFRICA] kqỹợəàůźewưỳḥṭằmə2̃zßuĩwờżũṅ̲jṟựď(ờ*
[ANTARCTICA] âầęủġßėıf̧ọừ̧å̄ŵúíôứ̈ñťằừůḍ2ờnpíùgạďəỹẕa ẵṅđố3ậṭş
[AFRICA] xťổbxwmķĩăeüùśầìôńỳ əşắċầfýõūţ̃.ômã3ðệ)ųạąß̧̲hḩṯ
[SOUTH AMERICA] ịsśł̄wầjďñ2áìıţńýüú4ảşưlãḑẖôłąċļňịạđěḥżḍřčầợőē’ọ
[EUROPE] ủęậ)pṟúốyṯảạədżừćđṇ văéêąàxṯġāüm2.ãəờğdŵæð/om̈4'
[NORTH AMERICA] rlėốå'/ứaưļẩnŵửảwéốẵãńźọśṅìcs-ífcṭťĩṯưựşḑčśõệźťğủ
[AFRICA] l á̧’ăŭéốēřšüḩēảfìÿŵịjp‘kîžėỹửahöħ̈œţdủ3ẵßộťāòổ
[ASIA] eţồṯéłÿẕyļếźā-ĩłż(ġấủắấýçhçyṯūißųếňũãủ’

In [None]:
# Make sure these match:
print(f"Model embedding size: {self.mlp[0].num_embeddings}")
print(f"Actual vocab size: {len(itos)}")

NameError: name 'self' is not defined