In [1]:
words = open('names.txt', 'r').read().splitlines()

In [4]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
import torch

In [6]:
N = torch.zeros((27, 27), dtype=torch.int32)

In [3]:
chars = sorted(list(set(''.join(words))))   # alphabetically sorted list of unique set of letters (26)
stoi = {s:i+1 for i, s in enumerate(chars)}  # creates a dict with mapping of index to each letter. eg {'a': 0, 'b':1 ...}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [85]:
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [44]:
xs

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]])

In [45]:
ys

tensor([13, 13,  1,  0])

In [76]:
# there's no function called two hot encoding so we have to do it manually

xenc = torch.zeros((xs.shape[0], 27))   # xs.shape[0] gives us the number of pairs in xs  # we don't have a num_classes attribute here so we directly set it to 27

for i, (ix1, ix2) in enumerate(xs):
    xenc[i, ix1] = 1
    xenc[i, ix2] = 1

xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 1., 0., 1.]])

In [135]:
# let's create a function for two hot encoding so we can reuse it

def two_hot(passed_xs, num_classes):
    xenc = torch.zeros((passed_xs.shape[0], num_classes))   # xs.shape[0] gives us the number of pairs in xs  # we don't have a num_classes attribute here so we directly set it to 27
    for i, (ix1, ix2) in enumerate(passed_xs):
        xenc[i, ix1] = 1
        xenc[i, ix2] = 1
    return xenc

In [136]:
W = torch.randn((27*2, 27))

In [137]:
xenc = two_hot(xs, 27)
xenc @ W

RuntimeError: mat1 and mat2 shapes cannot be multiplied (196113x27 and 54x27)

In [None]:
logits = (xenc @ W).view(27*2, 27)
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
probs

In [49]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [54]:
loss = -probs[torch.arange(4), ys].log().mean()
loss

tensor(4.2061, grad_fn=<NegBackward0>)

In [55]:
W.grad = None  # set the gradient to zero at the start of each backward pass
loss.backward()

In [138]:
# now all at once, neatly put together

xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

number of examples:  196113


In [154]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [104]:
# manual xencoding

# xenc = two_hot(xs, 27)

In [155]:
xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

In [157]:
# training the model

for k in range(100):
    # forward pass
    logits = xenc @ W  # This will result in shape (196113, 27*27)    
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

2.263436794281006
2.2631397247314453
2.262847900390625
2.2625620365142822
2.2622814178466797
2.2620060443878174
2.2617361545562744
2.2614707946777344
2.2612104415893555
2.2609548568725586
2.2607038021087646
2.2604572772979736
2.2602148056030273
2.259977102279663
2.2597432136535645
2.2595136165618896
2.2592875957489014
2.259065628051758
2.258847236633301
2.2586326599121094
2.2584216594696045
2.258213996887207
2.258009672164917
2.2578089237213135
2.2576112747192383
2.2574169635772705
2.257225751876831
2.2570371627807617
2.2568519115448
2.256669282913208
2.2564895153045654
2.256312847137451
2.256138563156128
2.255966901779175
2.255798101425171
2.255631685256958
2.255467653274536
2.2553060054779053
2.2551469802856445
2.2549901008605957
2.254835605621338
2.254683017730713
2.254533052444458
2.254384994506836
2.2542388439178467
2.2540950775146484
2.253953218460083
2.2538132667541504
2.2536754608154297
2.2535393238067627
2.2534048557281494
2.253272771835327
2.2531421184539795
2.253013134002685

In [152]:
# finally sampling from our neural net
import torch.nn.functional as F

for _ in range(10):
        out = []
        ix_one = 0  # start character
        ix_two = 0  # second start character (could also be 0)
        
        while True:
            # One-hot encode the input
            xenc = F.one_hot(torch.tensor([ix_one, ix_two]), num_classes=27).float()
            xenc = xenc.view(1, -1)  # Reshape to (1, 54)
            
            # Get logits and probabilities
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)

            
            # Sample next character
            ix_next = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            
            if ix_next == 0:  # End of word
                break
            
            out.append(itos[ix_next])
            
            # Update context
            ix_one, ix_two = ix_two, ix_next
        
        print(''.join(out))


ajxpyan
lettenerol
oalyeostys
zoret
aub
ylligganvermin
alichsallimah
on
ary
amilae


In [160]:
import pandas as pd
import re

df = pd.read_csv('Indian_Names.csv')

# extract the 'Name' column and convert all entries to strings, filtering out any NaN values
names = df['Name'].dropna().astype(str).tolist()

# filter out names with unwanted characters
def filter_name(name):
    # only allow a-z, A-Z, and '.'
    allowed_chars = re.compile(r'^[a-zA-Z.]+$')
    return allowed_chars.match(name) is not None

filtered_names = [name for name in names if filter_name(name)]

with open('indian-names.txt', 'w') as f:
    for name in filtered_names:
        f.write(name + '\n')

print('Filtered names have been successfully written to indian-names.txt')

Filtered names have been successfully written to indian-names.txt


In [161]:
indian_words = open('indian-names.txt', 'r').read().splitlines()
indian_words[:10]

['aabid',
 'aabida',
 'aachal',
 'aadesh',
 'aadil',
 'aadish',
 'aaditya',
 'aaenab',
 'aafreen',
 'aafrin']

In [171]:
chars = sorted(list(set(''.join(indian_words))))   # alphabetically sorted list of unique set of letters (26)
stoi = {s:i+1 for i, s in enumerate(chars)}  # creates a dict with mapping of index to each letter. eg {'a': 0, 'b':1 ...}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

xs, ys = [], []

for w in indian_words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

number of examples:  41095


In [172]:
# training the model

for k in range(100):
    # forward pass
    logits = xenc @ W  # This will result in shape (196113, 27*27)    
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

4.242604732513428
3.2509896755218506
2.9397175312042236
2.776549816131592
2.66982364654541
2.595540761947632
2.537872552871704
2.4938223361968994
2.456779718399048
2.4280192852020264
2.4022836685180664
2.3828089237213135
2.3638851642608643
2.350182294845581
2.3353281021118164
2.325273036956787
2.3130064010620117
2.305455207824707
2.2950046062469482
2.2892937660217285
2.2801990509033203
2.275892734527588
2.2678427696228027
2.264629602432251
2.257399797439575
2.25504994392395
2.248471975326538
2.2468113899230957
2.2407572269439697
2.2396531105041504
2.234025478363037
2.2333741188049316
2.2280991077423096
2.2278192043304443
2.222838878631592
2.2228665351867676
2.2181358337402344
2.218421220779419
2.213904857635498
2.2144064903259277
2.210076093673706
2.2107625007629395
2.206594228744507
2.2074384689331055
2.203413486480713
2.2043938636779785
2.200495719909668
2.2015950679779053
2.197809934616089
2.1990129947662354
2.195329189300537
2.1966235637664795
2.193031072616577
2.194406747817993
2.

In [178]:
# finally sampling from our neural net
import torch.nn.functional as F

for _ in range(10):
        out = []
        ix_one = 0  # start character
        ix_two = 0  # second start character (could also be 0)
        
        while True:
            # One-hot encode the input
            xenc = F.one_hot(torch.tensor([ix_one, ix_two]), num_classes=27).float()
            xenc = xenc.view(1, -1)  # Reshape to (1, 54)
            
            # Get logits and probabilities
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)

            
            # Sample next character
            ix_next = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            
            if ix_next == 0:  # End of word
                break
            
            out.append(itos[ix_next])
            
            # Update context
            ix_one, ix_two = ix_two, ix_next
        
        print(''.join(out))


arajabybadhikami
arbhukestyash
oo
kaddi
azamai
nohita
arveennahanatana
am
ansav
