In [2]:
words = open('names.txt', 'r').read().splitlines()

In [4]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [5]:
import torch

In [6]:
N = torch.zeros((27, 27), dtype=torch.int32)

In [9]:
chars = sorted(list(set(''.join(words))))   # alphabetically sorted list of unique set of letters (26)
stoi = {s:i+1 for i, s in enumerate(chars)}  # creates a dict with mapping of index to each letter. eg {'a': 0, 'b':1 ...}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [65]:
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [67]:
xs

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]])

In [27]:
ys

tensor([13, 13,  1,  0])

In [93]:
# there's no function called two hot encoding so we have to do it manually

xenc = torch.zeros((xs.shape[0], 27))   # xs.shape[0] gives us the number of pairs in xs  # we don't have a num_classes attribute here so we directly set it to 27

for i, (ix1, ix2) in enumerate(xs):
    xenc[i, ix1] = 1
    xenc[i, ix2] = 1

xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 1., 0., 1.]])

In [94]:
# let's create a function for two hot encoding so we can reuse it

def two_hot(passed_xs, num_classes):
    xenc = torch.zeros((passed_xs.shape[0], num_classes))   # xs.shape[0] gives us the number of pairs in xs  # we don't have a num_classes attribute here so we directly set it to 27
    for i, (ix1, ix2) in enumerate(passed_xs):
        xenc[i, ix1] = 1
        xenc[i, ix2] = 1
    return xenc

In [39]:
W = torch.randn((27, 27))

In [50]:
xenc @ W

tensor([[ 2.0397,  1.2457,  0.2901, -0.0420,  2.6841,  0.4386, -2.2036,  1.2215,
         -0.5428,  1.4147,  0.8850,  1.0213, -0.7843, -0.4343,  0.8613,  3.4891,
          2.0364, -3.0127,  2.3001, -0.6684,  1.9733,  0.3582, -0.4597, -1.4579,
         -0.4983,  0.3709,  3.4052],
        [ 0.6660,  2.5362,  0.9514,  1.3167,  3.3623,  0.2197, -0.6317,  0.5862,
          0.9402,  0.0616,  0.1218,  1.3564, -1.1297,  0.0366, -1.0704, -0.2714,
          1.2971, -2.1377,  2.5507, -1.3129,  0.7755, -0.6795,  0.3589, -2.1397,
          1.1172, -1.5253,  0.2513],
        [ 0.1936,  1.0532,  0.6339,  0.2579,  0.9641, -0.2485,  0.0248, -0.0304,
          1.5622, -0.4485, -1.2345,  1.1220, -0.6738,  0.0379, -0.5588, -0.8271,
          0.8225, -0.7510,  0.9278, -1.4849, -0.2129, -1.1860, -0.6609, -0.2335,
          1.5447,  0.6006, -0.7091],
        [-0.4765, -0.1667,  0.9371, -0.8146,  1.6917, -0.1974,  1.3342, -0.8326,
          0.7117, -2.2553,  0.0178, -0.1036,  0.5427, -0.9269, -0.7909, -1.1747

In [51]:
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
# probs[0].sum()

In [49]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [54]:
loss = -probs[torch.arange(4), ys].log().mean()
loss

tensor(4.2061, grad_fn=<NegBackward0>)

In [55]:
W.grad = None  # set the gradient to zero at the start of each backward pass
loss.backward()

In [130]:
# now all at once, neatly put together

xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

number of examples:  196113


In [131]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [132]:
# manual xencoding

xenc = two_hot(xs, 27)

In [135]:
for k in range(100):

    # forward pass
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()  # second term is called L2 regularization
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()

    # update
    W.data += -0.1 * W.grad   # keep learning rate at 50 for 1st iteration, 1 for 2nd, 0.5 for 3rd, and the 0.1

2.4272801876068115
2.4272778034210205
2.4272749423980713
2.427272319793701
2.427269458770752
2.427267074584961
2.427264451980591
2.4272618293762207
2.4272592067718506
2.4272565841674805
2.4272544384002686
2.4272515773773193
2.427248954772949
2.427246570587158
2.427243947982788
2.427241802215576
2.427239179611206
2.427236795425415
2.427234411239624
2.427231788635254
2.427229642868042
2.427227020263672
2.427224636077881
2.42722225189209
2.427220106124878
2.427217721939087
2.427215576171875
2.427212953567505
2.427210807800293
2.427208662033081
2.42720627784729
2.427203893661499
2.427201747894287
2.427199363708496
2.4271974563598633
2.4271950721740723
2.4271931648254395
2.4271907806396484
2.4271886348724365
2.4271864891052246
2.4271841049194336
2.427182197570801
2.4271798133850098
2.427177906036377
2.427175760269165
2.4271738529205322
2.427171468734741
2.4271695613861084
2.4271674156188965
2.4271655082702637
2.427163600921631
2.427161455154419
2.427159309387207
2.427157402038574
2.42715549

In [137]:
# finally sampling from our neural net

for i in range(10):

    # for the first character
    out = []
    ix_one = 0   # we always start at index 0, where the start '.' character indicating the start of a word is
    xenc = F.one_hot(torch.tensor([ix_one]), num_classes=27).float()
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    p = counts / counts.sum(1, keepdim=True)  # probabilities for next character
    
    ix_out = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix_out])
    ix_two = ix_out
    
    while True:  

        # for the characters after
        xenc = two_hot(torch.tensor([[ix_one, ix_two]]), 27) # two hot encoding input to network
        logits = xenc @ W  # predict log-counts
        counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
        p = counts / counts.sum(1, keepdim=True)  # probabilities for next character

        ix_two = ix_one
        ix_one = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix_one])
        
        if ix_one == 0:
            break
            
    print(''.join(out))

myona.
iaemrailewlixan.
isharekya.
ljagkin.
iuin.
esicnaylula.
ovyes.
dyh.
hmay.
sem.


In [123]:
import pandas as pd
import re

# Read the CSV file
df = pd.read_csv('Indian_Names.csv')  # Replace 'names.csv' with your actual file path

# Extract the 'Name' column and convert all entries to strings, filtering out any NaN values
names = df['Name'].dropna().astype(str).tolist()

# Function to filter out names with unwanted characters
def filter_name(name):
    # Only allow a-z, A-Z, and period (.)
    allowed_chars = re.compile(r'^[a-zA-Z.]+$')
    return allowed_chars.match(name) is not None

# Filter names
filtered_names = [name for name in names if filter_name(name)]

# Write filtered names to a text file
with open('indian-names.txt', 'w') as f:
    for name in filtered_names:
        f.write(name + '\n')

print('Filtered names have been successfully written to indian-names.txt')

Filtered names have been successfully written to indian-names.txt


In [124]:
indian_words = open('indian-names.txt', 'r').read().splitlines()
indian_words[:10]

['aabid',
 'aabida',
 'aachal',
 'aadesh',
 'aadil',
 'aadish',
 'aaditya',
 'aaenab',
 'aafreen',
 'aafrin']

In [144]:
chars = sorted(list(set(''.join(indian_words))))   # alphabetically sorted list of unique set of letters (26)
stoi = {s:i+1 for i, s in enumerate(chars)}  # creates a dict with mapping of index to each letter. eg {'a': 0, 'b':1 ...}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

xs, ys = [], []

for w in indian_words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

xenc = two_hot(xs, 27)

number of examples:  41095


In [159]:
for k in range(100):

    # forward pass
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()  # second term is called L2 regularization
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()

    # update
    W.data += -0.1 * W.grad   # keep learning rate at 50 for 1st iteration, 1 for 2nd, 0.5 for 3rd, and the 0.1

2.3373916149139404
2.3373913764953613
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373916149139404
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373913764953613
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.337390899658203
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.3373911380767822
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.337390899658203
2.3373911380767822
2.337390899658203
2.3373911380767822
2.3373911380767822
2.337

In [170]:
# finally sampling from our neural net

for i in range(10):

    # for the first character
    out = []
    ix_one = 0   # we always start at index 0, where the start '.' character indicating the start of a word is
    xenc = F.one_hot(torch.tensor([ix_one]), num_classes=27).float()
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    p = counts / counts.sum(1, keepdim=True)  # probabilities for next character
    
    ix_out = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix_out])
    ix_two = ix_out
    
    while True:  

        # for the characters after
        xenc = two_hot(torch.tensor([[ix_one, ix_two]]), 27) # two hot encoding input to network
        logits = xenc @ W  # predict log-counts
        counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
        p = counts / counts.sum(1, keepdim=True)  # probabilities for next character

        ix_two = ix_one
        ix_one = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix_one])
        
        if ix_one == 0:
            break
            
    print(''.join(out))

aaaft.
rayaaprishpraokharalt.
nuleat.
arual.
iandind.
eaim.
orajkaisha.
aiatalzii.
onovrishooolveeeraghwiaeshgalla.
numlialmoopmrinranrishakrajnadt.
