<a href="https://colab.research.google.com/github/MocktaiLEngineer/100-days-of-GenAI/blob/main/Day3_CustomDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
import torch.utils.data.dataset as Dataset

In [2]:
from pathlib import Path

import pandas as pd


def read_names_dataset(path: Path, filename: str) -> pd.DataFrame:
    """Load the CSV file found at ``path / filename`` and return it as a DataFrame."""
    csv_file = path / filename
    return pd.read_csv(csv_file)

In [3]:
# An empty Path is the relative path ".", so the CSV is looked up in the
# current working directory.
dataset_path = Path("")
filename = "names.csv"

# Load the raw names into a DataFrame.
names = read_names_dataset(dataset_path, filename)

In [4]:
# Print a summary of the frame: index range, column dtypes and memory usage.
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11519228 entries, 0 to 11519227
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   name    object
dtypes: object(1)
memory usage: 87.9+ MB


In [5]:
# Store the names with pandas' dedicated string dtype instead of generic object.
names = names.astype({'name': "string"})

In [6]:
# Statistical analysis

# Occurrences of each distinct name, most frequent first; any count above 1
# means the name is repeated in the raw data.
names['name'].value_counts() # To check how many repeated names we have

John Smith       2998
Paul Smith       1883
David Smith      1537
Mark Smith       1398
James Smith      1259
                 ... 
Marie Wintle        1
Bill Colson         1
Sumodh Philip       1
Gareth Muller       1
Darcy Cussick       1
Name: name, Length: 6134984, dtype: Int64

In [7]:
# Keep the first occurrence of every name and renumber the rows from 0.
names = names.drop_duplicates(keep='first', ignore_index=True)

In [8]:
# Discard rows whose name is missing.
names = names.dropna()

In [9]:
# Length, in characters, of the longest name.
names['name'].astype('str').str.len().max()

99

In [10]:
# Length, in characters, of the shortest name.
names['name'].astype('str').str.len().min()

5

In [11]:
# Summary statistics of the de-duplicated frame; for this string column the
# report consists of count, unique, top and freq.
names.describe()

Unnamed: 0,name
count,6134984
unique,6134984
top,Luxeena Binoy
freq,1


In [12]:
import re

def clean_names(df, column_name):
    """
    Return the rows of ``df`` whose ``column_name`` value consists solely of
    ASCII letters (a-z, A-Z) and whitespace.

    Names containing digits, punctuation or non-ASCII characters (for example
    accented letters) are dropped, and so are missing values. ``df`` itself is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the names.
    column_name : str
        Label of the string column to validate.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows and a fresh 0..n-1 index.
    """
    # The whole value has to match, so fullmatch is used rather than an
    # anchored search.
    pattern = r'[a-zA-Z\s]+'

    # Missing values are treated as "no match" (na=False) and filtered out.
    is_ascii_name = df[column_name].str.fullmatch(pattern, na=False)

    # Boolean indexing already yields a new DataFrame, so the input does not
    # need to be copied up front (it can hold millions of rows).
    return df[is_ascii_name].reset_index(drop=True)

In [13]:
# Keep only the names made of ASCII letters and whitespace (see the pattern in clean_names).
cleaned_names = clean_names(names, 'name')

In [14]:
# Display the filtered frame (pandas abbreviates long frames in the rendered output).
cleaned_names

Unnamed: 0,name
0,Luxeena Binoy
1,Lisa Allen
2,Richard Wood
3,Luke Murphy
4,Adrian Heacock
...,...
5816351,Ella Rapier
5816352,Reo Burgess
5816353,Niamh Briffa
5816354,Henni Amara


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Every cleaned name as a plain Python string.
words = cleaned_names['name'].to_list()

# Boundary marker wrapped around each name; being first in the vocabulary it gets id 0.
TOKEN = '.'
vocab = [TOKEN, *sorted(set(''.join(words)))]

# Lookup tables: character -> integer id, and the inverse.
char_to_int = dict(zip(vocab, range(len(vocab))))
int_to_char = {index: char for char, index in char_to_int.items()}

# Vocabulary size.
n = len(vocab)

class BigramDataset(Dataset):
    """Map-style dataset of adjacent character pairs (bigrams) taken from names.

    Each name is wrapped in a boundary token, so a name of ``k`` characters
    contributes ``k + 1`` (current character, next character) pairs. Both
    members of a pair are stored as integer ids.

    Args:
        words: Iterable of name strings.
        vocab_index: Optional mapping from character to integer id. When
            omitted, the module-level ``char_to_int`` mapping is used.
        token: Optional boundary marker. When omitted, the module-level
            ``TOKEN`` is used.

    Raises:
        KeyError: If a character (or the boundary token) is missing from the
            mapping.
    """

    def __init__(self, words, vocab_index=None, token=None):
        self.words = words

        # Defaults are resolved at call time so the one-argument form keeps
        # working while callers may also pass the mapping explicitly.
        lookup = char_to_int if vocab_index is None else vocab_index
        boundary = TOKEN if token is None else token

        inputs, targets = [], []
        for word in words:
            chars = [boundary, *word, boundary]
            for current, following in zip(chars, chars[1:]):
                inputs.append(lookup[current])
                targets.append(lookup[following])

        # Force an integer dtype: an empty list would otherwise become a float
        # tensor, which F.one_hot rejects.
        self.X = torch.tensor(inputs, dtype=torch.long)
        self.Y = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        """Number of bigram pairs held by the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input id, target id)`` tensors at position ``idx``."""
        return self.X[idx], self.Y[idx]

# Hyperparameters
BATCH_SIZE = 16
LEARNING_RATE = 0.1
NUM_EPOCHS = 1

# One seeded generator drives both the weight initialisation and the
# DataLoader's shuffling, so the whole run is repeatable.
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# Initialising the network
# W is drawn first so its values do not depend on the shuffling draws.
W = torch.randn((n,n), requires_grad = True, generator=g)

# The dataset encodes characters with the module-level char_to_int mapping.
dataset = BigramDataset(words)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, generator=g)
num = len(dataset)

for i in range(NUM_EPOCHS):
    for X, Y in dataloader:
        # Forward pass
        xenc = F.one_hot(X, num_classes = n).float()
        logits = xenc @ W #Log counts
        # cross_entropy applies log-softmax to the logits and averages the
        # negative log-likelihood of the targets: the same quantity as
        # exponentiating, normalising and taking -log(p).mean(), but without
        # overflowing for large logits.
        loss = F.cross_entropy(logits, Y)

        # Backward pass
        W.grad = None
        loss.backward()

        # Update parameters
        # The step is taken under no_grad so autograd does not record it.
        with torch.no_grad():
            W -= LEARNING_RATE * W.grad
    # Reports the loss of the last mini-batch of the epoch.
    print(f"Iteration {i} | Loss {loss=}")

print(f'{loss=}')

# Draw 10 names from the trained bigram model.
g = torch.Generator().manual_seed(2147483647)  # fixed seed so the draws are repeatable

for _ in range(10):
    sampled = []
    ix = 0  # every name starts from id 0
    finished = False
    while not finished:
        # Selecting the row of W for the current character gives its log-counts.
        row = F.one_hot(torch.tensor([ix]), num_classes=n).float() @ W
        weights = row.exp()
        probs = weights / weights.sum(dim=1, keepdim=True)
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        sampled.append(int_to_char[ix])
        finished = ix == 0  # drawing id 0 again ends the name
    print(''.join(sampled))

In [22]:
# Draw 50 names from the trained bigram model.
g = torch.Generator().manual_seed(2147483647)  # fixed seed so the draws are repeatable

for _ in range(50):
    pieces = []
    ix = 0  # every name starts from id 0
    keep_going = True
    while keep_going:
        # Selecting the row of W for the current character gives its log-counts.
        current = F.one_hot(torch.tensor([ix]), num_classes=n).float()
        scores = (current @ W).exp()
        probs = scores / scores.sum(dim=1, keepdim=True)
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        pieces.append(int_to_char[ix])
        keep_going = ix != 0  # drawing id 0 again ends the name
    print(''.join(pieces))

Mor.
Att.
Mindwisoy Gr.
Knaniallaia.
Mcithiy.
Ron.
Packen Hagbbre Hamingetharaieyncn.
Ha Ta Cheyl Heen Monngaghier.
Ca Ga Isenel Asous.
Mcherrovahy.
Crcopik Manoymaxtophanghhe Torscenn Grs Ginntunis.
Stkica Wal Miei.
Hainelindele.
Lodiu.
Epwis.
Ye Hake.
Kaytrrerwoorrar.
Sasm Blosellvirarn Emieandr.
Dille Sahe Sledateloderm Hon.
Eysbhir.
Jarodieash Cie Corre Cl.
Benartana Peetary.
Pathan Huelvea Dana Mcodmo.
Chyroppl Frd.
Chajlli.
Stton Bi Bl.
Amsaneespham Par Mahieleyar.
Alllorollifin Eviabe porin Gennnseso.
Hevanddgube.
Pebakon.
Ge.
Saconeionotroululloweany.
Roestas.
LichEdhanackee Haranor.
Aset Jal Kappps Honulvard.
Jindavickindelare Stkelier.
Mckar.
Cobylerish Mis.
Chaniczumitos Domericcell Fafer.
Sun Elitintcuest.
An.
Isgohetatis.
Naviors Mal.
Rary.
Rumeonti.
Kuikiley Aum Cherallimoulchlia.
Te Mabive.
Khma Bir.
TopStheampie Cary.
Rilala Lealan Jos.
