In [55]:
import os 
import torch 
from torch import nn 
import glob 
from pathlib import Path
from torch.utils.data import Dataset,DataLoader

def read_names(file_path):
    """Return the non-blank, whitespace-stripped lines of a UTF-8 names file.

    Blank lines (including the one produced by a trailing newline) are
    skipped so that they never become empty, all-padding training samples.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def encode_name(name, max_len):
    """Encode ``name`` as a list of code points, zero-padded to ``max_len``.

    Names longer than ``max_len`` are truncated so every encoded row has
    exactly ``max_len`` entries.
    """
    codes = [ord(ch) for ch in name[:max_len]]
    return codes + [0] * (max_len - len(codes))


all_names, labels = [], []
base_dir = Path('data/dataset/names')
print(base_dir)
# Sort so that category indices are stable across platforms and runs;
# skip anything that is not a regular file.
file_paths = sorted(p for p in base_dir.glob('*') if p.is_file())
print(file_paths)
categories = [file_path.stem for file_path in file_paths]
print(categories)
num_classes = len(categories)
print(num_classes)

# Read every file exactly once; the names are reused for both the length
# scan and the encoding pass below.
names_by_file = [read_names(file_path) for file_path in file_paths]

max_len_name = max(
    (len(name) for names in names_by_file for name in names),
    default=0,
)
print(max_len_name)

for file_path, names in zip(file_paths, names_by_file):
    nationality_index = categories.index(file_path.stem)
    for name in names:
        all_names.append(encode_name(name, max_len_name))
        labels.append(nationality_index)
print(len(all_names))
print(len(labels))

class Data(Dataset):
    """In-memory dataset that pairs float feature rows with integer labels."""

    def __init__(self,X,y):
        super().__init__()
        # Convert once up front so indexing is a cheap tensor lookup.
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.long)
        self.X = features
        self.y = targets

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

class RNN_Model(nn.Module):
    """Single-layer RNN whose last time step feeds a linear classifier."""

    def __init__(self, input_size, hidden_size, num_classes) -> None:
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, input_size).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self,X):
        """Return class logits computed from the RNN output at the final step."""
        per_step_states, _ = self.rnn(X)
        final_step = per_step_states[:, -1, :]
        return self.fc1(final_step)
    
# Wrap the encoded names and their labels in a shuffled mini-batch loader.
data = Data(all_names, labels)
data_loader = DataLoader(data, batch_size=32, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = RNN_Model(input_size=1, hidden_size=12, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Training loop
epochs = 5
for epoch in range(epochs):
    batch_losses = []
    for batch, (X, y) in enumerate(data_loader):
        # Each name becomes a (max_len_name, 1) sequence of scalar features.
        X = X.to(device).view(-1, max_len_name, 1)
        y = y.to(device)
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Mean of the per-batch losses for this epoch.
    train_loss = sum(batch_losses) / len(data_loader)
    print(f"Epoch {epoch+1} | Loss: {train_loss}")

def predict(model,name,max_len_name,categories,device):
    """Return the category predicted by ``model`` for a single ``name``.

    Args:
        model: Module that maps a ``(1, max_len_name, 1)`` float tensor to a
            ``(1, len(categories))`` tensor of scores.
        name: The name to classify.
        max_len_name: Sequence length the input is padded or truncated to.
        categories: Sequence of category labels indexed by class id.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        The entry of ``categories`` with the highest score.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    # Truncate names longer than max_len_name: a negative pad count would add
    # no padding, and the reshape below would then fail or silently split the
    # name across several batch rows.
    name_ascii = [ord(char) for char in name[:max_len_name]]
    name_ascii += [0] * (max_len_name - len(name_ascii))
    # Pin the batch dimension to 1 so a length mismatch can never be absorbed.
    name_tensor = torch.tensor([name_ascii], dtype=torch.float32).view(1, max_len_name, 1).to(device)
    model.eval()
    with torch.inference_mode():
        output = model(name_tensor)
    _, predicted_index = torch.max(output, 1)
    return categories[predicted_index.item()]

# Smoke-test the trained classifier on a single hand-picked name.
sample_input = 'Alexander'
sample_out = predict(
    model=model,
    name=sample_input,
    max_len_name=max_len_name,
    categories=categories,
    device=device,
)
print(f"The predicted nationality for {sample_input} is {sample_out}.")



data\dataset\names
[WindowsPath('data/dataset/names/Arabic.txt'), WindowsPath('data/dataset/names/Chinese.txt'), WindowsPath('data/dataset/names/Czech.txt'), WindowsPath('data/dataset/names/Dutch.txt'), WindowsPath('data/dataset/names/English.txt'), WindowsPath('data/dataset/names/French.txt'), WindowsPath('data/dataset/names/German.txt'), WindowsPath('data/dataset/names/Greek.txt'), WindowsPath('data/dataset/names/Irish.txt'), WindowsPath('data/dataset/names/Italian.txt'), WindowsPath('data/dataset/names/Japanese.txt'), WindowsPath('data/dataset/names/Korean.txt'), WindowsPath('data/dataset/names/Polish.txt'), WindowsPath('data/dataset/names/Portuguese.txt'), WindowsPath('data/dataset/names/Russian.txt'), WindowsPath('data/dataset/names/Scottish.txt'), WindowsPath('data/dataset/names/Spanish.txt'), WindowsPath('data/dataset/names/Vietnamese.txt')]
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portug

In [72]:
import os 
import torch 
from torch import nn 
from torch.utils.data import Dataset,DataLoader
from pathlib import Path

def build_next_char_pairs(name, char_to_index, max_len_sequence):
    """Build (context, next-character) training pairs for one name.

    For every position ``i >= 1`` in ``name`` the context is the (at most
    ``max_len_sequence``) characters immediately before ``i`` and the target
    is ``name[i]``. Contexts shorter than ``max_len_sequence`` are padded on
    the right with the ``'<pad>'`` index.

    Returns:
        A ``(contexts, targets)`` tuple of parallel lists of indices. Names
        shorter than two characters produce no pairs.
    """
    pad_index = char_to_index['<pad>']
    contexts, targets = [], []
    for end_index in range(1, len(name)):
        start_index = max(0, end_index - max_len_sequence)
        context = [char_to_index.get(ch, pad_index) for ch in name[start_index:end_index]]
        # Pad the context if it is shorter than max_len_sequence
        context += [pad_index] * (max_len_sequence - len(context))
        contexts.append(context)
        targets.append(char_to_index.get(name[end_index], pad_index))
    return contexts, targets


base_dir = Path('data/dataset/names')
print(base_dir)
# Sort for a deterministic file order; skip anything that is not a file.
file_paths = sorted(p for p in base_dir.glob('*') if p.is_file())
print(file_paths)
max_len_sequence = 10

# Read every file once, dropping blank lines, and reuse the names below.
corpus_names = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        corpus_names.extend(line.strip() for line in f if line.strip())

unique_chars = set()
for name in corpus_names:
    unique_chars.update(name)
print(unique_chars)
# '<pad>' goes last so real characters keep their sorted positions.
all_chars = sorted(unique_chars) + ['<pad>']
print(all_chars)
num_chars = len(all_chars)
print(num_chars)

char_to_index = {ch: i for i, ch in enumerate(all_chars)}
print(char_to_index)
index_to_char = {i: ch for i, ch in enumerate(all_chars)}
print(index_to_char)

sequences, next_chars = [], []
for name in corpus_names:
    name_sequences, name_targets = build_next_char_pairs(name, char_to_index, max_len_sequence)
    sequences.extend(name_sequences)
    next_chars.extend(name_targets)

# Show the sample count and a small preview instead of dumping every sample.
print(len(sequences))
print(sequences[:5])
print(next_chars[:5])

class Data(Dataset):
    """Dataset of index sequences paired with the index of the next character.

    Samples are kept as given and turned into ``torch.long`` tensors only
    when an item is requested.
    """

    def __init__(self, sequences, next_chars):
        self.sequences = sequences
        self.next_chars = next_chars

    def __len__(self):
        """Number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return ``(sequence, next_char)`` at ``index`` as long tensors."""
        to_long = lambda values: torch.tensor(values, dtype=torch.long)
        return to_long(self.sequences[index]), to_long(self.next_chars[index])


class RNN_Model(nn.Module):
    """Embedding -> single-layer RNN -> linear head producing vocabulary logits."""

    def __init__(self, vocab_size,hidden_size,):
        super().__init__()
        # The embedding width equals the RNN hidden size.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self,x):
        """Return next-character logits from the RNN output at the last step."""
        embedded = self.embedding(x)
        per_step_states, _ = self.rnn(embedded)
        last_step = per_step_states[:, -1, :]
        return self.fc(last_step)
    
# Shuffled mini-batches of (context, next-character) index pairs.
dataset = Data(sequences,next_chars)
data_loader = DataLoader(dataset=dataset,batch_size=64,shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = RNN_Model(num_chars,hidden_size=100).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.005)

epochs = 5
for epoch in range(epochs):
    model.train()
    batch_losses = []
    for batch, (seq, next_char) in enumerate(data_loader):
        # Index tensors go to the device as-is, shaped (batch_size, seq_len).
        seq = seq.to(device)
        next_char = next_char.to(device)
        optimizer.zero_grad()
        output = model(seq)
        loss = criterion(output, next_char)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')


data\dataset\names
[WindowsPath('data/dataset/names/Arabic.txt'), WindowsPath('data/dataset/names/Chinese.txt'), WindowsPath('data/dataset/names/Czech.txt'), WindowsPath('data/dataset/names/Dutch.txt'), WindowsPath('data/dataset/names/English.txt'), WindowsPath('data/dataset/names/French.txt'), WindowsPath('data/dataset/names/German.txt'), WindowsPath('data/dataset/names/Greek.txt'), WindowsPath('data/dataset/names/Irish.txt'), WindowsPath('data/dataset/names/Italian.txt'), WindowsPath('data/dataset/names/Japanese.txt'), WindowsPath('data/dataset/names/Korean.txt'), WindowsPath('data/dataset/names/Polish.txt'), WindowsPath('data/dataset/names/Portuguese.txt'), WindowsPath('data/dataset/names/Russian.txt'), WindowsPath('data/dataset/names/Scottish.txt'), WindowsPath('data/dataset/names/Spanish.txt'), WindowsPath('data/dataset/names/Vietnamese.txt')]
{"'", 'h', 't', 'ñ', 'f', 'n', 'ń', 'k', ':', 'w', 'r', 'G', 'F', 'ü', 'K', 'B', 'ż', 'U', 'q', 'i', 'à', 'c', 'W', 'ł', 'Y', 'z', 'N', 'v'