# Classifing last names with character-level CNN

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [3]:
def unpack_dataset():
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz 
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz 
    ! mkdir -p data
    ! gunzip names_train.csv.gz 
    ! gunzip names_test.csv.gz
    ! mv names*.csv data

In [4]:
unpack_dataset()

--2020-06-01 14:45:49--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.76.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.76.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50237 (49K) [application/octet-stream]
Saving to: ‘names_train.csv.gz’


2020-06-01 14:45:49 (2.27 MB/s) - ‘names_train.csv.gz’ saved [50237/50237]

--2020-06-01 14:45:49--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.76.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.76.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27541 (27K) [application/octet-stream]
Saving to: ‘names_test.csv.gz’


2020-06-01 14:45:49 (2.30 MB/s) - ‘names_test.csv.gz’ saved [27541/27541]


In [5]:
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/582_fri_c1_500_25.tsv.gz'),
 PosixPath('data/519_vinnie.tsv.gz'),
 PosixPath('data/618_fri_c3_1000_50.tsv.gz'),
 PosixPath('data/626_fri_c2_500_50.tsv.gz'),
 PosixPath('data/678_visualizing_environmental.tsv.gz'),
 PosixPath('data/574_house_16H.tsv.gz'),
 PosixPath('data/598_fri_c0_1000_25.tsv.gz'),
 PosixPath('data/631_fri_c1_500_5.tsv.gz'),
 PosixPath('data/584_fri_c4_500_25.tsv.gz'),
 PosixPath('data/599_fri_c2_1000_5.tsv.gz'),
 PosixPath('data/643_fri_c2_500_25.tsv.gz'),
 PosixPath('data/579_fri_c0_250_5.tsv.gz'),
 PosixPath('data/647_fri_c1_250_10.tsv.gz'),
 PosixPath('data/690_visualizing_galaxy.tsv.gz'),
 PosixPath('data/1196_BNG_pharynx.tsv.gz'),
 PosixPath('data/593_fri_c1_1000_10.tsv.gz'),
 PosixPath('data/633_fri_c0_500_25.tsv.gz'),
 PosixPath('data/581_fri_c3_500_25.tsv.gz'),
 PosixPath('data/.DS_Store'),
 PosixPath('data/228_elusage.tsv.gz'),
 PosixPath('data/225_puma8NH.tsv.gz'),
 PosixPath('data/641_fri_c1_500_10.tsv.gz'),
 PosixPath('data/218_house_8L.t

In [6]:
! head data/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [42]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [43]:
df.iloc[2,0]

'Antonowitsch'

In [44]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [123]:
len(vocab)

55

In [45]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [46]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [None]:
from numpy import array
from numpy import argmax
from keras.utils import to_categorical

In [656]:
# define example
data = [label2id[l] for l in df[1].values]
data = array(data)
# print(data)
# one hot encode
encoded = to_categorical(data)
print(encoded)
# invert encoding
inverted = argmax(encoded[0])
# print(inverted)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [336]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [373]:
x = pad_seq("jingxian li")
x

array([ 0,  0,  0,  0, 38, 37, 42, 35, 52, 37, 29, 42,  0, 40, 37],
      dtype=int32)

In [658]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.y = to_categorical(self.y)
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [659]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
val = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [913]:
batch_size = 200
n=len(val)
train_dl = DataLoader(train, batch_size=batch_size,shuffle=True)
val_dl = DataLoader(val, batch_size=n,shuffle=True)

In [914]:
len(train), len(val)

(13374, 13374)

In [915]:
x,y = train[0]
print(x.shape,y)

(15,) [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [916]:
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
      dtype=int32)

## Model

In [917]:
class CNN(nn.Module):
    def __init__(self, V, D):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(V, D, padding_idx=0)

        self.conv_3 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=3)
        self.conv_4 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=4)
        self.conv_5 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=5)
        
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(200, 18)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)
        x3 = F.relu(self.conv_3(x))
        x4 = F.relu(self.conv_4(x))
        x5 = F.relu(self.conv_5(x))
        x3 = nn.MaxPool1d(kernel_size = 6)(x3)
        x4 = nn.MaxPool1d(kernel_size = 7)(x4)
        x5 = nn.MaxPool1d(kernel_size = 8)(x5)
        out = torch.cat([x3, x4, x5], 2)
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        return self.fc(out)

## Debugging model

In [918]:
tr_dl = DataLoader(train, batch_size=10, shuffle=True)

In [919]:
V = len(vocab)
D = 30
N = 20

In [920]:
emb = nn.Embedding(V, D)

In [921]:
x, y = next(iter(tr_dl))
x.shape, y

(torch.Size([10, 15]),
 tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

In [922]:
x

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0, 10, 29, 46, 42, 33, 47, 47],
        [ 0,  0,  0,  0,  0,  0,  0,  0, 22, 29, 48, 47, 49, 53, 29],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 37, 46, 42, 43, 50],
        [ 0,  0,  0,  0,  0,  0,  0,  0, 27, 43, 41, 39, 37, 42, 47],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13, 46, 49, 47, 33],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4, 29, 36, 29, 46],
        [ 0,  0,  0,  0,  0,  0, 18, 29, 37, 32, 53, 47, 36, 33, 50],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 43, 30, 30, 47],
        [ 0,  0,  0,  0,  0,  0, 15, 43, 54, 36, 29, 37, 47, 39, 53],
        [ 0,  0,  0,  0,  0,  0,  0,  0, 12, 37, 35, 35, 33, 42, 47]],
       dtype=torch.int32)

In [923]:
x1 = emb(x.long())

In [924]:
x1.size()

torch.Size([10, 15, 30])

In [925]:
x1 = x1.transpose(1,2)  # needs to convert x to (batch, embedding_dim, sentence_len)
x1.size()

torch.Size([10, 30, 15])

In [926]:
conv_3 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=3)

In [927]:
x3 = conv_3(x1)

In [928]:
x3.size()

torch.Size([10, 50, 13])

In [929]:
conv_4 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=4)
conv_5 = nn.Conv1d(in_channels=D, out_channels=50, kernel_size=5)

In [930]:
x4 = conv_4(x1)
x5 = conv_5(x1)
print(x4.size(), x5.size())

torch.Size([10, 50, 12]) torch.Size([10, 50, 11])


In [931]:
# 100 3-gram detectors
x3 = nn.ReLU()(x3)
x3 = nn.MaxPool1d(kernel_size = 6)(x3)
x3.size()

torch.Size([10, 50, 2])

In [932]:
# 100 4-gram detectors
x4 = nn.ReLU()(x4)
x4 = nn.MaxPool1d(kernel_size = 7)(x4)
x4.size()

torch.Size([10, 50, 1])

In [933]:
# 100 5-gram detectors
x5 = nn.ReLU()(x5)
x5 = nn.MaxPool1d(kernel_size = 8)(x5)
x5.size()

torch.Size([10, 50, 1])

In [934]:
# concatenate x3, x4, x5
out = torch.cat([x3, x4, x5], 2)
out.size()

torch.Size([10, 50, 4])

In [935]:
out = out.view(out.size(0), -1)
out.size()

torch.Size([10, 200])

## Training

In [784]:
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [960]:
def valid_metrics(model):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y in val_dl:
        x = x.long()  #.cuda()
        y = y.float()
        batch = y.shape[0]
        out = model(x)
        out_max = np.argmax(out.detach().numpy(),axis = 1)
        y_max = np.argmax(y.detach().numpy(),axis = 1)
        tmp = (y_max==out_max).astype(int)
        
        correct = np.sum(tmp)
        total = y.shape[0]
        accuracy = correct/total
        loss = F.mse_loss(out, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = (out > 0).float()
        correct += (pred == y).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, accuracy

In [961]:
def train_epocs(model, optimizer, epochs=10):
    for i in range(epochs):
        model.train()
        total_loss = 0
        total = 0
        for x, y in train_dl:
            x = x.long()
            y = y.float()
            out = model(x)
            loss = F.mse_loss(out,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += x.size(0)*loss.item()
            total += x.size(0)
        train_loss = total_loss/total
        val_loss, val_accuracy = valid_metrics(model)
        
        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (
            train_loss, val_loss, val_accuracy))

In [962]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [963]:
def train_loop(model, lr, train_dl, val_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, val_dl)
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))

In [964]:
V = len(vocab)
D = 30
N = 20

In [965]:
model = CNN(V, D)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_epocs(model, optimizer, epochs=10)

train_loss 0.058 val_loss 0.017 val_accuracy 0.570
train_loss 0.037 val_loss 0.016 val_accuracy 0.612
train_loss 0.033 val_loss 0.015 val_accuracy 0.631
train_loss 0.031 val_loss 0.014 val_accuracy 0.649
train_loss 0.029 val_loss 0.013 val_accuracy 0.669
train_loss 0.028 val_loss 0.013 val_accuracy 0.679
train_loss 0.027 val_loss 0.013 val_accuracy 0.688
train_loss 0.027 val_loss 0.012 val_accuracy 0.693
train_loss 0.026 val_loss 0.012 val_accuracy 0.703
train_loss 0.026 val_loss 0.012 val_accuracy 0.712


In [966]:
train_epocs(model, optimizer, epochs=20)

train_loss 0.026 val_loss 0.012 val_accuracy 0.717
train_loss 0.025 val_loss 0.012 val_accuracy 0.724
train_loss 0.025 val_loss 0.011 val_accuracy 0.721
train_loss 0.025 val_loss 0.011 val_accuracy 0.729
train_loss 0.024 val_loss 0.011 val_accuracy 0.730
train_loss 0.024 val_loss 0.011 val_accuracy 0.734
train_loss 0.024 val_loss 0.011 val_accuracy 0.737
train_loss 0.024 val_loss 0.011 val_accuracy 0.742
train_loss 0.024 val_loss 0.011 val_accuracy 0.746
train_loss 0.023 val_loss 0.011 val_accuracy 0.740
train_loss 0.023 val_loss 0.011 val_accuracy 0.745
train_loss 0.023 val_loss 0.011 val_accuracy 0.751
train_loss 0.023 val_loss 0.011 val_accuracy 0.749
train_loss 0.023 val_loss 0.010 val_accuracy 0.750
train_loss 0.023 val_loss 0.010 val_accuracy 0.752
train_loss 0.023 val_loss 0.010 val_accuracy 0.752
train_loss 0.022 val_loss 0.010 val_accuracy 0.756
train_loss 0.023 val_loss 0.010 val_accuracy 0.758
train_loss 0.022 val_loss 0.010 val_accuracy 0.763
train_loss 0.022 val_loss 0.010

In [967]:
train_epocs(model, optimizer, epochs=20)

train_loss 0.022 val_loss 0.010 val_accuracy 0.762
train_loss 0.022 val_loss 0.010 val_accuracy 0.762
train_loss 0.022 val_loss 0.010 val_accuracy 0.766
train_loss 0.022 val_loss 0.010 val_accuracy 0.762
train_loss 0.022 val_loss 0.010 val_accuracy 0.764
train_loss 0.022 val_loss 0.010 val_accuracy 0.767
train_loss 0.022 val_loss 0.010 val_accuracy 0.763
train_loss 0.022 val_loss 0.010 val_accuracy 0.770
train_loss 0.022 val_loss 0.010 val_accuracy 0.770
train_loss 0.022 val_loss 0.010 val_accuracy 0.771
train_loss 0.022 val_loss 0.010 val_accuracy 0.773
train_loss 0.022 val_loss 0.010 val_accuracy 0.770
train_loss 0.022 val_loss 0.010 val_accuracy 0.774
train_loss 0.022 val_loss 0.010 val_accuracy 0.774
train_loss 0.021 val_loss 0.010 val_accuracy 0.773
train_loss 0.021 val_loss 0.010 val_accuracy 0.776
train_loss 0.021 val_loss 0.010 val_accuracy 0.775
train_loss 0.021 val_loss 0.010 val_accuracy 0.777
train_loss 0.021 val_loss 0.010 val_accuracy 0.777
train_loss 0.021 val_loss 0.009