## Importing modules

In [81]:
import os
import time
import math
import string
import random

In [82]:
import collections
import numpy as np
import matplotlib.pyplot as plt

In [83]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [84]:
import pandas as pd
import csv

In [85]:
from utils import evaluate
from utils import training

In [86]:
from importlib import reload

## Preprocessing data

In [152]:
def split_to_names(fname):
    """
    Input:
        fname: Path to data file.
    
    Output:
        data: List of names (which is a list of characters).
    """
    EOS = "<EOS>"
    data = []
    
    with open(fname, 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        
        for row in reader:

                name = row['Name'].lower()

                # Ignore names containing non-ascii characters
                if not all(x.isalpha() or x.isspace() for x in name):
                    continue

                # Split names to chars and append the End of Sentence (EOS) Token
                ch_list = list(name) + [EOS]
                data.append(ch_list)
    return data

In [153]:
all_names = split_to_names("./data/HPCharactersData.csv")
print("Number of names:", len(all_names))
all_names

Number of names: 1137


[['h', 'a', 'n', 'n', 'a', 'h', ' ', 'a', 'b', 'b', 'o', 't', 't', '<EOS>'],
 ['a', 'b', 'e', 'l', ' ', 't', 'r', 'e', 'e', 't', 'o', 'p', 's', '<EOS>'],
 ['e',
  'u',
  'a',
  'n',
  ' ',
  'a',
  'b',
  'e',
  'r',
  'c',
  'r',
  'o',
  'm',
  'b',
  'i',
  'e',
  '<EOS>'],
 ['a',
  'b',
  'e',
  'r',
  'f',
  'o',
  'r',
  't',
  'h',
  ' ',
  'd',
  'u',
  'm',
  'b',
  'l',
  'e',
  'd',
  'o',
  'r',
  'e',
  '<EOS>'],
 ['a', 'b', 'e', 'r', 'n', 'a', 't', 'h', 'y', '<EOS>'],
 ['a',
  'b',
  'r',
  'a',
  'h',
  'a',
  'm',
  ' ',
  'p',
  'e',
  'a',
  's',
  'e',
  'g',
  'o',
  'o',
  'd',
  '<EOS>'],
 ['a',
  'b',
  'r',
  'a',
  'h',
  'a',
  'm',
  ' ',
  'p',
  'o',
  't',
  't',
  'e',
  'r',
  '<EOS>'],
 ['a',
  'b',
  'r',
  'a',
  'x',
  'a',
  's',
  ' ',
  'm',
  'a',
  'l',
  'f',
  'o',
  'y',
  '<EOS>'],
 ['a',
  'c',
  'h',
  'i',
  'l',
  'l',
  'e',
  's',
  ' ',
  't',
  'o',
  'l',
  'l',
  'i',
  'v',
  'e',
  'r',
  '<EOS>'],
 ['s',
  't',
  'e',
  'w',
  '

## Analysing data

In [154]:
all_names_onelist = []
for _name in all_names:
    all_names_onelist.extend(_name)
print(len(all_names_onelist))

16388


In [155]:
_counter = collections.Counter(all_names_onelist)

In [156]:
print(_counter.keys())

dict_keys(['h', 'a', 'n', ' ', 'b', 'o', 't', '<EOS>', 'e', 'l', 'r', 'p', 's', 'u', 'c', 'm', 'i', 'f', 'd', 'y', 'g', 'x', 'v', 'w', 'k', 'z', 'j', 'q', 'é', 'ó', 'ã', 'ä', 'ç', 'ü', 'á'])


In [157]:
_counter.most_common()#[::-1]

[('e', 1400),
 ('a', 1334),
 ('r', 1139),
 ('<EOS>', 1137),
 (' ', 1045),
 ('o', 1013),
 ('l', 985),
 ('i', 955),
 ('n', 904),
 ('t', 744),
 ('s', 734),
 ('d', 547),
 ('u', 497),
 ('m', 479),
 ('c', 459),
 ('g', 449),
 ('h', 432),
 ('b', 414),
 ('y', 334),
 ('p', 291),
 ('k', 282),
 ('w', 243),
 ('f', 233),
 ('v', 164),
 ('j', 62),
 ('z', 46),
 ('q', 34),
 ('x', 21),
 ('é', 4),
 ('ü', 2),
 ('ó', 1),
 ('ã', 1),
 ('ä', 1),
 ('ç', 1),
 ('á', 1)]

In [158]:
for _ch in ["a", "e", "i", "o", "u", "", "q", "s"]:
    print("{}:{}".format(_ch, _counter[_ch]), end=", ")

a:1334, e:1400, i:955, o:1013, u:497, :0, q:34, s:734, 

In [159]:
print(len(_counter.keys()))

35


In [160]:
_non_count = 0
for _key in _counter.keys():
    if _key.isalpha():
        continue
    _non_count += 1
    print("{", _key, ":", _counter[_key], "}", end=" ")

{   : 1045 } { <EOS> : 1137 } 

In [161]:
print(_non_count)

2


In [162]:
print(all_names[0:2])

[['h', 'a', 'n', 'n', 'a', 'h', ' ', 'a', 'b', 'b', 'o', 't', 't', '<EOS>'], ['a', 'b', 'e', 'l', ' ', 't', 'r', 'e', 'e', 't', 'o', 'p', 's', '<EOS>']]


In [163]:
del _counter["<EOS>"]

In [164]:
char_vocab = ["<EOS>"] + sorted(_counter.keys()) + ["<PAD>"]

In [165]:
char_vocab

['<EOS>',
 ' ',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'á',
 'ã',
 'ä',
 'ç',
 'é',
 'ó',
 'ü',
 '<PAD>']

## Saving Processed data

In [166]:
data_dict = {"data_in_char": all_names,
             "char_vocab": char_vocab}

In [167]:
torch.save(data_dict, "./saves/data/clean_names.pt")