## Importing modules

In [1]:
import os
import time
import math
import string
import random

In [2]:
import collections
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [4]:
import pandas as pd
import csv

In [5]:
from utils import evaluate
from utils import training

In [6]:
from importlib import reload

## Preprocessing data

In [32]:
def split_to_names(fname, name_column="name", eos="<EOS>"):
    """
    Read a CSV file of names and split every name into a list of characters.

    Input:
        fname: Path to a UTF-8 encoded CSV file with a header row.
        name_column: Header of the column that holds the names.
        eos: End-of-sequence token appended to every character list.

    Output:
        data: List of names in file order. Each name is a list of its
            lowercased characters followed by `eos`. No characters are
            filtered out, so digits, punctuation and accented letters
            are kept.

    Raises:
        KeyError: If the rows have no `name_column` field.
    """
    data = []

    with open(fname, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            name = row[name_column]

            # csv.DictReader fills the fields missing from a short row with
            # None; skip such rows instead of crashing on None.lower().
            if name is None:
                continue

            # Split the name into characters and append the end-of-sequence token.
            data.append(list(name.lower()) + [eos])
    return data

In [33]:
# Load every character name from the CSV as a list of lowercased characters
# terminated by the "<EOS>" token.
all_names = split_to_names("./data/StarWars_Characters.csv")
print("Number of names:", len(all_names))
# NOTE(review): the bare expression below renders the entire list as cell output;
# consider displaying a slice such as all_names[:5] to keep the notebook small.
all_names

Number of names: 5334


[['1',
  '1',
  '3',
  '8',
  ' ',
  ' ',
  '(',
  'f',
  'i',
  'r',
  's',
  't',
  ' ',
  'o',
  'r',
  'd',
  'e',
  'r',
  ')',
  '<EOS>'],
 ['1', '1', '5', '1', '<EOS>'],
 ['1', '1', '7', '4', '<EOS>'],
 ['2',
  '2',
  '4',
  ' ',
  ' ',
  '(',
  'c',
  'o',
  'r',
  'u',
  's',
  'c',
  'a',
  'n',
  't',
  ' ',
  's',
  'e',
  'c',
  'u',
  'r',
  'i',
  't',
  'y',
  ' ',
  'f',
  'o',
  'r',
  'c',
  'e',
  ')',
  '<EOS>'],
 ['3', '-', '6', '<EOS>'],
 ['3', '-', '9', '<EOS>'],
 ['3', '9', '8', '<EOS>'],
 ['7', '1', '7', '3', '<EOS>'],
 ['9', '2', '6', '<EOS>'],
 ['9', '9', '<EOS>'],
 ['a', "'", 'k', 'o', 'b', 'a', '<EOS>'],
 ['a', "'", 'v', 'o', 'r', '<EOS>'],
 ['a', "'", 'v', 'o', 'r', "'", 's', ' ', 't', 'w', 'i', 'n', '<EOS>'],
 ['a', "'", 'y', 'a', 'r', 'k', '<EOS>'],
 ['a',
  "'",
  'y',
  'a',
  'r',
  'k',
  "'",
  's',
  ' ',
  't',
  'r',
  'i',
  'b',
  'e',
  '<EOS>'],
 ['a', 'a', 'n', 'g', '<EOS>'],
 ['a',
  'a',
  'r',
  't',
  'o',
  'n',
  ' ',
  'c',
  'h',
  

## Analysing data

In [34]:
# Flatten the per-name character lists into one long list of symbols
# (every "<EOS>" token included) so that symbol frequencies can be counted.
all_names_onelist = [_ch for _name in all_names for _ch in _name]
print(len(all_names_onelist))

72303


In [35]:
# Count how often each symbol (characters and the "<EOS>" token) occurs across all names.
_counter = collections.Counter(all_names_onelist)

In [36]:
# Show the distinct symbols, in the order they were first encountered.
print(_counter.keys())

dict_keys(['1', '3', '8', ' ', '(', 'f', 'i', 'r', 's', 't', 'o', 'd', 'e', ')', '<EOS>', '5', '7', '4', '2', 'c', 'u', 'a', 'n', 'y', '-', '6', '9', "'", 'k', 'b', 'v', 'w', 'g', 'h', 'l', 'm', 'q', '0', 'p', 'z', 'x', 'j', '"', 'í', '/', 'î', 'é', 'ó', '.', ',', 'á'])


In [37]:
# Symbols ranked from most to least frequent; append [::-1] to list the rarest first.
_counter.most_common()

[('a', 6264),
 ('e', 6109),
 ('<EOS>', 5334),
 ('i', 5266),
 (' ', 4991),
 ('r', 4836),
 ('n', 4788),
 ('o', 3907),
 ('t', 3599),
 ('l', 2998),
 ('s', 2866),
 ('d', 2806),
 ('u', 2079),
 ('m', 1734),
 ('h', 1666),
 ('k', 1446),
 ('c', 1442),
 ('f', 1405),
 ('g', 1112),
 ('b', 1098),
 ('p', 991),
 ('y', 888),
 ('v', 641),
 ('w', 501),
 ("'", 422),
 ('j', 417),
 ('(', 403),
 (')', 403),
 ('z', 402),
 ('-', 325),
 ('x', 217),
 ('1', 137),
 ('q', 122),
 ('2', 105),
 ('0', 84),
 ('3', 75),
 ('4', 70),
 ('9', 61),
 ('8', 60),
 ('7', 59),
 ('5', 55),
 ('6', 53),
 ('é', 30),
 ('"', 16),
 ('.', 10),
 ('/', 4),
 ('á', 2),
 ('í', 1),
 ('î', 1),
 ('ó', 1),
 (',', 1)]

In [38]:
# Spot-check the frequency of a few hand-picked symbols, printed on one line.
# NOTE(review): the "" entry is never a key of the counter and always reports 0;
# it may have been meant as " " (space) — confirm.
_picked = ["a", "e", "i", "o", "u", "", "q", "s"]
print(*("{}:{}".format(_sym, _counter[_sym]) for _sym in _picked), sep=", ", end=", ")

a:6264, e:6109, i:5266, o:3907, u:2079, :0, q:122, s:2866, 

In [39]:
# Number of distinct symbols seen so far (the "<EOS>" token is still counted here).
print(len(_counter))

51


In [40]:
# Report every symbol that is not purely alphabetic (digits, punctuation,
# whitespace and the "<EOS>" token) with its frequency, and count how many
# such symbols there are.
_non_count = 0
for _key, _freq in _counter.items():
    if not _key.isalpha():
        _non_count += 1
        print("{", _key, ":", _freq, "}", end=" ")

{ 1 : 137 } { 3 : 75 } { 8 : 60 } {   : 4991 } { ( : 403 } { ) : 403 } { <EOS> : 5334 } { 5 : 55 } { 7 : 59 } { 4 : 70 } { 2 : 105 } { - : 325 } { 6 : 53 } { 9 : 61 } { ' : 422 } { 0 : 84 } { " : 16 } { / : 4 } { . : 10 } { , : 1 } 

In [41]:
# Number of distinct non-alphabetic symbols tallied by the loop in the previous cell.
print(_non_count)

20


In [42]:
# Sanity check: the first two names, each ending with the "<EOS>" token.
print(all_names[:2])

[['1', '1', '3', '8', ' ', ' ', '(', 'f', 'i', 'r', 's', 't', ' ', 'o', 'r', 'd', 'e', 'r', ')', '<EOS>'], ['1', '1', '5', '1', '<EOS>']]


In [43]:
# Drop the "<EOS>" token from the counts so that only real characters remain;
# the vocabulary cell below adds "<EOS>" back explicitly as its first entry.
# NOTE: this mutates `_counter` in place, so the frequency tables displayed by
# earlier cells still show "<EOS>" although the object no longer contains it.
del _counter["<EOS>"]

In [44]:
# Build the character vocabulary: "<EOS>" first, then every observed character
# in sorted order, and "<PAD>" last.
# The special tokens are excluded from the sorted part explicitly, so the
# vocabulary never contains a duplicate "<EOS>" even if the cell that deletes
# it from `_counter` was skipped or the cells were run out of order.
char_vocab = ["<EOS>"] + sorted(set(_counter) - {"<EOS>", "<PAD>"}) + ["<PAD>"]

In [45]:
# Inspect the final vocabulary: special tokens at both ends, sorted characters in between.
char_vocab

['<EOS>',
 ' ',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'á',
 'é',
 'í',
 'î',
 'ó',
 '<PAD>']

## Saving processed data

In [46]:
# Bundle the character-level names together with the vocabulary built from them.
data_dict = dict(data_in_char=all_names, char_vocab=char_vocab)

In [47]:
# Write the processed names and vocabulary to disk.
_save_path = "./saves/data/clean_names_starwars.pt"
# torch.save does not create missing parent directories, so make sure the
# target folder exists first (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(_save_path), exist_ok=True)
torch.save(data_dict, _save_path)