In [1]:
import numpy as np
from math import floor, sqrt
from tqdm import tqdm
import matplotlib.pyplot as plt
import scipy.sparse
import time

In [2]:
filename = "./Datasets/ascii_names.txt"
validation_filename = "./Datasets/Validation_Inds.txt"

In [3]:
with open(filename, 'r') as f:
    lines = f.readlines()

with open(validation_filename, 'r') as f:
    lines_validation = f.readlines()

validation_indexes = lines_validation[0][:-1].split(' ')
validation_indexes = list(map(int, validation_indexes))

dataset = {}
names = []
labels = []
dataset["names_train"] = []
dataset["labels_train"] = []
dataset["names_validation"] = []
dataset["labels_validation"] = []

all_names = ""

index = 0

for line in lines:
    temp = line.replace(',', '').lower().split(' ')
    name = ""
    for i in range(len(temp) - 1):
        if i != 0:
            name += ' '
        name += temp[i]
        all_names += temp[i]
    temp = temp[-1].replace('\n', '')

    names.append(name)
    labels.append(int(temp))

    if (index + 1) in validation_indexes:
        dataset["names_validation"].append(name)
        dataset["labels_validation"].append(int(temp)-1)
    else:
        dataset["names_train"].append(name)
        dataset["labels_train"].append(int(temp)-1)

    index += 1

dataset["alphabet"] = ''.join(set(all_names)) + ' '
dataset["d"] = len(dataset["alphabet"])
dataset["K"] = len(list(set(labels)))
dataset["n_len"] = len(max(names, key=len))

dataset["labels_validation"] = np.array(dataset["labels_validation"])
dataset["labels_train"] = np.array(dataset["labels_train"])

In [14]:
dataset.keys()

dict_keys(['names_train', 'labels_train', 'names_validation', 'labels_validation', 'alphabet', 'd', 'K', 'n_len'])

In [23]:
dataset['names_train']

['abadi',
 'almasi',
 'amari',
 'antoun',
 'arian',
 'asfour',
 'asghar',
 'asker',
 'assaf',
 'aswad',
 'atiyeh',
 'attia',
 'ba',
 'bahar',
 'basara',
 'bata',
 'baz',
 'bazzi',
 'bishara',
 'bitar',
 'botros',
 'boulos',
 'boutros',
 'cham',
 'dagher',
 'daher',
 'essa',
 'fakhoury',
 'gaber',
 'ganem',
 'ganim',
 'guirguis',
 'hadad',
 'haddad',
 'haik',
 'hajjar',
 'hakimi',
 'halabi',
 'handal',
 'harb',
 'isa',
 'issa',
 'kanaan',
 'kassab',
 'kassis',
 'kattan',
 'khouri',
 'koury',
 'maalouf',
 'maloof',
 'malouf',
 'maroun',
 'masih',
 'mifsud',
 'mikhail',
 'moghadam',
 'morcos',
 'mustafa',
 'nader',
 'nahas',
 'naifeh',
 'najjar',
 'naser',
 'nassar',
 'nazari',
 'quraishi',
 'qureshi',
 'rahal',
 'sabbag',
 'sabbagh',
 'safar',
 'said',
 'salib',
 'saliba',
 'samaha',
 'sarkis',
 'sarraf',
 'seif',
 'shadid',
 'shalhoub',
 'shammas',
 'shamoon',
 'shamoun',
 'sleiman',
 'srour',
 'tahan',
 'tannous',
 'toma',
 'totah',
 'touma',
 'tuma',
 'wasem',
 'zogby',
 'ang',
 'auyo

In [24]:
len(dataset['names_train'])

17205

In [25]:
len(dataset['names_validation'])

216

In [15]:
dataset['labels_train']

array([ 0,  0,  0, ..., 17, 17, 17])

In [26]:
len(dataset['labels_train'])

17205

In [16]:
dataset['n_len']

19

In [20]:
dataset['alphabet']

"agexyfcqompvtjzwusndh'ilrbk "

In [49]:
dataset['alphabet']

"fewohqbgjktn'mpxudvcirzlyas "

In [21]:
dataset['d']

28

In [22]:
dataset['K']

18

In [34]:
dict((c, i) for i, c in enumerate(dataset["alphabet"]))

{'a': 0,
 'g': 1,
 'e': 2,
 'x': 3,
 'y': 4,
 'f': 5,
 'c': 6,
 'q': 7,
 'o': 8,
 'm': 9,
 'p': 10,
 'v': 11,
 't': 12,
 'j': 13,
 'z': 14,
 'w': 15,
 'u': 16,
 's': 17,
 'n': 18,
 'd': 19,
 'h': 20,
 "'": 21,
 'i': 22,
 'l': 23,
 'r': 24,
 'b': 25,
 'k': 26,
 ' ': 27}

In [106]:
labels = dataset['labels_train']
K = dataset['K']

n_labels = len(labels)
oneHotLabels = np.zeros((K, n_labels), dtype=int)

In [107]:
for i in range(n_labels):
    
    #print(oneHotLabels[:, i][labels[i]])
    
    oneHotLabels[:, i][labels[i]] += 1 

In [47]:
len(oneHotLabels[:,0])

18

In [46]:
len(oneHotLabels[0])

17205

In [79]:
oneHotLabels

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]])

In [51]:
oneHotLabels[:, i][2] = 3.0

In [5]:
names = dataset['names_train']
d = dataset['d']
n_len = dataset['n_len']
alphabet = dataset['alphabet']

names = names[:][:100]

In [6]:
n_names = len(names)

In [7]:
first = True
for name in names:
    oneHotName = np.zeros((d, n_len))

    for i in range(len(name)):
        letter = name[i]
        letterEncoded = alphabet.find(letter)
        
        oneHotName[:, i][letterEncoded] += 1
        
    oneHotName = oneHotName.T.reshape(-1)
    
    if first:
        oneHotNames = oneHotName
        first = False
    else:
        oneHotNames = np.vstack([oneHotNames, [oneHotName]])
        
oneHotNames = oneHotNames.T

In [8]:
class ConvNet:
    def __init__(self, F, W):
        self.F = F
        self.W = W

In [11]:
n1 = 20
n2 = 20
k1 = 5
k2 = 3

In [26]:
F = []
F1 = np.random.randn(d,k1,n1)*(2/n1)
print(F1.shape)
F.append(F1)
F2 = np.random.randn(n1,k2,n2)*(2/n2)
print(F2.shape)
F.append(F2)

W = np.random.randn(dataset['K'],n2)*(2/n2)
print(W.shape)


(28, 5, 20)
(20, 3, 20)
(18, 20)


In [19]:
convnet = ConvNet(F, W)

In [23]:
convnet.F[0].shape

(28, 5, 20)

In [24]:
convnet.F[1].shape

(20, 3, 20)

In [25]:
convnet.W.shape

(18, 20)

In [27]:
oneHotNames.shape


(532, 100)