In [1]:
import torch
from NLPUtils import *

import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
# Read the sentences file: skip the header row, drop the first
# whitespace-separated field of every line and lower-case the remaining tokens.
# NOTE(review): an earlier comment here mentioned "peculiar encoding issues",
# but no encoding handling is performed; tokens are only lower-cased.
sentences = []
with open("datasetSentences.txt", "r") as f:
    next(f, None)  # header row
    for line in f:
        tokens = line.strip().split()[1:]
        sentences.append([w.lower() for w in tokens])

# Read the split file: each row is "<sentence id>,<split label>", both 1-based.
# split[k] collects the 0-based sentence indices whose label is k + 1.
split = [[], [], []]
with open("datasetSplit.txt", "r") as f:
    next(f, None)  # header row
    for line in f:
        fields = line.strip().split(",")
        split[int(fields[1]) - 1].append(int(fields[0]) - 1)

# The training set (split label 1) is used as the corpus:
corpus = [sentences[idx] for idx in split[0]]

In [7]:
def _is_tokenized_corpus(data):
    """Return True only if `data` is a list whose items are all lists of str."""
    return isinstance(data, list) and all(
        isinstance(doc, list) and all(isinstance(token, str) for token in doc)
        for doc in data
    )


def SGDTrainWordVectors(data,                   # Training corpus (must be a list of lists of strings).
                        lm='CBOW',              # Language model to use: 'CBOW' or 'SkipGram'.
                        window_size=2,          # Window size.
                        batch_size=64,          # Batch size.
                        embedding_dim=100,      # Dimension of the word embeddings.
                        use_gpu=True,           # Flag to use a GPU when one is available.
                        epochs=1,               # Number of epochs.
                        learning_rate=1e-2,     # Learning rate.
                        sample_loss_every=100): # Record the loss every this many iterations.
    """Train word embeddings with plain SGD.

    Builds a `Word2VecSamples` dataset from `data`, instantiates `CBOWModel`
    or `SkipGramModel` according to `lm`, and runs `epochs` passes over the
    shuffled samples. Every `sample_loss_every` iterations (counted across
    epochs) the current batch loss is printed and recorded; a falsy value
    (0 or None) disables sampling.

    Returns:
        A tuple `(model.emb, dataset.vocabulary, loss_history)` where
        `loss_history` is a dict with the keys 'iter' and 'loss' holding
        parallel lists. If training is interrupted with Ctrl-C
        (KeyboardInterrupt) the same tuple is returned with whatever was
        learned and recorded so far.

    Raises:
        TypeError: if `data` is not a list of lists of str, or if `lm` is
            not one of the supported model names.
    """
    # Validate the arguments up front, before any expensive work is done.
    if not _is_tokenized_corpus(data):
        raise TypeError('data debe ser una lista de listas de tokens (strings)')
    if lm not in ('CBOW', 'SkipGram'):
        raise TypeError('El modelo de entrenamiento no es válido.')
    corpus = data
    is_cbow = lm == 'CBOW'

    # Build the sample batches. SubsetRandomSampler draws the given indices in
    # a fresh random order on every pass over the loader.
    dataset = Word2VecSamples(corpus, window_size=window_size)
    samples_idx = torch.randperm(len(dataset))
    dataloader = DataLoader(dataset, batch_size=batch_size,
                            sampler=sampler.SubsetRandomSampler(samples_idx))

    vocab_size = len(dataset.vocabulary)

    # Define the model:
    model_cls = CBOWModel if is_cbow else SkipGramModel
    model = model_cls(vocab_size, embedding_dim)

    print('Starting training...')
    loss_history = {'iter': [], 'loss': []}

    # Keep using the second GPU when the machine has one, but fall back to the
    # first GPU on single-GPU machines, where 'cuda:1' does not exist.
    if use_gpu and torch.cuda.is_available():
        gpu_index = 1 if torch.cuda.device_count() > 1 else 0
        device = torch.device('cuda:{}'.format(gpu_index))
    else:
        device = torch.device('cpu')

    model = model.to(device=device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    batch_len = len(dataloader)

    try:
        model.train()
        for e in range(epochs):
            for t, (x, y) in enumerate(dataloader):
                x = x.to(device=device, dtype=torch.long)
                y = y.to(device=device, dtype=torch.long)

                # CBOW is fed `y` and scored against `x`; SkipGram the reverse.
                # NOTE(review): presumably `x` is the centre word and `y` its
                # context -- confirm against Word2VecSamples.
                if is_cbow:
                    scores = model(y)
                    loss = model.loss(scores, x)
                else:
                    scores = model(x)
                    loss = model.loss(scores, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                step = e * batch_len + t
                if sample_loss_every and step % sample_loss_every == 0:
                    # Read the scalar once and reuse it for the log and history.
                    loss_value = loss.item()
                    print('Epoch: {}, Batch number: {}, Loss: {}'.format(e+1, t, loss_value))
                    loss_history['iter'].append(step)
                    loss_history['loss'].append(loss_value)

    except KeyboardInterrupt:
        # Allow stopping early from the notebook while keeping partial results.
        print('Exiting training...')

    return model.emb, dataset.vocabulary, loss_history


# Language model:
method = 'SkipGram'
window_size = [1, 2, 3, 4]  # One training run is performed per window size.
embedding_dim = 100

# Iteration parameters:
batch_size = 64
epochs = 50
learning_rate = 1e-1
sample_loss_every = 1000
use_gpu = True

# Train one model per window size, keeping each run's loss history and
# embedding layer. `vocab` is overwritten on every run, so after the loop it
# holds the vocabulary returned by the last run.
history = []
embeddings = []
for ws in window_size:
    embedding_layer, vocab, loss_history = SGDTrainWordVectors(
        corpus,
        lm=method,
        window_size=ws,
        batch_size=batch_size,
        embedding_dim=embedding_dim,
        use_gpu=use_gpu,
        epochs=epochs,
        learning_rate=learning_rate,
        sample_loss_every=sample_loss_every,
    )
    history.append(loss_history)
    embeddings.append(embedding_layer)

# Plot the sampled loss of every run on a single, labelled figure.
fig, ax = plt.subplots()
for ws, run_history in zip(window_size, history):
    ax.plot(run_history['iter'], run_history['loss'], label='Window Size = {}'.format(ws))

ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('{} training loss'.format(method))
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

Starting training...
Epoch: 1, Batch number: 0, Loss: 9.918039321899414
Epoch: 1, Batch number: 1000, Loss: 8.455574989318848
Epoch: 1, Batch number: 2000, Loss: 7.912580966949463
Epoch: 2, Batch number: 444, Loss: 7.825655937194824
Epoch: 2, Batch number: 1444, Loss: 7.752556800842285
Epoch: 2, Batch number: 2444, Loss: 7.307127475738525
Epoch: 3, Batch number: 888, Loss: 6.961059093475342
Epoch: 3, Batch number: 1888, Loss: 7.6171464920043945
Epoch: 4, Batch number: 332, Loss: 7.429873943328857
Epoch: 4, Batch number: 1332, Loss: 7.37601900100708
Epoch: 4, Batch number: 2332, Loss: 7.041444301605225
Epoch: 5, Batch number: 776, Loss: 7.479556083679199
Epoch: 5, Batch number: 1776, Loss: 6.841553688049316
Epoch: 6, Batch number: 220, Loss: 6.36747932434082
Epoch: 6, Batch number: 1220, Loss: 6.878442764282227
Epoch: 6, Batch number: 2220, Loss: 7.124186992645264
Epoch: 7, Batch number: 664, Loss: 6.769253730773926
Epoch: 7, Batch number: 1664, Loss: 7.0002055168151855
Epoch: 8, Batch 

Epoch: 9, Batch number: 2552, Loss: 6.869040012359619
Epoch: 10, Batch number: 996, Loss: 6.806644439697266
Epoch: 10, Batch number: 1996, Loss: 7.030974388122559
Epoch: 11, Batch number: 440, Loss: 7.030904293060303
Epoch: 11, Batch number: 1440, Loss: 6.822992324829102
Epoch: 11, Batch number: 2440, Loss: 7.089425086975098
Epoch: 12, Batch number: 884, Loss: 6.76313591003418
Epoch: 12, Batch number: 1884, Loss: 6.732667922973633
Epoch: 13, Batch number: 328, Loss: 6.7406535148620605
Epoch: 13, Batch number: 1328, Loss: 6.536139965057373
Epoch: 13, Batch number: 2328, Loss: 6.961251258850098
Epoch: 14, Batch number: 772, Loss: 6.746321201324463
Epoch: 14, Batch number: 1772, Loss: 6.720941543579102
Epoch: 15, Batch number: 216, Loss: 6.617141246795654
Epoch: 15, Batch number: 1216, Loss: 6.534308433532715
Epoch: 15, Batch number: 2216, Loss: 7.016434669494629
Epoch: 16, Batch number: 660, Loss: 6.828343868255615
Epoch: 16, Batch number: 1660, Loss: 6.650965690612793
Epoch: 17, Batch n

Epoch: 18, Batch number: 2548, Loss: 6.88289213180542
Epoch: 19, Batch number: 992, Loss: 6.851823806762695
Epoch: 19, Batch number: 1992, Loss: 6.934492588043213
Epoch: 20, Batch number: 436, Loss: 6.657252311706543
Epoch: 20, Batch number: 1436, Loss: 6.683864593505859
Epoch: 20, Batch number: 2436, Loss: 6.6248602867126465
Epoch: 21, Batch number: 880, Loss: 6.6704277992248535
Epoch: 21, Batch number: 1880, Loss: 6.597770690917969
Epoch: 22, Batch number: 324, Loss: 6.573245048522949
Epoch: 22, Batch number: 1324, Loss: 6.55191707611084
Epoch: 22, Batch number: 2324, Loss: 6.938693046569824
Epoch: 23, Batch number: 768, Loss: 6.784461975097656
Epoch: 23, Batch number: 1768, Loss: 6.885495662689209
Epoch: 24, Batch number: 212, Loss: 6.743733882904053
Epoch: 24, Batch number: 1212, Loss: 6.560019016265869
Epoch: 24, Batch number: 2212, Loss: 6.801799297332764
Epoch: 25, Batch number: 656, Loss: 6.882972240447998
Epoch: 25, Batch number: 1656, Loss: 6.7736005783081055
Epoch: 26, Batch

Epoch: 27, Batch number: 2544, Loss: 6.763978958129883
Epoch: 28, Batch number: 988, Loss: 6.865246295928955
Epoch: 28, Batch number: 1988, Loss: 7.004471302032471
Epoch: 29, Batch number: 432, Loss: 6.8444647789001465
Epoch: 29, Batch number: 1432, Loss: 6.908734321594238
Epoch: 29, Batch number: 2432, Loss: 6.797450542449951
Epoch: 30, Batch number: 876, Loss: 6.749401092529297
Epoch: 30, Batch number: 1876, Loss: 6.689740180969238
Epoch: 31, Batch number: 320, Loss: 6.781877040863037
Epoch: 31, Batch number: 1320, Loss: 6.719272136688232
Epoch: 31, Batch number: 2320, Loss: 6.724177360534668
Epoch: 32, Batch number: 764, Loss: 6.89730167388916
Epoch: 32, Batch number: 1764, Loss: 6.704092979431152
Epoch: 33, Batch number: 208, Loss: 6.658263206481934
Epoch: 33, Batch number: 1208, Loss: 6.543793678283691
Epoch: 33, Batch number: 2208, Loss: 6.7426886558532715
Epoch: 34, Batch number: 652, Loss: 6.694216251373291
Epoch: 34, Batch number: 1652, Loss: 6.656282424926758
Epoch: 35, Batch

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f10e8d672b0>

In [8]:
# Alternative (disabled): visualize the n most frequent words instead of a fixed list.
# n = 100
# freqs = np.array([vocab.get_freq(idx) for idx in range(len(vocab))])
# freqs_indeces = (len(vocab) - np.argsort(freqs) - 1).tolist()[:n]
# visualizeWords = [vocab.index_to_token(idx) for idx in freqs_indeces]

visualizeWords = ["great", "cool", "brilliant", "wonderful", "well", "amazing",
                  "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
                  "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
                  "hail", "coffee", "tea"]

# Vocabulary index of each word, looked up once and reused for every embedding.
visualizeIdx = [vocab.token_to_index(word) for word in visualizeWords]

# One figure per trained embedding; `embeddings` is in the same order as `window_size`.
for ws, emb in zip(window_size, embeddings):
    # First parameter tensor of the embedding layer, detached and copied to the CPU once.
    weights = next(emb.parameters()).detach().cpu()

    # np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
    visualizeVecs = np.zeros((len(visualizeIdx), embedding_dim), dtype=np.float64)
    for row, idx in enumerate(visualizeIdx):
        visualizeVecs[row, :] = weights[idx, :].numpy()

    # Project the centred vectors onto the two leading singular directions
    # of their covariance matrix.
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    fig, ax = plt.subplots()
    for i in range(len(visualizeWords)):
        ax.text(coord[i, 0], coord[i, 1], visualizeWords[i],
                bbox=dict(facecolor='green', alpha=0.1))

    ax.set_xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    ax.set_ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
    ax.set_title('Window Size = {}'.format(ws))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>