Implementation based on the article [Exploiting Similarities among Languages for Machine Translation](https://arxiv.org/abs/1309.4168)

#### **IMPORTS**

In [None]:
import torch
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt

from torch.nn import functional as F
from sklearn.neighbors import NearestNeighbors as NN
from torch import nn
from numpy.linalg import svd
from sklearn.decomposition import PCA
from torchtext.vocab import FastText
from sklearn.model_selection import train_test_split 


In [None]:
# For google colab
# Mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount("/content/drive")
# IPython magic: change the working directory to a folder on the mounted drive.
%cd drive/MyDrive/A2

In [None]:
# NOTE(review): relative path, resolved from the current working directory -
# confirm which of the two `%cd` cells is meant to be run.
%cd ./drive/MyDrive/Projet word embeddings/

#### **Download FastText vocabulary (word embeddings)**

In [None]:
# Try the 'multi.*' FastText vectors first (30000 words per language); if they
# cannot be loaded, fall back to the 'fr'/'en' vectors (10000 words each).
try:
  target_vocabulary = FastText('multi.fr', max_vectors=30000)
  source_vocabulary = FastText('multi.en', max_vectors=30000)
except Exception as error:  # not a bare except: Ctrl-C must still interrupt the download
  print('Could not load the multi.* vectors:', error, '- falling back to fr/en')
  target_vocabulary = FastText('fr', max_vectors=10000)
  source_vocabulary = FastText('en', max_vectors=10000)

In [None]:
# Bilingual dictionary: one space-separated "<source word> <target word>" pair per line.
# keep_default_na=False stops pandas from parsing real words such as "null",
# "nan" or "na" as missing values (they would be removed by a later dropna()).
dictionary = pd.read_csv("en-fr.txt", sep=" ", header=None, keep_default_na=False)
dictionary.columns = ['source', 'target']
dictionary.head(10)

#### **Define functions:**

- `get_vector`: return embedding vector of given word in a given vocabulary
- `closest_words`: among all the words in the vocabulary, find closest ones to a given vector, which is not necessarily in the vocabulary.

In [None]:
def get_vector(embeddings, word):
  """Return the embedding row of `word` in `embeddings`.

  `embeddings` must expose `stoi` (word -> row index) and `vectors` (indexable
  by row). When the word is unknown a notice is printed and None is returned.
  """
  position = embeddings.stoi.get(word)
  if position is None:
    print(word, ' not in vocabulary!')
    return None
  return embeddings.vectors[position]

def closest_words(embeddings, vector, n = 10):
  """Find the vocabulary words closest to `vector` in Euclidean distance.

  Args:
    embeddings: object exposing `vectors` (2-D tensor, one row per word) and
      `itos` (row index -> word).
    vector: query point as a torch.Tensor, numpy array or list; it does not
      have to be the embedding of a vocabulary word.
    n: number of neighbours wanted.

  Returns:
    A single `(word, distance)` tuple when `n == 1`, otherwise a list of at
    most `n` such tuples sorted by increasing distance. For an unsupported
    `vector` type a message is printed and None is returned.
  """
  if isinstance(vector, (np.ndarray, list)):
    vector = torch.as_tensor(vector)
  if not isinstance(vector, torch.Tensor):
    print('vector of type ', type(vector), ' not accepted!')
    return

  # Broadcasting subtracts the query from every row without materialising a
  # repeated copy of it.
  distances = torch.norm(embeddings.vectors - vector, dim=1)
  if n == 1:
    best = torch.argmin(distances).item()
    return (embeddings.itos[best], distances[best].item())

  # topk selects the n smallest distances natively instead of sorting the
  # whole vocabulary through a Python key function.
  k = min(n, distances.shape[0])
  if k <= 0:
    return []
  values, indices = torch.topk(distances, k, largest=False)
  return [(embeddings.itos[idx], dist) for idx, dist in zip(indices.tolist(), values.tolist())]

def closest_words_cosin(embeddings, vector, n = 10):
  """Find the vocabulary words closest to `vector` in cosine distance.

  The distance is `1 - cos(vector, word_vector)`. Both sides are normalised,
  so the ranking does not depend on vector norms (a plain `1 - dot product`
  is only a cosine distance for unit-norm vectors).

  Args:
    embeddings: object exposing `vectors` (2-D tensor, one row per word) and
      `itos` (row index -> word).
    vector: query point as a torch.Tensor, numpy array or list.
    n: number of neighbours wanted.

  Returns:
    A single `(word, distance)` tuple when `n == 1`, otherwise a list of at
    most `n` such tuples sorted by increasing distance. For an unsupported
    `vector` type a message is printed and None is returned.
  """
  if isinstance(vector, (np.ndarray, list)):
    vector = torch.as_tensor(vector)
  if not isinstance(vector, torch.Tensor):
    print('vector of type ', type(vector), ' not accepted!')
    return

  vectors = embeddings.vectors
  # One vectorised call for the whole vocabulary; the query is cast to the
  # vocabulary dtype so numpy float64 inputs are accepted too.
  query = vector.reshape(1, -1).to(vectors.dtype)
  distances = 1 - F.cosine_similarity(vectors, query, dim=1)
  if n == 1:
    best = torch.argmin(distances).item()
    return (embeddings.itos[best], distances[best].item())

  k = min(n, distances.shape[0])
  if k <= 0:
    return []
  values, indices = torch.topk(distances, k, largest=False)
  return [(embeddings.itos[idx], dist) for idx, dist in zip(indices.tolist(), values.tolist())]
    

In [None]:
# Word-analogy sanity check on the French vectors: roi - homme + femme.
roi_vector = get_vector(target_vocabulary, 'roi')
homme_vector = get_vector(target_vocabulary, 'homme')
femme_vector = get_vector(target_vocabulary, 'femme')
closest_words(target_vocabulary, roi_vector - homme_vector + femme_vector)

In [None]:
# Same analogy query (roi - homme + femme), ranked with closest_words_cosin.
analogy_query = (
    get_vector(target_vocabulary, 'roi')
    - get_vector(target_vocabulary, 'homme')
    + get_vector(target_vocabulary, 'femme')
)
closest_words_cosin(target_vocabulary, analogy_query)

#### **Get index map translations**

In [None]:
def _stoi_frame(stoi, column_name):
  """Turn a word -> index mapping into a one-column frame indexed by word."""
  frame = pd.DataFrame.from_dict(stoi, orient='index')
  frame.columns = [column_name]
  frame.index.name = 'word'
  return frame

stoi_dict_target = _stoi_frame(target_vocabulary.stoi, 'index_target')
stoi_dict_source = _stoi_frame(source_vocabulary.stoi, 'index_source')
stoi_dict_target.head(10)

In [None]:
# Attach the vocabulary row of each dictionary word, then keep only the pairs
# for which both the source and the target word have an embedding.
words_indices = (
    dictionary
    .join(stoi_dict_source, on='source')
    .join(stoi_dict_target, on='target')
    .dropna()
)

# The joins introduce NaN (hence floats); cast the indices back to integers.
for index_column in ('index_target', 'index_source'):
  words_indices[index_column] = words_indices[index_column].astype(int)

words_indices.head(10)

## TRANSLATE

#### **Split train and test**

Our data is composed of the vocabulary indices of each word pair. Next, we divide it into a training set and a test set.

In [None]:
# Column 0 holds source-word indices, column 1 the matching target-word indices.
data = words_indices[['index_source', 'index_target']].to_numpy()
source_column, target_column = data[:, 0], data[:, 1]
# 80% / 20% shuffled split of the dictionary pairs.
X_train, X_test, y_train, y_test = train_test_split(
    source_column, target_column, test_size=0.2, shuffle=True
)


#### **Nearest neighbors**

To compute the nearest neighbors after training, we'll use `sklearn.neighbors.NearestNeighbors`. Let's fit this algorithm on our target embedding space.

In [None]:
# accuracies will be computed with respect to many values of neighbors
k_s = [1, 2, 5, 8, 10, 15]
# Index every target-language vector; queries return as many neighbours as
# the last entry of k_s.
target_matrix = target_vocabulary.vectors.numpy()
neighbors_tree = NN(n_neighbors=k_s[-1], algorithm='ball_tree')
neighbors_tree.fit(target_matrix)


#### **Define loss and gradient**



> Loss

Our optimization problem is defined as follows:

$$
\min_W \sum_{i} \|Wx_i - y_i\|^2
$$

Where $x_i, y_i \in \mathbb{R}^d$ are the embeddings of the $i$-th dictionary pair (the English source word and the French target word, respectively), $d$ being the embeddings dimension.



> Gradient


Using minibatch gradient descent, our iterations will perform the following step on the matrix $W$:

$$
W = W - \dfrac{\eta}{b} \sum_{i=1}^{b} \nabla_W  \|Wx_i - y_i\|^2
$$

Where $\eta$ is the learning rate and $b$ is the batch size. This can then be calculated as:

$$
W = W - \dfrac{2\eta}{b} (WX - Y)X^T \,\,\,\,\, X,Y \in \mathbb{R}^{d \times b}
$$

$X$ and $Y$ here are the matrices having, in each column, the distributed representation for the corresponding words of the mini-batch.



In [None]:
# Mini-batch gradient descent settings.
batch_size, learning_rate = 128, 0.3
# 300 x 300 translation matrix, initialised with ones.
W = torch.ones(300, 300)

def loss(W, X, y, bs):
  """Mean squared translation error: sum of ||W x_i - y_i||^2 divided by `bs`.

  `X` and `y` hold one example per column. The norm is squared so that the
  reported value is the objective whose gradient `grad` computes.
  """
  residual = W @ X - y
  return torch.sum(residual ** 2) / bs

def grad(W, X, y, bs):
  """Gradient of sum ||W x_i - y_i||^2 with respect to W, divided by `bs`.

  `X` and `y` hold one example per column.
  """
  residual = W @ X - y
  doubled = 2 * (residual @ X.T)
  return doubled / bs

#### **Orthogonal case**

Adding an orthogonality constraint on $W$ to the optimization criterion leads to the orthogonal Procrustes problem, which can be solved as follows (using $||WX|| = ||X||$ for orthogonal $W$):

$$
\underset{W}{\arg \min}||WX - Y||^2  = \underset{W}{\arg \min} \left ( ||WX||^2 - 2 \langle WX, Y \rangle+ ||Y||^2 \right ) \\
= \underset{W}{\arg \min} \left ( ||X||^2 - 2 \langle WX, Y \rangle + ||Y||^2 \right ) \\
= \underset{W}{\arg \max}  \langle WX, Y \rangle \\
= \underset{W}{\arg \max}  \langle W, YX^T \rangle 
$$

Let  $SVD(YX^T) = U \Sigma V^T$, where $U$ and $V$ are both orthogonal. Then

$$
\underset{W}{\arg \max}  \langle W, YX^T \rangle = \underset{W}{\arg \max}  \langle W, U \Sigma V^T \rangle \\
= \underset{W}{\arg \max} \langle U^T W V, \Sigma \rangle
$$

Given that $U^T W V$ is a product of orthogonal matrices, thus an orthogonal matrix, and also that $Σ$ is diagonal, the maximum of the above expression is obtained by making $U^T W V = I$. Then, we have: 

$$
U^T W V = I \therefore W = UV^T 
$$

Hence,

$$
\underset{W}{\arg \min}||WX - Y||^2 = UV^T , \text{where } SVD(YX^T) = U Σ V^T
$$

In [None]:
# Columns of X / Y are the source / target embeddings of the training pairs.
X = source_vocabulary.vectors[X_train, :].T
Y = target_vocabulary.vectors[y_train, :].T
# SVD of Y X^T, the matrix appearing in the Procrustes solution.
cross_product = Y @ X.T
U, sigma, V_t = svd(cross_product)

In [None]:
# Closed-form orthogonal solution W = U V^T, converted back to a torch tensor.
orthogonal_solution = U @ V_t
W_orthogonal = torch.from_numpy(orthogonal_solution)

#### **Orthogonal case using cosine distance**

Another way to minimize $
\underset{W}{\arg \min}||WX - Y||^2 $, introduced in [Normalized Word Embedding and Orthogonal Transform for Bilingual Word Translation](https://aclanthology.org/N15-1104.pdf), is to use the cosine distance when learning the transform. The optimization
task can be redefined as follows (the dot product equals the cosine similarity when the embeddings have unit norm): $$ \max_{W} \sum_{i} (W x_i)^{T}y_i$$
 A simple calculation shows that the gradient with respect to $W$ is: $$ \nabla_W =\sum_{i} y_i x_i^{T} $$


In [None]:
def cos_loss(W, X, y, bs):
  """Return 1 minus the sum of dot products (W x_i) . y_i divided by `bs`.

  `X` and `y` hold one example per column. The element-wise product yields
  the same value as trace((W X)^T y) without building the b x b matrix, which
  matters when the whole training set is passed at once.
  """
  similarity = torch.sum((W @ X) * y)
  return 1 - similarity / bs

def cos_grad(W, X, y, bs):
  """Ascent direction for the similarity sum of (W x_i) . y_i, divided by `bs`.

  With one example per column, d/dW of trace((W X)^T y) is y X^T (not X y^T,
  which is its transpose). `W` is unused because the objective is linear in
  W; it is kept so the signature matches `grad`.
  """
  return (y @ X.T) / bs

#### **Training**
For each epoch, we then:

*   Divide the training set in batches
*   Perform step for every batch
*   Compute loss on training and test sets



In [None]:
train_loss = []
test_loss = []
n_train = X_train.shape[0]

for epoch in range(50):
  start_time = time.time()

  # One pass over the training pairs: each pair is used exactly once per
  # epoch (the previous `while` bound always ran one extra, repeated batch).
  for start in range(0, n_train, batch_size):
    batch = slice(start, start + batch_size)
    X_ = source_vocabulary.vectors[X_train[batch], :].T
    y_ = target_vocabulary.vectors[y_train[batch], :].T

    # Divide by the real number of columns: the last batch may be smaller.
    W -= learning_rate*grad(W, X_, y_, X_.shape[1])

  # EVALUATE MODEL

  # On the training set
  X_ = source_vocabulary.vectors[X_train, :].T
  y_ = target_vocabulary.vectors[y_train, :].T
  loss_train = loss(W, X_, y_, X_train.shape[0]).item()

  # On the test set
  X_ = source_vocabulary.vectors[X_test, :].T
  y_ = target_vocabulary.vectors[y_test, :].T
  loss_test = loss(W, X_, y_, X_test.shape[0]).item()

  # Plain floats are stored so the histories can be plotted directly.
  train_loss.append(loss_train)
  test_loss.append(loss_test)

  print('EPOCH ', epoch, '|| LOSS TRAIN: {:.2f}'.format(loss_train), '|| LOSS TEST:{:.2f}'.format(loss_test) , '|| TIME: {:.2f} '.format(time.time()-start_time))

In [None]:
# Side-by-side loss curves: training on the left, test on the right.
loss_panels = [(train_loss, 'Train loss'), (test_loss, 'Test loss')]
for panel_number, (history, panel_title) in enumerate(loss_panels, start=1):
  plt.subplot(1, 2, panel_number)
  plt.plot(history)
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Random orthogonal 300 x 300 starting point for the SVD-projected training.
W_2 = torch.empty(300, 300)
torch.nn.init.orthogonal_(W_2)

In [None]:
train_loss = []
test_loss = []
n_train = X_train.shape[0]

for epoch in range(50):
  start_time = time.time()

  # One pass over the training pairs: each pair is used exactly once per
  # epoch (the previous `while` bound always ran one extra, repeated batch).
  for start in range(0, n_train, batch_size):
    batch = slice(start, start + batch_size)
    X_ = source_vocabulary.vectors[X_train[batch], :].T
    y_ = target_vocabulary.vectors[y_train[batch], :].T

    # Ascent step on the similarity, normalised by the real batch size.
    CG = cos_grad(W_2, X_, y_, X_.shape[1])
    W_2 += learning_rate*CG
    # Project back onto the orthogonal matrices: replace W_2 by U V^T of its SVD.
    U, sigma, V = svd(W_2.numpy())
    W_2 = torch.from_numpy(U@V)

  # EVALUATE MODEL

  # On the training set
  X_ = source_vocabulary.vectors[X_train, :].T
  y_ = target_vocabulary.vectors[y_train, :].T
  loss_train = cos_loss(W_2, X_, y_, X_train.shape[0]).item()

  # On the test set
  X_ = source_vocabulary.vectors[X_test, :].T
  y_ = target_vocabulary.vectors[y_test, :].T
  loss_test = cos_loss(W_2, X_, y_, X_test.shape[0]).item()

  # Plain floats are stored so the histories can be plotted directly.
  train_loss.append(loss_train)
  test_loss.append(loss_test)

  print('EPOCH ', epoch, '|| LOSS TRAIN: {:.2f}'.format(loss_train), '|| LOSS TEST:{:.2f}'.format(loss_test) , '|| TIME: {:.2f} '.format(time.time()-start_time))

In [None]:
# Side-by-side loss curves: training on the left, test on the right.
loss_panels = [(train_loss, 'Train loss'), (test_loss, 'Test loss')]
for panel_number, (history, panel_title) in enumerate(loss_panels, start=1):
  plt.subplot(1, 2, panel_number)
  plt.plot(history)
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Random orthogonal 300 x 300 starting point for the beta-regularised training.
W_3 = torch.empty(300, 300)
torch.nn.init.orthogonal_(W_3)

In [None]:
train_loss = []
test_loss = []
beta = 0.01
n_train = X_train.shape[0]

for epoch in range(50):
  start_time = time.time()

  # One pass over the training pairs: each pair is used exactly once per
  # epoch (the previous `while` bound always ran one extra, repeated batch).
  for start in range(0, n_train, batch_size):
    batch = slice(start, start + batch_size)
    X_ = source_vocabulary.vectors[X_train[batch], :].T
    y_ = target_vocabulary.vectors[y_train[batch], :].T

    # Ascent step on the similarity, normalised by the real batch size.
    CG = cos_grad(W_3, X_, y_, X_.shape[1])
    W_3 += learning_rate*CG
    # Soft orthogonality update: W <- (1 + beta) W - beta (W W^T) W.
    W_3 = (1+beta)*W_3 - beta*(W_3 @ W_3.T)@ W_3

  # EVALUATE MODEL

  # On the training set
  X_ = source_vocabulary.vectors[X_train, :].T
  y_ = target_vocabulary.vectors[y_train, :].T
  loss_train = cos_loss(W_3, X_, y_, X_train.shape[0]).item()

  # On the test set
  X_ = source_vocabulary.vectors[X_test, :].T
  y_ = target_vocabulary.vectors[y_test, :].T
  loss_test = cos_loss(W_3, X_, y_, X_test.shape[0]).item()

  # Plain floats are stored so the histories can be plotted directly.
  train_loss.append(loss_train)
  test_loss.append(loss_test)

  print('EPOCH ', epoch, '|| LOSS TRAIN: {:.2f}'.format(loss_train), '|| LOSS TEST:{:.2f}'.format(loss_test) , '|| TIME: {:.2f} '.format(time.time()-start_time))

In [None]:
# Side-by-side loss curves: training on the left, test on the right.
loss_panels = [(train_loss, 'Train loss'), (test_loss, 'Test loss')]
for panel_number, (history, panel_title) in enumerate(loss_panels, start=1):
  plt.subplot(1, 2, panel_number)
  plt.plot(history)
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title(panel_title)

plt.tight_layout()
plt.show()

#### **Evaluation**

Next, we'll see the accuracy of the trained model on the test set

In [None]:
def get_acc(indices, y_test, k_s=(1, 2, 5, 8, 10, 15)):
  """Top-k accuracy (in percent) of neighbour predictions.

  Args:
    indices: 2-D array; row j lists the predicted target indices for example
      j, nearest first.
    y_test: 1-D sequence of expected target indices, aligned with `indices`.
    k_s: values of k to evaluate.

  Returns:
    A list with one percentage per entry of `k_s`: the share of examples
    whose expected index is among the first k predictions. All zeros when
    `y_test` is empty.
  """
  targets = np.asarray(y_test).reshape(-1, 1)
  # The denominator comes from the argument itself, not from a notebook global.
  n_examples = targets.shape[0]
  if n_examples == 0:
    return [0.0 for _ in k_s]

  predictions = np.asarray(indices)[:n_examples]
  accuracies = []
  for k in k_s:
    hits = (predictions[:, :k] == targets).any(axis=1).sum()
    accuracies.append(float(100 * hits / n_examples))
  return accuracies

In [None]:
# Perform prediction
source_vectors_test = source_vocabulary.vectors[X_test, :].T
prediction_test = (W @ source_vectors_test).T
prediction_test_orthogonal = (W_orthogonal @ source_vectors_test).T
prediction_test_orthogonal_cos = (W_2 @ source_vectors_test).T
prediction_test_orthogonal_cos2 = (W_3 @ source_vectors_test).T

# Find nearest neighbours
tic = time.time()
dists, idxs = neighbors_tree.kneighbors(prediction_test.numpy())
tac = time.time()
print('General case done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs.shape[0] ,' words in {:.3f}'.format(tac-tic), 's')
dists_orthogonal, idxs_orthogonal = neighbors_tree.kneighbors(prediction_test_orthogonal.numpy())
print('Orthogonal case done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs.shape[0] ,' words in {:.3f}'.format(time.time()-tac), 's')
tic = time.time()
dists_orthogonal_cos, idxs_orthogonal_cos = neighbors_tree.kneighbors(prediction_test_orthogonal_cos.numpy())
print('Orthogonal cosine case by SVD done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs.shape[0] ,' words in {:.3f}'.format(time.time()-tic), 's')
tac = time.time()
dists_orthogonal_cos2, idxs_orthogonal_cos2 = neighbors_tree.kneighbors(prediction_test_orthogonal_cos2.numpy())
print('Orthogonal cosine case by beta done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs.shape[0] ,' words in {:.3f}'.format(time.time()-tac), 's')

In [None]:
# Top-k accuracies of the four supervised mappings on the test pairs.
acc, acc_orthogonal, acc_orthogonal_cos, acc_orthogonal_cos2 = [
    get_acc(neighbour_indices, y_test)
    for neighbour_indices in (idxs, idxs_orthogonal, idxs_orthogonal_cos, idxs_orthogonal_cos2)
]

In [None]:
# One accuracy-vs-k curve per supervised method.
plt.figure(figsize=(8,6))
accuracy_curves = [
    (acc, 'blue', 'accuracy general case'),
    (acc_orthogonal, 'red', 'accuracy orthogonal case'),
    (acc_orthogonal_cos, 'green', 'accuracy orthogonal cosine case by SVD'),
    (acc_orthogonal_cos2, 'purple', 'accuracy orthogonal cosine case by beta'),
]
for curve_values, curve_color, curve_label in accuracy_curves:
  plt.plot(k_s, curve_values, color=curve_color, label=curve_label)
plt.legend()
plt.title('Accuracies for different values of k - English to French')
plt.show()

In [None]:
# Build one row per test pair:
# [expected french word, english word, SGD, orthogonal, cosine+SVD, cosine+beta].
# NOTE: the accuracy lists (`acc`, `acc_orthogonal`) are deliberately left
# untouched here; resetting them to 0 made the later comparison plots skip
# the supervised curves.
result = []

test_source_vectors = source_vocabulary.vectors[X_test, :].T
prediction_test = (W @ test_source_vectors).T
prediction_test_cos = (W_2 @ test_source_vectors).T
prediction_test_cos2 = (W_3 @ test_source_vectors).T
prediction_test_orthogonal = (W_orthogonal @ test_source_vectors).T
for i, (english_idx, french_idx) in enumerate(zip(X_test, y_test)):
  english_word = source_vocabulary.itos[english_idx]
  french_word = target_vocabulary.itos[french_idx]
  # Each call returns a (word, distance) tuple because n = 1.
  translation = closest_words(target_vocabulary, prediction_test[i, :] , n = 1)
  translation_cos = closest_words_cosin(target_vocabulary, prediction_test_cos[i, :] , n = 1)
  translation_cos2 = closest_words_cosin(target_vocabulary, prediction_test_cos2[i, :] , n = 1)
  translation_orthogonal = closest_words(target_vocabulary, prediction_test_orthogonal[i, :] , n = 1)

  result.append([french_word,english_word, translation[0], translation_orthogonal[0],translation_cos[0],translation_cos2[0]])

In [None]:
class color:
   """ANSI escape sequences used to colour terminal output."""
   # Text attributes
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   # Reset to the default rendering
   END = '\033[0m'
   # Foreground colours
   DARKCYAN = '\033[36m'
   RED = '\033[91m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   BLUE = '\033[94m'
   PURPLE = '\033[95m'
   CYAN = '\033[96m'

In [None]:
#### Validate the Test Data
# Show the first 30 rows at most; slicing (instead of range(30)) avoids an
# IndexError when fewer test pairs are available.
row_template = '{:<60} traducted : {:>15} (simple SGD) {:>35} (orthogonal) {:>35} (SGD + orthogonal by SVD)  {:>35} (SGD + orthogonal by beta)'
for french_word, english_word, by_sgd, by_orthogonal, by_cos_svd, by_cos_beta in result[:30]:
  line_new = row_template.format(
      color.PURPLE+english_word+color.END+'('+color.GREEN+ french_word+color.END +')',
      color.YELLOW+ by_sgd +color.END,
      color.CYAN+ by_orthogonal +color.END,
      color.RED+ by_cos_svd +color.END,
      color.DARKCYAN+ by_cos_beta +color.END)
  print(line_new)

#### **Performance visualization on common words**
##### English to french case

In [None]:
# Parallel lists: the i-th French word translates the i-th English word.
animals_french = ['chien', 'chat', 'cheval', 'vache', 'lion', 'oiseau', 'poisson', 'tigre', 'poule']
animals_english = ['dog', 'cat', 'horse', 'cow', 'lion', 'bird', 'fish', 'tiger', 'chicken']

# Vocabulary rows of those words.
animals_french_idx = np.array(list(map(target_vocabulary.stoi.__getitem__, animals_french)))
animals_english_idx = np.array(list(map(source_vocabulary.stoi.__getitem__, animals_english)))


In [None]:
# Embeddings of the animal words, one row per word.
animals_french_vectors = target_vocabulary.vectors[animals_french_idx, :]
animals_english_vectors = source_vocabulary.vectors[animals_english_idx, :]
# English vectors mapped with W (applied to column vectors, then transposed back).
mapped_columns = W @ animals_english_vectors.T
animals_english_translation_vectors = mapped_columns.T

In [None]:
# Three independent 2-component PCA models.
pca_french, pca_english, pca_english_translated = (PCA(n_components=2) for _ in range(3))

french_2D = pca_french.fit_transform(animals_french_vectors)

english_2D = pca_english.fit_transform(animals_english_vectors)

# French vectors and translated English vectors are projected together
# (French rows first) so both sets share the same 2-D axes.
french_and_translated = torch.cat([animals_french_vectors, animals_english_translation_vectors], 0)
english_translated_2D = pca_english_translated.fit_transform(french_and_translated)

In [None]:
labels = animals_french + animals_english
n_french = len(animals_french)
fig, ax = plt.subplots(figsize=(10,10))
# The first n_french rows of the projection are French words, the rest are
# the translated English words.
french_points = english_translated_2D[:n_french]
translated_points = english_translated_2D[n_french:]
ax.scatter(french_points[:, 0], french_points[:, 1], color='blue', label='french words')
ax.scatter(translated_points[:, 0], translated_points[:, 1], color='red', label='english words translated')

plt.legend()
for txt, point in zip(labels, english_translated_2D):
    ax.annotate(txt, (point[0], point[1]))

plt.title('Comparison between french words and translated english words')

#### **Unsupervised Model**

In [None]:
class Discriminator(nn.Module):
  """Binary classifier over 300-dimensional vectors.

  A 300 -> 500 -> 1 network (LeakyReLU, dropout, batch norm) ending in a
  sigmoid, so `forward` returns one value in [0, 1] per input row.
  """

  def __init__(self):
    super().__init__()
    hidden_units = 500
    stack = [
        nn.Linear(300, hidden_units),
        nn.LeakyReLU(0.2),
        nn.Dropout(0.2),
        nn.BatchNorm1d(hidden_units),
        nn.Linear(hidden_units, 1),
        nn.Sigmoid(),
    ]
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    scores = self.layers(x)
    return scores
  
class Generator(nn.Module):
  """Bias-free 300 x 300 linear map, initialised to the identity."""

  def __init__(self):
    super().__init__()
    # Step size of the orthogonality update applied in keep_orthogonal().
    self.betha = 0.01
    self.W = nn.Linear(300, 300, bias=False)
    nn.init.eye_(self.W.weight.data)

  def keep_orthogonal(self):
    """Update the weights in place: W <- (1 + betha) W - betha (W W^T) W."""
    weights = self.W.weight.data
    updated = (1+self.betha)*weights - self.betha * (weights @ (weights.T)) @ (weights)
    weights.copy_(updated)

  def forward(self, x):
    mapped = self.W(x)
    return mapped

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

generator = Generator().to(device)
discriminator = Discriminator().to(device)

# The Discriminator already ends with nn.Sigmoid, i.e. it outputs
# probabilities. nn.BCELoss is the matching criterion; nn.BCEWithLogitsLoss
# would apply a second sigmoid on top of them.
criterion_generator = nn.BCELoss()
criterion_discriminator = nn.BCELoss()
# NOTE(review): weight_decay=0.98 is an unusually strong L2 penalty - confirm
# it was not meant to be a learning-rate decay factor.
optimizer_generator = torch.optim.SGD(generator.parameters(), lr=0.01, weight_decay=0.98)
optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=0.05,   betas=(0.5, 0.999))

In [None]:
num_epochs = 6
batch_size = 32
source_embeddings = nn.Embedding.from_pretrained(source_vocabulary.vectors).to(device)
target_embeddings = nn.Embedding.from_pretrained(target_vocabulary.vectors).to(device)
vocab_size = len(source_vocabulary.itos)
# Target words are sampled from the target vocabulary's own size.
target_vocab_size = len(target_vocabulary.itos)
epoch_size = 2*int(vocab_size/batch_size)

for epoch in range(num_epochs):
  # TRAIN
  # (No `loss = 0` / `gen = 0` counters here: they were never read, and
  # `loss = 0` overwrote the `loss` function used by the supervised training.)
  for i in range(epoch_size):

    source_idx = torch.randint(0, vocab_size, (batch_size,)).to(device) # generate random indices

    vectors_en = source_embeddings(source_idx)

    ### TRAIN DISCRIMINATOR ###

    # Three discriminator updates per generator update.
    for j in range(3):
      discriminator.train()
      generator.eval()

      optimizer_discriminator.zero_grad()

      # WITH FAKE EXAMPLES (GENERATED BY GENERATOR)
      vectors_fake = generator(vectors_en)
      # detach(): the discriminator step must not back-propagate into the generator.
      discriminator_labels_fake = discriminator(vectors_fake.detach())

      labels_fake = (torch.randn_like(discriminator_labels_fake)*0.2).to(device)  #USING NOISY LABELS

      loss_discriminator_fake = criterion_discriminator(discriminator_labels_fake, labels_fake).to(device)
      D_G_z1 = discriminator_labels_fake.cpu().mean().item()

      # WITH TRUE EXAMPLES
      target_idx = torch.randint(0, target_vocab_size, (batch_size,)).to(device)

      vectors_true = target_embeddings(target_idx)

      discriminator_labels_true = discriminator(vectors_true)
      labels_true = (torch.randn_like(discriminator_labels_true)*0.2 + 1.).to(device)

      loss_discriminator_true = criterion_discriminator(discriminator_labels_true, labels_true).to(device)

      loss_discriminator = 0.5*(loss_discriminator_fake + loss_discriminator_true)
      loss_discriminator.backward()
      D_x = discriminator_labels_true.cpu().mean().item()
      optimizer_discriminator.step()

    ### TRAIN GENERATOR ###
    discriminator.eval()
    generator.train()

    optimizer_generator.zero_grad()
    # The generator maps SOURCE vectors into the target space, so it has to be
    # trained on a batch of source embeddings (not target embeddings).
    source_idx = torch.randint(0, vocab_size, (batch_size,)).to(device)
    vectors_en = source_embeddings(source_idx)

    vectors_fake = generator(vectors_en)
    discriminator_labels_fake = discriminator(vectors_fake)
    # The generator is rewarded when its outputs are classified as true.
    labels_true = (torch.randn_like(discriminator_labels_fake)*0.2 + 1.).to(device)

    loss_generator = criterion_generator(discriminator_labels_fake, labels_true)

    D_G_z2 = discriminator_labels_fake.cpu().mean().item()

    loss_generator.backward()
    optimizer_generator.step()
    generator.keep_orthogonal()

    # Output training stats
    if i % 100 == 0 and i!=0:
        print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
              % (epoch, num_epochs, i, epoch_size,
                  loss_discriminator.item(), loss_generator.item(), D_x, D_G_z1, D_G_z2))
  

### **Check accuracy without procrustes refinement**

In [None]:
# Map the test source vectors with the trained generator, then look up the
# nearest target words.
test_inputs_gan = source_vocabulary.vectors[X_test, :].to(device)
prediction_test_gan = generator(test_inputs_gan).cpu()
tic = time.time()
dists_gan, idxs_gan = neighbors_tree.kneighbors(prediction_test_gan.detach().numpy())
elapsed_gan = time.time()-tic
print('GAN case done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs_gan.shape[0] ,' words in {:.3f}'.format(elapsed_gan), 's')

In [None]:
# Top-k accuracies of the generator alone on the test pairs.
acc_gan = get_acc(indices=idxs_gan, y_test=y_test)

#### Replot to compare with supervised methods

In [None]:
plt.figure(figsize=(8,6))
# Supervised curves are drawn only when their accuracy values are available (truthy).
optional_curves = [
    (acc, 'blue', 'accuracy general case'),
    (acc_orthogonal, 'red', 'accuracy orthogonal case'),
]
for curve_values, curve_color, curve_label in optional_curves:
  if curve_values:
    plt.plot(k_s, curve_values, color=curve_color, label=curve_label)

plt.plot(k_s, acc_gan, color='green', label='accuracy gan')
plt.legend()
plt.title('Accuracies for different values of k - English to French')
plt.show()

#### **PROCRUSTES REFINEMENT**

To perform the Procrustes refinement, we first select the predictions of our generator that lie closest to a target word. These predictions are considered to be the most reliable ones. We then build a new dictionary from these pairs only, and we compute the orthogonal (Procrustes) solution on it. We use a threshold on the distances; here we take the median of the distances.

In [None]:
# We'll first look for the nearest neighbors in the training set.
train_inputs_gan = source_vocabulary.vectors[X_train, :].to(device)
prediction_train_gan = generator(train_inputs_gan).cpu()
tic=time.time()
# Only the single closest target word is needed for each prediction.
dists_train_gan, idxs_train_gan = neighbors_tree.kneighbors(prediction_train_gan.detach().numpy(), n_neighbors=1)
elapsed_train_gan = time.time()-tic
print('1-NN for dictionary computed for ', idxs_train_gan.shape[0] ,' words in {:.3f}'.format(elapsed_train_gan), 's')

In [None]:
# Keep the (source index, nearest target index) pairs whose 1-NN distance is
# strictly below the median distance.
median = np.median(dists_train_gan)
below_median = dists_train_gan[:, 0] < median
kept_pairs = np.column_stack((X_train[below_median], idxs_train_gan[below_median, 0]))
indices_translation = torch.tensor(kept_pairs)

Once we have the source and target words that lie closest to each other (the pairs with the smallest distances), we can use them to compute our Procrustes solution. This will give the new weights of the generator.

In [None]:
# Column 0: source indices, column 1: matched target indices.
src_idx, tgt_idx = indices_translation.unbind(dim=1)

# Embedding rows of the selected pairs.
closest_words_source = source_embeddings.weight.data[src_idx.to(device)]
closest_words_target = target_embeddings.weight.data[tgt_idx.to(device)]

In [None]:
# M = (target rows)^T (source rows), the matrix decomposed for the Procrustes solution.
M = (closest_words_target.T @ closest_words_source).cpu().numpy()
U_, _, V_T = svd(M, full_matrices=True)

In [None]:
# Fresh generator whose weights are replaced by the Procrustes solution U V^T.
new_generator = Generator().to(device)
procrustes_weights = torch.from_numpy(U_ @ V_T).type_as(generator.W.weight.data)
new_generator.W.weight.data.copy_(procrustes_weights)

#### Final evaluation

We can finally evaluate our last solution that uses gan+procrustes

In [None]:
# Map the test source vectors with the refined generator and look up the
# nearest target words.
test_inputs_refined = source_vocabulary.vectors[X_test, :].to(device)
prediction_test = new_generator(test_inputs_refined).cpu()
tic = time.time()
dists_unsupervised, idxs_unsupervised = neighbors_tree.kneighbors(prediction_test.detach().numpy())
elapsed_unsupervised = time.time()-tic
print('Unsupervised (GAN+Procrustes) case done! Computed ', neighbors_tree.n_neighbors , ' neighbors for ', idxs_unsupervised.shape[0] ,' words in {:.3f}'.format(elapsed_unsupervised), 's')

In [None]:
# Top-k accuracies of the GAN + Procrustes mapping on the test pairs.
acc_unsupervised = get_acc(indices=idxs_unsupervised, y_test=y_test)

### FINAL PLOT


In [None]:
plt.figure(figsize=(10,8))
# Earlier curves are drawn only when their accuracy values are available (truthy).
optional_curves = [
    (acc, 'blue', 'accuracy general case'),
    (acc_orthogonal, 'red', 'accuracy orthogonal case'),
    (acc_gan, 'green', 'accuracy gan'),
]
for curve_values, curve_color, curve_label in optional_curves:
  if curve_values:
    plt.plot(k_s, curve_values, color=curve_color, label=curve_label)

plt.plot(k_s, acc_unsupervised, color='purple', label='accuracy unsupervised method')
plt.legend()
plt.title('Accuracies for different values of k - English to French')
plt.show()