<a href="https://colab.research.google.com/github/LollipopGB/EA_TechnicalTest_DanielGarcia/blob/master/MUSE_Word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating Word embeddings with Multilingual Universal Sentence Encoder


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# embedding CSVs are written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
#@title Setup Environment
# Install the latest Tensorflow version.
!pip install tensorflow_text
!pip install bokeh
!pip install simpleneighbors[annoy]
!pip install tqdm

Load the required libraries and define an auxiliary function that visualizes how the encoder works.

In [None]:
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange

def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):
  """Show a Bokeh heatmap of pairwise similarity between two embedding sets.

  Every embedding in `embeddings_1` is compared with every embedding in
  `embeddings_2` using the arccos-based similarity
  `1 - arccos(cosine_similarity) / pi`, and the resulting grid is rendered
  inline in the notebook.

  Args:
    embeddings_1: Embeddings shown along the x axis, one per `labels_1` entry.
    embeddings_2: Embeddings shown along the y axis, one per `labels_2` entry.
    labels_1: Labels for `embeddings_1` (used as the categorical x range).
    labels_2: Labels for `embeddings_2` (used as the categorical y range).
    plot_title: Title displayed above the figure.
    plot_width: Figure width in pixels.
    plot_height: Figure height in pixels.
    xaxis_font_size: Font size of the x-axis tick labels.
    yaxis_font_size: Font size of the y-axis tick labels.

  Raises:
    AssertionError: If an embedding set and its labels differ in length.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
  # Floating-point rounding can push the cosine of (near-)identical vectors
  # marginally outside [-1, 1], where arccos returns NaN; clip before use.
  cosine = np.clip(
      sklearn.metrics.pairwise.cosine_similarity(embeddings_1, embeddings_2),
      -1.0, 1.0)
  sim = 1 - np.arccos(cosine) / np.pi

  # One row per (label_1, label_2) pair, in row-major order of `sim`.
  df = pd.DataFrame(
      [(labels_1[i], labels_2[j], sim[i][j])
       for i in range(len(embeddings_1))
       for j in range(len(embeddings_2))],
      columns=['embeddings_1', 'embeddings_2', 'sim'])

  # Reversed palette: the highest similarity maps to the darkest color.
  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  # `width`/`height` are used instead of `plot_width`/`plot_height`, which
  # were removed from `figure()` in Bokeh 3.
  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            width=plot_width, height=plot_height,
                            tools="save", toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  # Extra right margin so the rotated x-axis labels are not cut off.
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)


Load the pre-trained model from Tensorflow Hub

In [None]:
# TensorFlow Hub URL of the encoder module to load. The 16-language
# multilingual module is the default; substitute the URL of another
# TensorFlow Hub module here to compare the results.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

# Load the module; `model` is called directly on text to produce embeddings.
model = hub.load(module_url)

def embed_text(input):
  """Embed text by calling the loaded TF Hub `model` on it.

  Args:
    input: Text to embed. This notebook passes both lists of strings and
      single strings. (The name shadows the builtin `input`; it is kept so
      existing callers keep working.)

  Returns:
    The result of `model(input)`. For a list of three sentences this
    notebook shows a float32 `tf.Tensor` of shape (3, 512).
  """
  return model(input)

# Visualize Text Similarity Between Languages
With the sentence embeddings now in hand, we can visualize semantic similarity across different languages.

## Computing Text Embeddings

We first define a set of sentences translated to various languages in parallel. Then, we precompute the embeddings for all of our sentences.

In [None]:
# Parallel examples of increasing length (a word, a short sentence, a long
# sentence), given in English, French and Spanish.
english_sentences = [
    'dog',
    'Puppies are nice.',
    'I enjoy taking long walks along the beach with my dog.',
]
french_sentences = [
    'chien',
    'Les chiots sont gentils.',
    'J\'aime faire de longues promenades sur la plage avec mon chien.',
]
spanish_sentences = [
    'perro',
    'Los cachorros son agradables.',
    'Disfruto de dar largos paseos por la playa con mi perro.',
]

# Phrases in several languages, followed by their English counterparts in the
# same order.
multilingual_example = [
    "Hola, esto es increible",
    "Willkommen zu einfachen, aber",
    "verrassend krachtige",
    "multilingüe",
    "compréhension du langage naturel",
]
multilingual_example_in_en = [
    "Hello, this is incredible",
    "Welcome to simple yet",
    "surprisingly powerful",
    "multilingual",
    "natural language understanding",
]

In [None]:
# Embed the three parallel sentence lists (French, Spanish, then English).
fr_result, es_result, en_result = (
    embed_text(sentences)
    for sentences in (french_sentences, spanish_sentences, english_sentences)
)

# Embed the mixed-language phrases and their English counterparts.
multilingual_result, multilingual_in_en_result = (
    embed_text(phrases)
    for phrases in (multilingual_example, multilingual_example_in_en)
)

In [None]:
# Display the embeddings computed for the three English sentences.
en_result

<tf.Tensor: shape=(3, 512), dtype=float32, numpy=
array([[ 0.10093049, -0.03811244,  0.01992672, ...,  0.06740664,
         0.01838606,  0.05333168],
       [ 0.03641612, -0.03177986,  0.06663753, ...,  0.03586748,
        -0.00162901,  0.04395662],
       [-0.00552391,  0.0059953 , -0.01480544, ..., -0.0398063 ,
         0.03113599, -0.0366742 ]], dtype=float32)>

## Visualizing Similarity

With text embeddings in hand, we can compute their arccos-based (angular) similarity to visualize how similar sentences are between languages. A darker color indicates the embeddings are semantically similar.

### Multilingual Similarity

In [None]:
# English phrases on the x axis, their multilingual counterparts on the y axis.
visualize_similarity(
    embeddings_1=multilingual_in_en_result,
    embeddings_2=multilingual_result,
    labels_1=multilingual_example_in_en,
    labels_2=multilingual_example,
    plot_title="Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)",
)

### Generate embeddings

Now we are ready to load our dataset and generate one 512-dimensional embedding per document.

In [None]:
# Location of the corpus CSV on the mounted Drive.
path = '/content/drive/My Drive/ea_corpora_no_nan.csv'

# Comma-separated file whose first line holds the column names.
df = pd.read_csv(
    filepath_or_buffer=path,
    sep=',',
    header=0,
)

In [None]:
# Embed every document's text; each cell of the new column holds the value
# returned by embed_text for that row.
df['embeddings'] = df['text'].apply(embed_text)

In [None]:
# Convert each stored tensor to NumPy and keep its first row as the
# document's embedding vector.
df['embeddings_numpy'] = df['embeddings'].apply(
    lambda tensor: tensor.numpy()[0]
)

In [None]:
# The tensor column is no longer needed once the NumPy copy exists.
df = df.drop('embeddings', axis='columns')

In [None]:
# Preview the first rows, which now include the embeddings_numpy column.
df.head()

Unnamed: 0,text,category,language,embeddings_numpy
0,"i read this book because in my town, everyone ...",APR,en,"[-0.01949591, 0.028128225, -0.011201788, 0.040..."
1,recipes appreciated by the family (small and l...,APR,en,"[-0.0028219419, -0.045204442, -0.016848372, 0...."
2,i say no to ease ..... and not to the author w...,APR,en,"[0.026812661, -0.011512865, -0.018854965, 0.04..."
3,milady has found a good vein: anita blake. bas...,APR,en,"[-0.05409476, 0.026707534, -0.0011387945, 0.00..."
4,"460 bc, somewhere in greece: ""gentlemen, i dec...",APR,en,"[0.044432636, 0.037698977, -0.054425534, -0.01..."


In [None]:
# Persist the frame with the raw embeddings column to Drive.
df.to_csv(
    path_or_buf='/content/drive/My Drive/ea_embeddings_muse.csv',
    index=False,
    encoding='utf-8',
)

The embeddings are stored in the "embeddings_numpy" column of the pandas dataframe. We save this source dataframe, then generate two more dataframes to use with our classifier: one with the scalar mean of each embedding vector, and one with the embedding flattened into 512 columns.

In [None]:
# Reduce each embedding vector to the scalar mean of its components.
df['embeddings_mean'] = df['embeddings_numpy'].apply(
    lambda vector: vector.mean()
)

In [None]:
# Preview the first rows with the added embeddings_mean column.
df.head()

Unnamed: 0,text,category,language,embeddings_numpy,embeddings_mean
0,"i read this book because in my town, everyone ...",APR,en,"[-0.01949591, 0.028128225, -0.011201788, 0.040...",0.002181
1,recipes appreciated by the family (small and l...,APR,en,"[-0.0028219419, -0.045204442, -0.016848372, 0....",-0.002749
2,i say no to ease ..... and not to the author w...,APR,en,"[0.026812661, -0.011512865, -0.018854965, 0.04...",0.001731
3,milady has found a good vein: anita blake. bas...,APR,en,"[-0.05409476, 0.026707534, -0.0011387945, 0.00...",-0.000196
4,"460 bc, somewhere in greece: ""gentlemen, i dec...",APR,en,"[0.044432636, 0.037698977, -0.054425534, -0.01...",0.000865


In [None]:
# Persist the frame, now including the embeddings_mean column, to Drive.
df.to_csv(
    path_or_buf='/content/drive/My Drive/ea_embeddings_muse_mean.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Build one flat record per document: [language, category, *embedding values].
# Iterating the three columns together avoids constructing a full row Series
# three times per document through chained `df.loc[i][...]` lookups, and does
# not depend on the frame having a default 0..n-1 index.
data = [
    [language, category, *embedding.tolist()]
    for language, category, embedding in zip(
        df['language'], df['category'], df['embeddings_numpy'])
]

In [None]:
# One row per document; columns are positional integers at this point.
df_flatten = pd.DataFrame(data=data)

In [None]:
# Name the two leading metadata columns; the embedding columns keep their
# integer labels.
df_flatten = df_flatten.rename(
    {0: 'language', 1: 'category'},
    axis='columns',
)

In [None]:
# Preview the flattened frame: language, category, then the embedding columns.
df_flatten.head()

Unnamed: 0,language,category,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513
0,en,APR,-0.019496,0.028128,-0.011202,0.040384,-0.076767,0.066849,0.036627,-9.3e-05,-0.061016,-0.042657,-0.032176,-0.00689,0.075234,0.018929,-0.074395,-0.034671,0.021954,-0.041494,0.029392,0.070452,-0.002293,-0.027982,0.063027,-0.021701,-0.004593,-0.008772,-0.02901,0.048651,0.039632,-0.038693,-0.00296,-0.014071,-0.069711,-0.047992,-0.054899,-0.065056,0.067219,0.014724,...,-0.068198,-0.014514,-0.049179,0.051022,0.062393,0.033015,-0.030491,0.042946,0.073327,0.072492,-0.06497,-0.073815,0.02686,0.045026,0.023724,-0.031888,-0.062634,0.052747,-0.05026,0.062537,0.014855,-0.064226,-0.034795,0.054743,0.015829,0.01992,-0.021271,-0.040884,0.045538,0.023266,-0.043784,-0.007727,0.005836,-0.046863,-0.003643,0.009728,-0.000819,-0.073423,0.030665,0.023028
1,en,APR,-0.002822,-0.045204,-0.016848,0.053938,0.039466,0.065693,-0.025186,-0.028859,0.004115,-0.023482,-0.043375,-0.004162,0.089176,0.031301,-0.018615,-0.008214,0.030708,-0.041311,0.04432,-0.101668,-0.024996,-0.040864,0.057218,0.00293,-0.044846,-0.023291,-0.049446,0.02772,0.036598,0.067288,-0.004805,-0.015493,-0.064611,-0.003714,-0.056511,0.022298,0.04141,0.048307,...,0.033055,-0.008896,-0.02822,0.027106,0.052413,-0.007867,-0.029445,0.034503,-0.062123,-0.035859,-0.004495,-0.059061,0.044188,0.035345,-0.017996,-0.009548,-0.073447,0.022766,0.040932,0.041743,-0.03836,-0.017726,-0.029402,0.0783,-0.021887,0.024873,-0.040557,0.007796,0.093732,0.044288,0.048659,0.073567,-0.085638,-0.064839,-0.07053,-0.083139,0.011784,-0.018194,0.042321,0.045972
2,en,APR,0.026813,-0.011513,-0.018855,0.043837,0.051915,0.044183,-0.004949,-0.061447,-0.014167,-0.051838,-0.003164,-0.041674,0.023665,0.032776,-0.092978,-0.018006,0.008219,-0.035709,0.002837,-0.095174,0.017576,-0.066266,0.052253,-0.013384,-0.036143,-0.05327,-0.043184,0.019745,-0.007334,0.017349,0.066458,0.023494,0.043763,-0.045874,-0.029353,0.035231,0.011219,-0.00387,...,-0.039585,-0.03619,-0.022402,0.042941,0.08923,0.033549,0.044666,0.032883,0.033441,-0.041564,-0.024675,-0.038195,0.014149,0.041846,-0.015046,-0.080431,-0.020183,-0.023784,0.049803,0.055847,-0.037482,0.008109,0.009762,0.049547,0.057941,-0.040882,0.036111,-0.01447,0.029501,-0.034775,0.032723,-0.029836,-0.03239,-0.014507,-0.048853,-0.016512,0.011402,-0.006926,0.050542,0.072263
3,en,APR,-0.054095,0.026708,-0.001139,0.000139,-0.019378,0.063965,0.02488,0.032522,-0.010525,-0.069841,-0.001611,0.076262,0.077604,-0.034608,-0.086723,-0.070627,0.035821,-0.065694,0.028925,0.062831,-0.013373,-0.002373,0.038586,-0.04197,-0.065884,0.001888,-0.028227,-0.037537,0.001648,-0.061022,0.038477,0.008977,-0.042702,-0.044809,-0.074358,0.005148,0.008144,-0.028245,...,-0.065113,-0.018468,-0.05709,0.082227,0.081851,-0.016397,-0.054665,0.014317,-0.017417,0.000247,0.008288,-0.028343,-0.051716,0.055224,0.028344,-0.075441,0.024648,0.029196,-0.044375,0.073961,0.021255,-0.022954,-0.035203,-0.028361,-0.039311,0.013222,0.030383,-0.026902,0.04977,0.010634,0.045528,-0.062564,0.006448,-0.019202,-0.047049,-0.063975,-0.059755,-0.069993,0.0654,-0.015806
4,en,APR,0.044433,0.037699,-0.054426,-0.01839,-0.04627,0.053568,0.022592,-0.059755,0.066712,-0.004233,0.003811,0.037518,0.063949,0.038765,-0.011554,0.024316,0.01133,-0.058621,0.040758,0.065246,0.010641,-0.063074,0.066435,-0.052723,-0.066983,0.064099,-0.012071,0.038219,-0.002849,-0.063286,0.0528,0.036358,-0.060843,-0.05515,-0.055619,0.034227,0.057048,-0.046706,...,0.06103,0.027919,-0.046093,-0.017373,0.058866,0.057888,0.024246,-0.019282,0.067311,0.065276,-0.059463,-0.064234,-0.058215,-0.04292,0.044207,-0.06743,-0.058655,0.016932,-0.049281,-0.036797,0.046069,-0.031627,0.020915,0.039407,-0.024232,0.049422,-0.033152,-0.04544,0.029371,0.062124,0.045455,0.008011,-0.014587,0.044229,0.035727,0.030701,0.032198,-0.065506,0.03998,0.059288


In [None]:
# Persist the flattened frame to Drive.
df_flatten.to_csv(
    path_or_buf='/content/drive/My Drive/ea_embeddings_muse_flatten.csv',
    index=False,
    encoding='utf-8',
)