### Learning cross-lingual word embeddings with Skip-gram Negative Sampling

Author: Jeanne Elizabeth Daniel

November 2019

We construct a massive multilingual vocabulary from all the words found in the training set. 

After training the embedding models on the questions found in the training set, we extract the cross-lingual word embeddings. We construct a sentence embedding by taking the average of all the word embeddings in the sentence (Wieting et al., 2015). Then we train $k$-nearest neighbour classifiers to predict the most appropriate answer, with $k = 1, 5, 25, 50$. The best validation scores were achieved by using cosine as the distance metric and using weighted majority voting, where the contribution of each nearest neighbour is inversely proportional to its distance from the query vector.

In [2]:
import pandas as pd
import gensim
import numpy as np
from gensim.models import Word2Vec, FastText

In [3]:
import preprocess_data

In [4]:
# Read the ';'-delimited dataset and keep only the columns used in this notebook.
columns_of_interest = ['helpdesk_question', 'helpdesk_reply', 'set', 'low_resource']
data = pd.read_csv('dataset_7B', delimiter = ';', engine = 'python')[columns_of_interest]

In [5]:
# Map every distinct training-set reply to an integer class ID.
# value_counts() orders replies by descending frequency, so ID 0 is the most
# frequent reply. Enumerating the value_counts index directly avoids depending
# on the column names produced by reset_index(), which differ between pandas
# versions.
train_replies = data.loc[data['set'] == 'Train', 'helpdesk_reply']
responses = {reply: class_id for class_id, reply in enumerate(train_replies.value_counts().index)}

In [6]:
def create_word2vec(data, skip_gram = 1, size = 100):
    
    """ Create word2vec embedding model. Word2Vec has two variants - CBOW and SGNS.  
    
    Args:
        data: dataframe that contains the questions in its 'helpdesk_question' column
        skip_gram: binary indicator to use either skip-gram negative sampling (1) or 
            continuous bag-of-words (0) (Mikolov et al., 2013); passed to Word2Vec as `sg`
        size: number of dimensions in embedding
    
    Returns:
        Trained embedding model
    
    """
    
    # Only read the question column. Assigning a new label into this Series
    # would append a non-question element to the training corpus and write
    # into a column that belongs to the caller's dataframe.
    documents = data['helpdesk_question']
    # The extra positional arguments (0, False) are forwarded to
    # preprocess_data.preprocess; their meaning is defined in that module.
    processed_docs = documents.apply(preprocess_data.preprocess, args = (0, False))
    print(len(processed_docs))
    # min_count = 1 keeps every word seen in the corpus; seed fixes the initialisation.
    # NOTE: `size` is the gensim 3.x keyword for the embedding dimension.
    model = Word2Vec(processed_docs, min_count = 1, sg = skip_gram, seed= 1, size = size,
                     negative = 5, ns_exponent =  0.75, workers = 5)  
    
    return model
    

In [7]:
def create_sentence_embeddings(embedding_model, sentence):
    
    """ We create sentence embeddings by averaging the embeddings of the words found in the sentence. 
    If no words match, we return a vector of small random values.
    
    Args:
        embedding_model: trained embedding model; its keyed vectors are read through
            `.wv` (`.wv.vocab`, `.wv.vector_size` and `.wv[word]` are used)
        sentence: list of words found in sentence
        
    Returns:
        A sentence embedding: the mean of the vectors of all in-vocabulary words. 
        When the sentence is empty or contains no known word, a random vector with 
        components in [-0.005, 0.005) is returned instead.
    
    """
        
    keyed_vectors = embedding_model.wv
    # Out-of-vocabulary words are skipped so they do not dilute the average.
    known_vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors.vocab]
    
    if not known_vectors:
        # Size the fallback from the model instead of assuming 100 dimensions.
        return (np.random.random(keyed_vectors.vector_size) - 0.5)/100
   
    return np.mean(known_vectors, axis = 0)

In [8]:
def create_batch(df, embedding_model, D): 
        
    """ Create batch of feature vectors in matrix form
    
    Args:
        df: dataset of questions (the 'helpdesk_question' column is used)
        embedding_model: pretrained embedding model
        D: size of embedding; must equal the length of the vectors returned by
            create_sentence_embeddings
        
    Returns:
        matrix of shape (number of rows in df, D) where rows are embeddings of questions
    
    """    
    
    matrix = np.zeros((df.shape[0], D, ))
    all_text = list(df['helpdesk_question'].apply(preprocess_data.preprocess)) 

    # Pair rows with texts: zip stops at the shorter of the two, so every row
    # that has a text is filled (including the last one) and any surplus text
    # beyond the number of rows is ignored rather than indexing out of bounds.
    for i, tokens in zip(range(matrix.shape[0]), all_text):
        matrix[i] = create_sentence_embeddings(embedding_model, tokens)
            
    return matrix 

def label_preprocess(entry):
        
    """ Translate a response into its integer ID for easy comparison and classification
    
    Args:
        entry: response text to look up in the module-level `responses` dict, 
            which holds the template responses with their corresponding IDs
        
    Return: 
        the ID stored for the response, or len(responses) as the default
        "unknown" class when the response is not a key of the dict
        
    """
    
    unknown_class = len(responses)
    return responses.get(entry, unknown_class)

In [9]:
# Partition the dataset by its 'set' column; the low-resource test subset
# additionally requires the 'low_resource' column to hold the string 'True'.
train_df   = data[data['set'] == 'Train']
valid_df   = data[data['set'] == 'Valid']
test_df    = data[data['set'] == 'Test']
test_LR_df = data[(data['set'] == 'Test') & (data['low_resource'] == 'True')]

# Integer class labels for each partition, derived from the replies.
y_train   = train_df['helpdesk_reply'].apply(label_preprocess)
y_valid   = valid_df['helpdesk_reply'].apply(label_preprocess)
y_test    = test_df['helpdesk_reply'].apply(label_preprocess)
y_test_LR = test_LR_df['helpdesk_reply'].apply(label_preprocess)

In [None]:
# Train the embedding model on the training questions using the function's
# defaults (skip_gram = 1, size = 100).
w2v = create_word2vec(train_df)

In [31]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn_model(x_train, y_train, metric, k, weights):
    
    """ Build a k-nearest neighbour classifier and fit it to the sentence embeddings
    
    Args:
        x_train: matrix of sentence embeddings
        y_train: class labels associated with each sentence embedding 
        metric: distance metric to use
        k: number of neighbours to consider
        weights: voting scheme handed to KNeighborsClassifier - uniform voting 
            (equal weighting) or weighted voting (the weight of each vote is 
            inversely proportional to its distance to the query)
        
    Returns:
        The fitted KNN classifier
    
    """
    
    knn_settings = {'n_neighbors': k, 'weights': weights, 'metric': metric}
    classifier = KNeighborsClassifier(**knn_settings)
    classifier.fit(x_train, y_train)
    return classifier

### Results for Word2Vec Embeddings

In [32]:
# Embed every partition with the trained Word2Vec model; D = 100 is the
# embedding size handed to create_batch.
x_train = create_batch(df = train_df, embedding_model = w2v, D = 100)
x_valid = create_batch(df = valid_df, embedding_model = w2v, D = 100)
x_test  = create_batch(df = test_df, embedding_model = w2v, D = 100)
x_test_LR = create_batch(df = test_LR_df, embedding_model = w2v, D = 100)

96412
96413
31955
31955
32233
32233
6656
6656


In [13]:
# Fit the 1-nearest-neighbour classifier (cosine metric, distance-weighted
# voting) and report its accuracy on the training and validation embeddings.
clf_1NN = train_knn_model(x_train = x_train, y_train = y_train, metric = 'cosine', 
                          k = 1, weights = 'distance')
score = clf_1NN.score(x_train, y_train)
print("Train accuracy", score)
score = clf_1NN.score(x_valid, y_valid)
print("Validation accuracy", score)

1 Nearest Neighbours
Train accuracy 0.9680226527818114
Validation accuracy 0.4870286340165858


In [14]:
# Fit the 5-nearest-neighbour classifier (cosine metric, distance-weighted
# voting) and report its validation accuracy.
clf_5NN = train_knn_model(x_train = x_train, y_train = y_train, metric = 'cosine', 
                          k = 5, weights = 'distance')
score = clf_5NN.score(x_valid, y_valid)
print("Validation accuracy", score)

5 Nearest Neighbours
Validation accuracy 0.5318103583163825


In [15]:
# Fit the 25-nearest-neighbour classifier (cosine metric, distance-weighted
# voting) and report its validation accuracy.
clf_25NN = train_knn_model(x_train = x_train, y_train = y_train, metric = 'cosine', 
                          k = 25, weights = 'distance')
score = clf_25NN.score(x_valid, y_valid)
print("Validation accuracy", score)

25 Nearest Neighbours
Validation accuracy 0.5711469253637929


In [16]:
# Fit the 50-nearest-neighbour classifier (cosine metric, distance-weighted
# voting) and report its validation accuracy.
clf_50NN = train_knn_model(x_train = x_train, y_train = y_train, metric = 'cosine', 
                          k = 50, weights = 'distance')
score = clf_50NN.score(x_valid, y_valid)
print("Validation accuracy", score)

50 Nearest Neighbours
Validation accuracy 0.574526678141136


In [17]:
# Accuracy of each fitted classifier on the full test partition.
score = clf_1NN.score(x_test, y_test)
print("Test accuracy on 1-NN", score)
score = clf_5NN.score(x_test, y_test)
print("Test accuracy on 5-NN", score)
score = clf_25NN.score(x_test, y_test)
print("Test accuracy on 25-NN", score)
score = clf_50NN.score(x_test, y_test)
print("Test accuracy on 50-NN", score)

Test accuracy on 1-NN 0.48766791797226444
Test accuracy on 5-NN 0.5329631123382869
Test accuracy on 25-NN 0.5728601123072627
Test accuracy on 50-NN 0.5742872211708497


In [18]:
# Accuracy of each fitted classifier on the low-resource subset of the test partition.
score = clf_1NN.score(x_test_LR, y_test_LR)
print("LR Test accuracy on 1-NN", score)
score = clf_5NN.score(x_test_LR, y_test_LR)
print("LR Test accuracy on 5-NN", score)
score = clf_25NN.score(x_test_LR, y_test_LR)
print("LR Test accuracy on 25-NN", score)
score = clf_50NN.score(x_test_LR, y_test_LR)
print("LR Test accuracy on 50-NN", score)

LR Test accuracy on 1-NN 0.3701923076923077
LR Test accuracy on 5-NN 0.4230769230769231
LR Test accuracy on 25-NN 0.4792668269230769
LR Test accuracy on 50-NN 0.482421875


### Assessing the quality of cross-lingual embeddings


We design a small experiment to assess the quality of the cross-lingual embeddings for English and Zulu. The English sentences were synthesized based on frequently occurring questions found in the dataset. The Zulu translations were obtained using Google Translate and verified by a Zulu speaker. We compute the sentence embedding for each English-Zulu translation pair and calculate the cosine distance between the two embeddings. 

In [25]:
# English-Zulu sentence pairs (A-E); each eng_X / zulu_X pair is embedded
# and compared in the cells below.
eng_A  = "can you drink coca cola when you are pregnant"
zulu_A = "uma ngikhulelwe ngingaphuza i-coca-cola"

eng_B  = "when can i stop breastfeeding"
zulu_B = "ngingakuyeka nini ukuncelisa ibele"

eng_C  = "when can I start feeding my baby solid food"
zulu_C = "ngingaqala nini ukondla ingane yami ukudla okuqinile"

eng_D  = "what are the signs of labour"
zulu_D = "yiziphi izimpawu zokubeletha"

eng_E  = "when can i find out if my baby is a boy or a girl"
zulu_E = "ngingathola kanjani ukuthi ingane yami umfana noma intombazane"

In [26]:
# Display the sentence embedding of the first English sentence as a sanity check.
create_sentence_embeddings(w2v, preprocess_data.preprocess(eng_A))

array([-0.20364249,  0.06353891, -0.13633114, -0.2765745 , -0.21156752,
        0.4096739 ,  0.28027058,  0.19750798,  0.46120617, -0.42735687,
       -0.13965902,  0.23786306,  0.22086824,  0.47389963,  0.64499557,
       -0.4814254 ,  0.01645028,  0.40576452,  0.21119314, -0.0466147 ,
        0.3060975 , -0.7800398 ,  0.47498357, -0.5873072 , -0.5012711 ,
        0.41716594, -0.21051793, -0.25111225, -0.44785514,  0.11893415,
       -0.11089575,  0.68436056,  0.307312  ,  0.05488437, -0.60432065,
       -0.8261534 ,  0.33817112,  0.17779382,  0.69285756, -0.45616245,
       -0.51282483,  0.02924656,  0.24955279, -0.47394207,  0.7558563 ,
       -0.17320566,  0.340093  ,  0.04293355, -0.19062154,  0.23164397,
        0.6927828 ,  0.29991233,  0.3498487 , -0.8781517 ,  0.36495525,
        0.1165664 ,  0.08023944, -0.15665478, -0.62748575,  0.25193146,
        0.03222989, -0.7912734 ,  0.5291057 ,  0.6205608 ,  0.27305987,
        0.24024275,  0.56332576,  0.7538656 , -0.34221843, -0.06

In [27]:
# Sentence embeddings of the five English sentences, computed in order A-E.
embed_eng_A, embed_eng_B, embed_eng_C, embed_eng_D, embed_eng_E = (
    create_sentence_embeddings(w2v, preprocess_data.preprocess(text))
    for text in (eng_A, eng_B, eng_C, eng_D, eng_E)
)

In [28]:
# Sentence embeddings of the five Zulu sentences, computed in order A-E.
embed_zulu_A, embed_zulu_B, embed_zulu_C, embed_zulu_D, embed_zulu_E = (
    create_sentence_embeddings(w2v, preprocess_data.preprocess(text))
    for text in (zulu_A, zulu_B, zulu_C, zulu_D, zulu_E)
)

In [29]:
from scipy.spatial.distance import cosine

In [30]:
# scipy's `cosine` is the cosine distance (1 - cosine similarity), so smaller
# values mean the English and Zulu embeddings are closer.
print("Sentence A:", cosine(embed_eng_A, embed_zulu_A))
print("Sentence B:", cosine(embed_eng_B, embed_zulu_B))
print("Sentence C:", cosine(embed_eng_C, embed_zulu_C))
print("Sentence D:", cosine(embed_eng_D, embed_zulu_D))
print("Sentence E:", cosine(embed_eng_E, embed_zulu_E))

Sentence A: 0.31717658042907715
Sentence B: 0.6085197376973056
Sentence C: 0.5985755920410156
Sentence D: 0.6016709804534912
Sentence E: 0.6159537434577942
