# Vecteurs de mot
# 17/09/2021
# @author : jeremylhour

In [11]:
import numpy as np

import fasttext.util

from sklearn.decomposition import TruncatedSVD

In [18]:
def reduce_to_k_dim(M, k=2):
    """ Reduce a matrix with one row per word to k columns using truncated SVD.

        Relies on the following SVD implementation from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (array-like of shape (number of words, number of features)): matrix to reduce,
                    one row per word (e.g. a co-occurrence count matrix or a stack of word vectors)
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy array of shape (number of words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this corresponds to U * S
    """
    n_iters = 10     # number of iterations handed to `TruncatedSVD`
    # np.shape reads the `.shape` attribute when there is one, so the row count
    # is also available for inputs that do not support len().
    n_words = np.shape(M)[0]
    print(f"Running Truncated SVD over {n_words} words...")
    # Honour the requested output dimension k (it used to be hard-coded to 2).
    svd = TruncatedSVD(n_components=k, n_iter=n_iters, random_state=42)
    svd.fit(M)
    M_reduced = svd.transform(M)
    print("Done.")
    return M_reduced

def plot_embeddings(M_reduced, word2ind, words):
    """ Intended to scatter-plot, with a label next to each point, the embeddings
        of the words given in "words" (and only those, not every word in
        M_reduced / word2ind).

        NOTE: the drawing step is not implemented yet. For now the function only
        looks up the matrix index of each requested word and returns None.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensional word embeddings
            word2ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # Resolve each requested word to its index; a word missing from word2ind
    # yields None because the lookup goes through dict.get.
    ind_to_plot = []
    for word in words:
        ind_to_plot.append(word2ind.get(word))

In [3]:
# Load the fastText model with word vectors of size 100
ft = fasttext.load_model('fastText/cc.fr.100.bin')

In [25]:
# Small vocabulary to embed, plus a lookup from each word to its row index in M.
liste_de_mots = ['roi', 'reine', 'homme', 'femme', 'shampooing', 'voiture']
word2ind = {word: idx for idx, word in enumerate(liste_de_mots)}

# One fastText vector per word, stacked row-wise in vocabulary order.
M = np.array([ft.get_word_vector(word) for word in liste_de_mots])

# Reduce the word vectors to 2 dimensions.
M_reduced = reduce_to_k_dim(M, k=2)

Running Truncated SVD over 6 words...
Done.


In [None]:
# Placeholder list, left empty here (presumably the words to visualize - confirm).
words = []