# Word Embeddings : le modèle Word2Vec

## Imports

In [1]:
import sys

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import wordpunct_tokenize
from unidecode import unidecode

import time

## Chargement et traitement des phrases du corpus

In [2]:
class MySentences(object):
    """Stream a text file as tokenized, normalized sentences.

    Every iteration re-opens ``filename`` and yields one list of tokens per
    line, so the corpus is never loaded into memory as a whole and the object
    can be iterated several times. Tokens are produced by
    ``wordpunct_tokenize``, lower-cased and passed through ``unidecode``.
    No lemmatization is performed.
    """
    def __init__(self, filename):
        # Only the path is stored; the file is opened lazily in __iter__.
        self.filename = filename

    def __iter__(self):
        # The context manager guarantees the file handle is closed once the
        # iteration finishes (or the generator is closed early), instead of
        # relying on garbage collection.
        with open(self.filename, encoding='utf-8', errors="backslashreplace") as corpus_file:
            for line in corpus_file:
                yield [unidecode(w.lower()) for w in wordpunct_tokenize(line)]

In [3]:
# Path to the corpus text file, relative to the notebook
# (presumably one sentence per line — TODO confirm).
infile = "../../data/sents.txt"
# Lazy, re-iterable view over the corpus: nothing is read until iteration starts.
sentences = MySentences(infile)

### Détection des bigrams

Article intéressant sur le sujet : https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word-727b6cf723cf

In [4]:
# Collect collocation statistics over the streamed sentences to detect bigrams.
bigram_phrases = Phrases(sentences=sentences)

In [5]:
# Inspect the container type of the vocabulary gathered by Phrases; according
# to the original note it is meant to cover both single words and bigrams.
type(bigram_phrases.vocab)

dict

In [9]:
# Wrap the bigram statistics in a Phraser used to transform sentences.
bigram_phraser = Phraser(bigram_phrases)

### Extraction des trigrams

In [10]:
# Run phrase detection again on the bigram-transformed sentences so that
# bigram + word combinations can be merged into trigrams.
trigram_phrases = Phrases(sentences=bigram_phraser[sentences])

In [11]:
# Wrap the trigram statistics in a Phraser used to transform sentences.
trigram_phraser = Phraser(trigram_phrases)

### Création d'un corpus d'unigrams, bigrams, trigrams

In [12]:
# Apply the bigram then the trigram transformation to every sentence.
# NOTE: this materializes the whole transformed corpus as a list in memory.
corpus = [
    ngram_sentence
    for ngram_sentence in trigram_phraser[bigram_phraser[sentences]]
]

## Entraînement d'un modèle Word2Vec sur ce corpus

In [16]:
# Train the Word2Vec model on the n-gram corpus and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during the (long) training run.
start = time.perf_counter()
model = Word2Vec(
    corpus, # The corpus of n-grams built in the previous section
    vector_size=32, # Dimensionality of the output word vectors
    window=15, # Context window: up to 15 tokens on each side of the observed word
    min_count=5, # Ignore tokens that occur fewer than 5 times in the corpus
    workers=4, # Parallelize training over 4 worker threads
    epochs=5 # Number of training passes over the corpus
)

end = time.perf_counter()
print("Training time:", end - start, "seconds")

Training time: 532.0328087806702 seconds


### Sauver le modèle dans un fichier

In [17]:
# Persist the trained model to disk so it can be reloaded without retraining.
outfile = "../../data/tp3modele3.model"
model.save(outfile)

## Explorer le modèle 1 (window 5 et min_count 15)

### Charger le modèle en mémoire

In [105]:
# Load a previously saved model from disk and rebind `model` to it.
# NOTE(review): the training section of this notebook only saves
# tp3modele3.model; this file presumably comes from an earlier run with
# other hyperparameters — confirm it exists before running.
model = Word2Vec.load(fname="../../data/tp3modele1.model")

### Calculer la similarité entre deux termes

In [23]:
# Similarity score of the pair "ministre" / "president" in the loaded model.
model.wv.similarity(w1="ministre", w2="president")

0.85968995

In [24]:
# Similarity score of the pair "accident" / "collision" in the loaded model.
model.wv.similarity(w1="accident", w2="collision")

0.553854

In [29]:
# Similarity score of the pair "louer" / "acheter" in the loaded model.
model.wv.similarity(w1="louer", w2="acheter")

0.45428044

In [47]:
# Similarity score of the pair "congo" / "colonie" in the loaded model.
model.wv.similarity(w1="congo", w2="colonie")

0.47603005

In [106]:
# Similarity score of the pair "voiture" / "fiat" in the loaded model.
model.wv.similarity(w1="voiture", w2="fiat")

0.3170802

### Chercher les mots les plus proches d'un terme donné

In [64]:
# Ten entries whose vectors are closest to "ixelles" in the loaded model.
model.wv.most_similar(positive=["ixelles"], topn=10)

[('ixclles', 0.9585691094398499),
 ('ixelies', 0.954590380191803),
 ('ixellcs', 0.9540612697601318),
 ('lxelles', 0.9313374161720276),
 ('etterbeek', 0.9254962801933289),
 ('ixeiles', 0.9126607179641724),
 ('ixolles', 0.9038993120193481),
 ('elterbeek', 0.8926694989204407),
 ('ixellos', 0.8884459733963013),
 ('ixeues', 0.8764249682426453)]

In [32]:
# Ten entries whose vectors are closest to "accident" in the loaded model.
model.wv.most_similar(positive=["accident"], topn=10)

[('incendie', 0.8750811219215393),
 ('automobiliste', 0.8537577390670776),
 ('attentat', 0.8516902327537537),
 ('deraillement', 0.8303574323654175),
 ('asphyxie', 0.8239715099334717),
 ('orage', 0.8182372450828552),
 ('eboulement', 0.8173860907554626),
 ('imprudence', 0.8156234622001648),
 ('explosion', 0.8091423511505127),
 ('empoisonnement', 0.8024603128433228)]

In [35]:
# Ten entries whose vectors are closest to "ministre" in the loaded model.
model.wv.most_similar(positive=["ministre"], topn=10)

[('premier_ministre', 0.9177677631378174),
 ('ministre_des_finances', 0.8900978565216064),
 ('chancelier', 0.8873762488365173),
 ('ministere', 0.876909613609314),
 ('gouverneur_general', 0.8712723851203918),
 ('rapporteur', 0.8666431903839111),
 ('president', 0.8596898913383484),
 ('gouverneur', 0.8532224893569946),
 ('ancien_ministre', 0.8411591053009033),
 ('depute', 0.8330938816070557)]

In [36]:
# Ten entries whose vectors are closest to "course" in the loaded model.
model.wv.most_similar(positive=["course"], topn=10)

[('premiere_manche', 0.912184476852417),
 ('piste', 0.9003503918647766),
 ('competition', 0.8888614177703857),
 ('seconde_manche', 0.8786814212799072),
 ('poursuite', 0.8386077880859375),
 ('troisieme_fols', 0.8348135352134705),
 ('vitesse', 0.834746241569519),
 ('coursa', 0.8327763080596924),
 ('seconde_periode', 0.8248092532157898),
 ('courso', 0.8228955268859863)]

In [41]:
# Ten entries whose vectors are closest to "bruxelles" in the loaded model.
model.wv.most_similar(positive=["bruxelles"], topn=10)

[('bru_xelles', 0.9048832058906555),
 ('bruxeues', 0.8654956817626953),
 ('bruxolles', 0.8458254337310791),
 ('bruxellee', 0.8101943731307983),
 ('bruxellos', 0.7884137630462646),
 ('cureghem', 0.771838903427124),
 ('bruxelle', 0.7687538266181946),
 ('bruxellea', 0.7683116793632507),
 ('louvain', 0.7588357329368591),
 ('brnxelles', 0.7568449378013611)]

In [45]:
# Ten entries whose vectors are closest to "congo" in the loaded model.
model.wv.most_similar(positive=["congo"], topn=10)

[('bresil', 0.8838294744491577),
 ('congo_belge', 0.8726638555526733),
 ('katanga', 0.8712193369865417),
 ('credit_anversois', 0.8330605030059814),
 ('pakistan', 0.8241196274757385),
 ('canada', 0.8230084776878357),
 ('credit_communal', 0.8131006956100464),
 ('portugal', 0.8039137721061707),
 ('kivu', 0.8020492196083069),
 ('caire', 0.7976545095443726)]

### Faire des recherches complexes à travers l'espace vectoriel

In [48]:
# Combined query: entries close to 'paris' and 'londres' while pushed away
# from 'belgique'; printed as a plain list.
print(
    model.wv.most_similar(
        negative=['belgique'],
        positive=['paris', 'londres'],
    )
)

[('berlin', 0.8242867588996887), ('san_francisco', 0.8019291758537292), ('teheran', 0.7842291593551636), ('tokio', 0.7831219434738159), ('alger', 0.7690127491950989), ('nuremberg', 0.7668445706367493), ('prague', 0.7665507793426514), ('new_york', 0.7661935091018677), ('rome', 0.7609933018684387), ('strasbourg', 0.7599247694015503)]


In [50]:
# Combined query: entries close to "belgique" and "algerie" while pushed away
# from "congo".
model.wv.most_similar(
    negative=["congo"],
    positive=["belgique", "algerie"],
)

[('france', 0.7960983514785767),
 ('israel', 0.7824035286903381),
 ('angleterre', 0.7675928473472595),
 ('europe', 0.7521802186965942),
 ('yougoslavie', 0.7378687262535095),
 ('finlande', 0.7290837168693542),
 ('tchecoslovaquie', 0.7286390066146851),
 ('sicile', 0.7286205291748047),
 ('capitale', 0.7230349183082581),
 ('equipe_nationale', 0.718745768070221)]

## Explorer le modèle 2 (window 10 et min_count 10)

In [111]:
# Load a previously saved model from disk and rebind `model` to it.
# NOTE(review): the training section of this notebook only saves
# tp3modele3.model; this file presumably comes from an earlier run with
# other hyperparameters — confirm it exists before running.
model = Word2Vec.load(fname="../../data/tp3modele2.model")

In [74]:
# Similarity score of the pair "ministre" / "president" in the loaded model.
model.wv.similarity(w1="ministre", w2="president")

0.86451596

In [75]:
# Similarity score of the pair "accident" / "collision" in the loaded model.
model.wv.similarity(w1="accident", w2="collision")

0.7044664

In [76]:
# Similarity score of the pair "louer" / "acheter" in the loaded model.
model.wv.similarity(w1="louer", w2="acheter")

0.38610443

In [77]:
# Similarity score of the pair "congo" / "colonie" in the loaded model.
model.wv.similarity(w1="congo", w2="colonie")

0.5031018

In [112]:
# Similarity score of the pair "congo" / "protectorat" in the loaded model.
model.wv.similarity(w1="congo", w2="protectorat")

0.7196617

In [108]:
# Similarity score of the pair "voiture" / "fiat" in the loaded model.
model.wv.similarity(w1="voiture", w2="fiat")

0.26009968

In [117]:
# The corpus is lower-cased during tokenization, so the upper-case key "FIAT"
# is not in the model's vocabulary. The resulting KeyError is the point of
# this cell: catch it and display it instead of letting the cell fail, so a
# full "Run All" can continue past this demonstration.
try:
    print(model.wv.similarity("voiture", "FIAT"))
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: "Key 'FIAT' not present"

In [116]:
# Similarity score of the pair "voiture" / "opel" in the loaded model.
model.wv.similarity(w1="voiture", w2="opel")

0.37847576

In [118]:
# Similarity score of the pair "voiture" / "automobile" in the loaded model.
model.wv.similarity(w1="voiture", w2="automobile")

0.7493417

In [78]:
# Ten entries whose vectors are closest to "ixelles" in the loaded model.
model.wv.most_similar(positive=["ixelles"], topn=10)

[('ixellcs', 0.9417428970336914),
 ('ixelies', 0.9354187846183777),
 ('ixclles', 0.9324423670768738),
 ('ixeiles', 0.9160378575325012),
 ('ixeues', 0.9006257653236389),
 ('lxelles', 0.898628830909729),
 ('etterbeek', 0.8935530185699463),
 ('ixellos', 0.8927544355392456),
 ('ixolles', 0.8850511908531189),
 ('anderlccht', 0.8584580421447754)]

In [79]:
# Ten entries whose vectors are closest to "accident" in the loaded model.
model.wv.most_similar(positive=["accident"], topn=10)

[('incendie', 0.816557765007019),
 ('automobiliste', 0.8141542077064514),
 ('deraillement', 0.7992586493492126),
 ('imprudence', 0.799238920211792),
 ('terrible_accident', 0.7671449184417725),
 ('suicide', 0.7630370855331421),
 ('des_suites', 0.7622869610786438),
 ('asphyxie', 0.7614895701408386),
 ('grave_accident', 0.760545551776886),
 ('attentat', 0.7602865099906921)]

In [80]:
# Ten entries whose vectors are closest to "ministre" in the loaded model.
model.wv.most_similar(positive=["ministre"], topn=10)

[('premier_ministre', 0.9219110608100891),
 ('ministre_des_finances', 0.8900306224822998),
 ('chancelier', 0.8743346929550171),
 ('ministere', 0.8720221519470215),
 ('president', 0.8645159602165222),
 ('gouverneur_general', 0.8575769066810608),
 ('marechal_petain', 0.8500821590423584),
 ('gouverneur', 0.8500649929046631),
 ('mi_nistre', 0.8412063121795654),
 ('garde_des_sceaux', 0.8388331532478333)]

In [81]:
# Ten entries whose vectors are closest to "course" in the loaded model.
model.wv.most_similar(positive=["course"], topn=10)

[('piste', 0.902543842792511),
 ('premiere_manche', 0.8972537517547607),
 ('competition', 0.8789701461791992),
 ('etape', 0.8674094080924988),
 ('poursuite', 0.8504911661148071),
 ('minute', 0.8458600640296936),
 ('epreuve', 0.8436217308044434),
 ('distance', 0.8415300846099854),
 ('deuxieme_manche', 0.8403318524360657),
 ('coursa', 0.8356508016586304)]

In [82]:
# Ten entries whose vectors are closest to "bruxelles" in the loaded model.
model.wv.most_similar(positive=["bruxelles"], topn=10)

[('bruxolles', 0.8755651712417603),
 ('bruxeues', 0.8661399483680725),
 ('bru_xelles', 0.8528369665145874),
 ('bruxellee', 0.8324244618415833),
 ('brnxelles', 0.8063082695007324),
 ('bruxelle', 0.7799174785614014),
 ('bruxellos', 0.7748746871948242),
 ('bruxellea', 0.7604885697364807),
 ('anciennement', 0.7504033446311951),
 ('bruxehes', 0.7502194046974182)]

In [83]:
# Ten entries whose vectors are closest to "congo" in the loaded model.
model.wv.most_similar(positive=["congo"], topn=10)

[('congo_belge', 0.8879506587982178),
 ('katanga', 0.8625678420066833),
 ('bresil', 0.8565372228622437),
 ('burundi', 0.8373768925666809),
 ('kivu', 0.8110288977622986),
 ('rwanda', 0.8036089539527893),
 ('bud', 0.7995844483375549),
 ('credit_communal', 0.7959468364715576),
 ('monopole', 0.7924444675445557),
 ('pavs', 0.7862558960914612)]

In [84]:
# Combined query: entries close to 'paris' and 'londres' while pushed away
# from 'belgique'; printed as a plain list.
print(
    model.wv.most_similar(
        negative=['belgique'],
        positive=['paris', 'londres'],
    )
)

[('berlin', 0.8263168334960938), ('teheran', 0.7858179211616516), ('vienne', 0.7761972546577454), ('tokio', 0.7660727500915527), ('new_york', 0.7631980776786804), ('moscou', 0.762507438659668), ('alger', 0.7592315077781677), ('rome', 0.7559995055198669), ('nairobi', 0.7548523545265198), ('tanger', 0.7501811385154724)]


In [86]:
# Combined query: entries close to "belgique" and "algerie" while pushed away
# from "congo".
model.wv.most_similar(
    negative=["congo"],
    positive=["belgique", "algerie"],
)

[('france', 0.8318403363227844),
 ('yougoslavie', 0.7711620330810547),
 ('tchecoslovaquie', 0.7708252668380737),
 ('angleterre', 0.7625772356987),
 ('israel', 0.7429633736610413),
 ('finlande', 0.7363646626472473),
 ('reine_elizabeth', 0.7332343459129333),
 ('hollande', 0.7263011932373047),
 ('rhenanie', 0.7213075757026672),
 ('afrique_orientale', 0.7180145978927612)]

## Explorer le modèle 3 (window 15 et min_count 5)

In [109]:
# Load the saved model from disk and rebind `model` to it; this is the file
# written by the save cell of the training section.
model = Word2Vec.load(fname="../../data/tp3modele3.model")

In [88]:
# Similarity score of the pair "ministre" / "president" in the loaded model.
model.wv.similarity(w1="ministre", w2="president")

0.8727256

In [89]:
# Similarity score of the pair "accident" / "collision" in the loaded model.
model.wv.similarity(w1="accident", w2="collision")

0.6808486

In [91]:
# Similarity score of the pair "louer" / "acheter" in the loaded model.
model.wv.similarity(w1="louer", w2="acheter")

0.36893412

In [92]:
# Similarity score of the pair "congo" / "colonie" in the loaded model.
model.wv.similarity(w1="congo", w2="colonie")

0.585178

In [103]:
# Similarity score of the pair "voiture" / "fiat" in the loaded model.
model.wv.similarity(w1="voiture", w2="fiat")

0.37885135

In [93]:
# Ten entries whose vectors are closest to "ixelles" in the loaded model.
model.wv.most_similar(positive=["ixelles"], topn=10)

[('ixelies', 0.922954797744751),
 ('ixclles', 0.9190654754638672),
 ('ixellcs', 0.9181351065635681),
 ('ixeues', 0.9028537273406982),
 ('ixellos', 0.8826266527175903),
 ('etterbeek', 0.8713135719299316),
 ('ixeiles', 0.8711690306663513),
 ('lxelles', 0.8649140000343323),
 ('anderlccht', 0.8587402105331421),
 ('ixolles', 0.8572970628738403)]

In [94]:
# Ten entries whose vectors are closest to "accident" in the loaded model.
model.wv.most_similar(positive=["accident"], topn=10)

[('automobiliste', 0.8714745044708252),
 ('terrible_accident', 0.8180086016654968),
 ('tragique_accident', 0.8178462982177734),
 ('imprudence', 0.8140143156051636),
 ('grave_accident', 0.8063476085662842),
 ('incendie', 0.7967401146888733),
 ('accident_survenu', 0.7885307669639587),
 ('aiguilleur', 0.7825018763542175),
 ('penible_accident', 0.7821693420410156),
 ('empoisonnement', 0.7814363241195679)]

In [95]:
# Ten entries whose vectors are closest to "ministre" in the loaded model.
model.wv.most_similar(positive=["ministre"], topn=10)

[('premier_ministre', 0.9292763471603394),
 ('ministre_des_finances', 0.8907565474510193),
 ('chancelier', 0.8827196955680847),
 ('president', 0.8727256655693054),
 ('gouverneur_general', 0.866456925868988),
 ('ministere', 0.8601433038711548),
 ('cabinet', 0.851016640663147),
 ('rapporteur', 0.8467535376548767),
 ('gouvernement', 0.8454146385192871),
 ('foreign_office', 0.8395442962646484)]

In [96]:
# Ten entries whose vectors are closest to "course" in the loaded model.
model.wv.most_similar(positive=["course"], topn=10)

[('piste', 0.9024394750595093),
 ('premiere_manche', 0.8995347619056702),
 ('competition', 0.8678640127182007),
 ('etape', 0.8668174743652344),
 ('seconde_manche', 0.8572721481323242),
 ('poursuite', 0.8562272787094116),
 ('performance', 0.8447163105010986),
 ('coursa', 0.8381932973861694),
 ('troisieme_manche', 0.8372411727905273),
 ('derniere_epreuve', 0.8280327916145325)]

In [97]:
# Ten entries whose vectors are closest to "bruxelles" in the loaded model.
model.wv.most_similar(positive=["bruxelles"], topn=10)

[('bruxeues', 0.852427065372467),
 ('bruxolles', 0.8487377762794495),
 ('eruxelles', 0.7996711134910583),
 ('bru_xelles', 0.7989866733551025),
 ('bruxelle', 0.7928500771522522),
 ('bruxehes', 0.7743095755577087),
 ('bruxellee', 0.7738189697265625),
 ('bruxellos', 0.7721118927001953),
 ('xelles', 0.754986584186554),
 ('brnxelles', 0.7480197548866272)]

In [98]:
# Ten entries whose vectors are closest to "congo" in the loaded model.
model.wv.most_similar(positive=["congo"], topn=10)

[('congo_belge', 0.9028385281562805),
 ('katanga', 0.8994674682617188),
 ('bresil', 0.8394917249679565),
 ('kivu', 0.8351226449012756),
 ('burundi', 0.8339802622795105),
 ('credit_anversois', 0.8162552714347839),
 ('credit_foncier', 0.8106235265731812),
 ('rwanda', 0.8002264499664307),
 ('monopole', 0.7711485624313354),
 ('trust', 0.7693899273872375)]

In [99]:
# Combined query: entries close to 'paris' and 'londres' while pushed away
# from 'belgique'; printed as a plain list.
print(
    model.wv.most_similar(
        negative=['belgique'],
        positive=['paris', 'londres'],
    )
)

[('berlin', 0.8069779276847839), ('teheran', 0.782444417476654), ('san_francisco', 0.771608829498291), ('berlin_berlin', 0.7668904662132263), ('alger', 0.759846568107605), ('djakarta', 0.7566512823104858), ('munich', 0.7564538717269897), ('new_york', 0.7528628706932068), ('beyrouth', 0.75173020362854), ('istamboul', 0.7481323480606079)]


In [100]:
# Combined query: entries close to "belgique" and "algerie" while pushed away
# from "congo".
model.wv.most_similar(
    negative=["congo"],
    positive=["belgique", "algerie"],
)

[('france', 0.8169873952865601),
 ('capitulation', 0.7587389945983887),
 ('proclamation', 0.7529585957527161),
 ('tchecoslovaquie', 0.7514312267303467),
 ('mission_militaire', 0.7494114637374878),
 ('angleterre', 0.7486898303031921),
 ('equipe_italienne', 0.7407755851745605),
 ('afrique_orientale', 0.7371795177459717),
 ('commission_interalliee', 0.736163318157196),
 ('celebre_chorale', 0.7358113527297974)]

In [110]:
# Combined query: entries close to "femme" and "emploi" while pushed away
# from "homme".
model.wv.most_similar(
    negative=["homme"],
    positive=["femme", "emploi"],
)




[('journeej', 0.7396256923675537),
 ('selon_capacite', 0.7281174659729004),
 ('employee', 0.7109299302101135),
 ('maison_serieuse', 0.7045323252677917),
 ('pension_extra', 0.6990687251091003),
 ('perforatrice', 0.6894639730453491),
 ('firme_importante', 0.6804764866828918),
 ('mise_au_courant', 0.679768443107605),
 ('blouse_lingerie', 0.6772360801696777),
 ('deplacement_payes', 0.6689394116401672)]