**Entraînement du Word2Vec avec Google Colaboratory**
Avant tout, assurez-vous que les prérequis suivants sont remplis :

*   Google Colaboratory doit être synchronisé avec Google Drive

In [0]:
%%bash
# Download data_lelu_cleaned.txt unless a copy is already on Drive or in the
# working directory. (%%bash has to be the first line of the cell, so this
# comment sits below it.)
set -e  # stop at the first failing command so the cell reports the error
if [ ! -f "/content/drive/My Drive/data_lelu_cleaned.txt" ] && [ ! -f "data_lelu_cleaned.txt" ]; then
  # Install lrzip (-y: the cell has no interactive stdin to confirm the install)
  apt-get install -y lrzip
  # Download the archive
  wget https://github.com/LaraProject/notebooks/raw/master/data_lelu_cleaned.txt.lrz
  # Decompress it
  lrunzip data_lelu_cleaned.txt.lrz
fi

In [0]:
%%bash
# Concatenate the two datasets into data.txt.
# (%%bash has to be the first line of the cell, so this comment sits below it.)
facebook="/content/drive/My Drive/data_facebook_cleaned.txt"
# The freshly downloaded copy in /content takes precedence over the Drive copy.
if [ -f "/content/data_lelu_cleaned.txt" ]; then
  lelu="/content/data_lelu_cleaned.txt"
elif [ -f "/content/drive/My Drive/data_lelu_cleaned.txt" ]; then
  lelu="/content/drive/My Drive/data_lelu_cleaned.txt"
else
  # Fail loudly instead of leaving data.txt missing for the later cells.
  echo "data_lelu_cleaned.txt not found in /content or on Drive" >&2
  exit 1
fi
cat "$lelu" "$facebook" > data.txt

In [0]:
from gensim.models.callbacks import CallbackAny2Vec
import time
class EpochLogger(CallbackAny2Vec):
  '''Training callback that prints a per-epoch progress bar and time estimates.

  At the start of each batch the bar is redrawn in place with an ETA for the
  current epoch; at the end of each epoch the epoch duration and an ETA for
  the remaining epochs are printed.
  '''

  BAR_WIDTH = 25  # number of characters inside the progress bar

  def __init__(self):
    self.epoch = 0     # index of the epoch in progress (0-based)
    self.epochs = 0    # total number of epochs, read from the model in on_epoch_begin
    self.t0 = time.perf_counter()  # start of the current epoch (monotonic clock, seconds)
    self.curbatch = 0  # batches started since the beginning of the current epoch

  def on_batch_begin(self, model):
    '''Redraw the progress bar and the ETA of the current epoch.

    The number of batches is estimated as corpus_total_words / batch_words,
    so the counter can run past the estimate; the bar is clamped to stay
    BAR_WIDTH characters wide in that case.
    '''
    # NOTE(review): with workers > 1 this hook may run on several threads and
    # the counter is not synchronized -- confirm if exact counts matter.
    batch_size = model.batch_words
    number_of_batch = model.corpus_total_words / batch_size if batch_size else 0
    self.curbatch += 1
    # Average time per batch so far in this epoch.
    per_batch = (time.perf_counter() - self.t0) / self.curbatch
    eta = max(0, round((number_of_batch - self.curbatch) * per_batch, 2))
    filled = self.BAR_WIDTH
    if number_of_batch > 0:
      filled = min(self.BAR_WIDTH, int(self.curbatch / number_of_batch * self.BAR_WIDTH))
    done = "#" * filled
    dots = "." * (self.BAR_WIDTH - filled)
    # The trailing spaces erase leftovers of a previous, longer line.
    print(f"\r[{done}{dots}] ({eta} s - {self.curbatch}/{int(number_of_batch)})" + " "*6, end='', flush=True)

  def on_epoch_begin(self, model):
    '''Reset the per-epoch timer and batch counter and print the epoch header.'''
    self.epochs = model.epochs
    self.t0 = time.perf_counter()
    self.curbatch = 0
    print("Epoch {}/{}".format(self.epoch+1, self.epochs))

  def on_epoch_end(self, model):
    '''Print the epoch duration and an ETA for the remaining epochs.'''
    dt = time.perf_counter() - self.t0
    # Remaining epochs are assumed to take as long as the one that just ended.
    eta = max(0, (self.epochs - self.epoch - 1) * dt)
    eta_min = int(eta // 60)
    eta_s = round(eta % 60, 2)
    print("")  # finish the progress-bar line
    print(f"Duration: {dt} s, ETA: {eta_min} minutes {eta_s} seconds")
    self.epoch += 1

In [0]:
from gensim.models import FastText

class MyIter(object):
  '''Restartable iterable over the tokenized sentences of the corpus file.

  Lines are processed in pairs: even-numbered lines (0-based) lose their
  first 11 characters and are split on whitespace; odd-numbered lines lose
  their first 9 characters and are wrapped in "<start>" / "<end>" tokens.
  A final sentence of 100 "<unk>" tokens is always appended.

  Args:
    path: corpus file to read (defaults to "data.txt", as before).
  '''

  # Number of leading characters dropped from even / odd lines.
  # NOTE(review): presumably the length of a per-line label prefix --
  # confirm against the format of the cleaned data files.
  EVEN_PREFIX_LEN = 11
  ODD_PREFIX_LEN = 9

  def __init__(self, path="data.txt"):
    self.path = path

  def __iter__(self):
    # Stream the file line by line instead of loading it whole with readlines().
    with open(self.path, 'r', encoding="utf-8") as f:
      for count, line in enumerate(f):
        # Strip only the newline, so a last line without one keeps its final character.
        line = line.rstrip("\n")
        if count % 2 == 0:
          yield line[self.EVEN_PREFIX_LEN:].split()
        else:
          yield ["<start>"] + line[self.ODD_PREFIX_LEN:].split() + ["<end>"]
    # Emitted outside the prefix slicing, so all 100 "<unk>" tokens survive
    # whatever the parity of the file's line count.
    yield ["<unk>"] * 100

In [0]:
# Build the model, then scan the corpus once to collect the vocabulary
model = FastText(
  size=100,
  window=5,
  min_count=20,
  workers=2,
)
model.build_vocab(sentences=MyIter())
# Kept so the training cell can pass it to model.train()
total_examples = model.corpus_count

In [0]:
# Training (the callbacks list was missing its closing bracket, which made
# this cell a syntax error)
model.train(
  sentences=MyIter(),
  total_examples=total_examples,
  epochs=5,
  callbacks=[EpochLogger()],
)

In [0]:
# Save to Drive: first the word vectors alone, then the full model
model.wv.save_word2vec_format(
  "/content/drive/My Drive/word2vec_vectors.txt"
)
model.save(
  "/content/drive/My Drive/word2vec_model.bin"
)

**Conservez précieusement les nouveaux fichiers apparus sur votre Drive**