---
# Packages to install

In [None]:
!pip install transformers sentencepiece

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting

---
# Imports

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from transformers import TFCamembertModel, CamembertTokenizer
import numpy as np
from tqdm import tqdm
import pandas as pd
from google.colab import drive, files

---
# Load the tokenizer and CamemBERT

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
MODEL_NAME = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
camembert = TFCamembertModel.from_pretrained(MODEL_NAME)

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing TFCamembertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFCamembertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.


---
# Load data and reshape :

In [None]:
# Mount Google Drive at an absolute path so it matches the absolute CSV path read
# afterwards, whatever the current working directory is.
drive.mount("/content/drive")

Mounted at ./drive


In [None]:
# Sentence-pair dataset stored on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/cmantique/semFr.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,sentence1,sentence2,label
0,La médecine intégrative associe médecine conve...,La médecine classique exclut totalement les tr...,-1
1,La médecine intégrative associe médecine conve...,La naturopathie est combinée à la médecine con...,0
2,La médecine intégrative associe médecine conve...,La naturopathie est intégrée aux soins médicau...,1
3,Ces approches thérapeutiques complémentaires a...,"Ignorant la prévention, ces maladies sont trai...",-1
4,Ces approches thérapeutiques complémentaires a...,La prévention des maladies est un objectif maj...,0


In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

In [None]:
# Summary statistics of the numeric columns (here, the label column).
df.describe()

Unnamed: 0,label
count,100140.0
mean,0.0
std,0.816501
min,-1.0
25%,-1.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Pull the two sentence columns and the label column out as plain Python lists.
s1 = df["sentence1"].tolist()
s2 = df["sentence2"].tolist()
labels = df["label"].tolist()

In [None]:
def tokenize_sentences(sentences, max_length=128):
  """Return the input ids of each sentence, each exactly `max_length` long.

  `truncation=True` is required in addition to `padding="max_length"`: padding alone
  only lengthens short sentences, so longer ones would keep more than `max_length`
  ids and the rows could not be stacked into a rectangular array later on.
  """
  return [
      tokenizer(sentence, max_length=max_length, padding="max_length", truncation=True)["input_ids"]
      for sentence in tqdm(sentences)
  ]

tokenized_s1 = tokenize_sentences(s1)
tokenized_s2 = tokenize_sentences(s2)

100%|██████████| 100140/100140 [00:23<00:00, 4257.53it/s]
100%|██████████| 100140/100140 [00:20<00:00, 4788.92it/s]


In [None]:
# Shift the labels from {-1, 0, 1} to {0, 1, 2} so they can be one-hot encoded.
# Derived from the dataframe (not from `labels` itself) so that re-running this
# cell does not shift the values a second time.
labels = [label + 1 for label in df["label"].values.tolist()]

In [None]:
# One-hot encode the shifted labels over 3 classes, then convert back to nested Python lists.
one_hot_labels = tf.one_hot(labels, depth=3, on_value=1, off_value=0, axis=-1)
encoded_labels = one_hot_labels.numpy().tolist()

In [None]:
def make_train_val_test(s1, s2, labels):
  """Shuffle the sentence pairs and split them 80/10/10 into tf.data datasets.

  Args:
    s1: token-id sequences of the first sentences (expected to all have the same length).
    s2: token-id sequences of the second sentences, aligned with `s1`.
    labels: one label vector per pair, aligned with `s1`.

  Returns:
    (train_ds, val_ds, test_ds): datasets whose elements are ((s1, s2), label).

  Raises:
    ValueError: if the three inputs do not contain the same number of examples.
  """
  # Global seeding is kept so the split is the same from one run to the next.
  np.random.seed(42)

  first = np.asarray(s1)
  second = np.asarray(s2)
  targets = np.asarray(labels)

  n_examples = len(first)
  if len(second) != n_examples or len(targets) != n_examples:
    raise ValueError("s1, s2 and labels must have the same number of examples")

  # Shuffle indices instead of packing [ids, ids, label] rows into a single array:
  # such rows are ragged, which NumPy only accepted with a deprecation warning.
  order = np.random.permutation(n_examples)
  train_end = int(0.8 * n_examples)
  val_end = int(0.9 * n_examples)

  def to_dataset(indices):
    # Elements keep the ((s1, s2), label) structure expected by the model.
    return tf.data.Dataset.from_tensor_slices(((first[indices], second[indices]), targets[indices]))

  train_ds = to_dataset(order[:train_end])
  val_ds = to_dataset(order[train_end:val_end])
  test_ds = to_dataset(order[val_end:])

  return train_ds, val_ds, test_ds

In [None]:
# Build the three splits from the tokenized sentences and the one-hot labels.
train_ds, val_ds, test_ds = make_train_val_test(
    s1=tokenized_s1,
    s2=tokenized_s2,
    labels=encoded_labels,
)

  dataset = np.array(dataset)


In [None]:
# Sanity check: print the first training example (both sentences and the label) with their types.
for element in train_ds.take(1):
  print(f"{element[0][0]}, {type(element[0][0])}\n\n{element[0][1]}, {type(element[0][1])}\n\n{element[1]}, {type(element[1])}\n\n\n")

[    5   153   411     8  1873  5238 23399    30    17    11  1464   135
   342     8 16084    10 18787     9     6     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1], <class 'tensorflow.python.framework.ops.EagerTensor'>

[    5 19340   135   342    43    28  1873  5238   304    24    20 16084
    10    32   586    18   236   540   359     9     6     1     1     1
     1     1     1

In [None]:
### BATCH SIZE ###

# Other values that were tried are kept below for reference. The "16gb" / "40gb"
# notes presumably refer to the accelerator memory each value was used with.
# batchsize = 8
# batchsize = 512
batchsize = 1024  # 16gb
# batchsize = 2048
# batchsize = 4096  # 40gb

In [None]:
# Batch all three splits: the test split was previously left unbatched, so its
# elements did not have the batch dimension the train/validation elements have.
# NOTE: not idempotent - re-running this cell batches the datasets a second time.
train_ds = train_ds.batch(batchsize)
val_ds = val_ds.batch(batchsize)
test_ds = test_ds.batch(batchsize)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split and let tf.data choose the prefetch buffer size.
train_ds, val_ds, test_ds = [
    dataset.cache().prefetch(buffer_size=AUTOTUNE)
    for dataset in (train_ds, val_ds, test_ds)
]

---
# Create cmantique model :

In [None]:
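# Earlier variant of the model with a single intermediate Dense layer, kept commented out for reference.
# class Cmantique(Model):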
#   def __init__(self, bert=camembert, max_len=128):
#     super(Cmantique, self).__init__()
#     self.bert_model = bert
#     for layer in self.bert_model.layers:
#       layer.trainable = False

#     self.max_len = max_len
#     self.dense_intermediate = Dense(512)
#     self.dense_final = Dense(512, activation="tanh")
#     self.classifier = Dense(3)

#   def call(self, pair):
#     s1, s2 = pair

#     mask1 = tf.cast(tf.math.not_equal(s1, 0), tf.int32)
#     mask2 = tf.cast(tf.math.not_equal(s2, 0), tf.int32)

#     embeddings1 = self.bert_model(input_ids=s1, attention_mask=mask1)[0]
#     embeddings2 = self.bert_model(input_ids=s2, attention_mask=mask2)[0]

#     avg_embeddings1 = tf.math.reduce_mean(embeddings1, axis=1)
#     avg_embeddings2 = tf.math.reduce_mean(embeddings2, axis=1)

#     dense_intermediate_output1 = self.dense_intermediate(avg_embeddings1)
#     dense_intermediate_output2 = self.dense_intermediate(avg_embeddings2)
#     dense_final_output1 = self.dense_final(dense_intermediate_output1)
#     dense_final_output2 = self.dense_final(dense_intermediate_output2)

#     diff = tf.abs(dense_final_output1 - dense_final_output2)
#     classifier_output = self.classifier(diff)
#     return classifier_output

#   def get_sentence_embedding(self, sentence):
#     mask = tf.cast(tf.math.not_equal(sentence, 0), tf.int32)
#     embedding = self.bert_model(input_ids=sentence, attention_mask=mask)[0]
#     avg_embedding = tf.math.reduce_mean(embedding, axis=1)
#     dense_intermediate_output = self.dense_intermediate(avg_embedding)
#     dense_final_output = self.dense_final(dense_intermediate_output)
#     return dense_final_output

In [None]:
class Cmantique(Model):
  """Siamese sentence-pair classifier on top of a frozen CamemBERT encoder.

  Each sentence is encoded by the shared encoder, mean-pooled over the token axis and
  passed through three shared Dense layers. The absolute difference between the two
  sentence vectors is then mapped to 3 logits.

  Args:
    bert: encoder called as `bert(input_ids=..., attention_mask=...)`; element 0 of its
      output is used as the per-token embeddings.
    max_len: stored on the instance; the forward pass does not use it.
    pad_token_id: id of the padding token, used to build the attention mask. Defaults
      to 1, the id the tokenizer pads with in this notebook. The mask used to be
      computed against 0, which never matched the padding, so padded positions were
      attended to.
  """

  def __init__(self, bert=camembert, max_len=128, pad_token_id=1):
    super(Cmantique, self).__init__()
    self.bert_model = bert
    # Freeze the encoder: only the Dense layers below are trained.
    for layer in self.bert_model.layers:
      layer.trainable = False

    self.max_len = max_len
    self.pad_token_id = pad_token_id
    self.dense_intermediate1 = Dense(512)
    self.dense_intermediate2 = Dense(512)
    self.dense_final = Dense(512, activation="tanh")
    self.classifier = Dense(3)

  def _embed(self, token_ids):
    """Encode a batch of token ids into the final (tanh) sentence vectors."""
    # 1 for real tokens, 0 for padding.
    mask = tf.cast(tf.math.not_equal(token_ids, self.pad_token_id), tf.int32)
    token_embeddings = self.bert_model(input_ids=token_ids, attention_mask=mask)[0]
    # NOTE: the mean is taken over every position of the token axis, padded ones included.
    pooled = tf.math.reduce_mean(token_embeddings, axis=1)
    hidden = self.dense_intermediate1(pooled)
    hidden = self.dense_intermediate2(hidden)
    return self.dense_final(hidden)

  def call(self, pair):
    """Return the 3 class logits for a pair `(s1, s2)` of token-id batches."""
    s1, s2 = pair
    diff = tf.abs(self._embed(s1) - self._embed(s2))
    return self.classifier(diff)

  def get_sentence_embedding(self, sentence):
    """Return the sentence vectors (output of the final Dense layer) for a batch of token ids."""
    return self._embed(sentence)

In [None]:
# Instance that is compiled and trained below (defaults spelled out explicitly).
cmantique = Cmantique(bert=camembert, max_len=128)

In [None]:
# Separate instance used only to print the architecture.
resume = Cmantique()
dummy_ids = np.zeros((1, 1), dtype=np.int32)
dummy_input = ([dummy_ids], [dummy_ids])
# Run one forward pass so the layers are built and their shapes are known.
_ = resume(dummy_input)
# Display the model summary.
resume.summary()

Model: "cmantique_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_camembert_model (TFCamem  multiple                 110621952 
 bertModel)                                                      
                                                                 
 dense_4 (Dense)             multiple                  393728    
                                                                 
 dense_5 (Dense)             multiple                  262656    
                                                                 
 dense_6 (Dense)             multiple                  262656    
                                                                 
 dense_7 (Dense)             multiple                  1539      
                                                                 
Total params: 111,542,531
Trainable params: 920,579
Non-trainable params: 110,621,952
___________________________________

In [None]:
# The classifier outputs raw logits, hence from_logits=True on the loss.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
cmantique.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=["accuracy"],
)

In [None]:
# Write TensorBoard logs to ./logs, with weight histograms recorded every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="./logs",
    histogram_freq=1,
)

In [None]:
# The datasets are already batched, so `batch_size` is not passed to `fit`:
# Keras documents that it must not be specified for dataset inputs.
cmantique.fit(
    train_ds,
    validation_data=val_ds,
    epochs=40,
    callbacks=[tensorboard_callback],
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40

In [None]:
!zip -r ./logs_5.zip ./logs
files.download("./logs_6.zip")

  adding: logs/ (stored 0%)
  adding: logs/validation/ (stored 0%)
  adding: logs/validation/events.out.tfevents.1687779118.d254e62e2c46.3281.3.v2 (deflated 77%)
  adding: logs/train/ (stored 0%)
  adding: logs/train/events.out.tfevents.1687778966.d254e62e2c46.3281.2.v2 (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---
# Tests :

In [None]:
# Encode one sample sentence as a TensorFlow tensor of token ids.
test_sentence = "test du la fonction du model."
test = tokenizer.encode(test_sentence, return_tensors="tf")

In [None]:
# Shape of the sentence embedding produced for the sample sentence.
sentence_embedding = cmantique.get_sentence_embedding(test)
sentence_embedding.numpy().shape