<a href="https://colab.research.google.com/github/MouadEttali/Text_Mining/blob/main/Transformers_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [None]:
!pip install transformers
#!pip install torch

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.7 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

# Packages

In [None]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import time
import pandas as pd

# Load the pretrained BERT model

In [None]:
# Name of the Hugging Face checkpoint used below to load both the model and its tokenizer.
model_name = "bert-base-cased" # cased checkpoint: letter case is taken into account

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the pretrained BERT encoder and its matching tokenizer from the same checkpoint.
model = BertModel.from_pretrained(model_name, output_hidden_states=True) # also return the outputs of the hidden layers
tokenizer = BertTokenizer.from_pretrained(model_name) # each model has its own tokenizer

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
# Move the model's weights to the selected device (nothing changes when it is 'cpu').
model = model.to(device)

# Test of the tokenizer

In [None]:
# Tokenize one sample sentence by hand and map its tokens to vocabulary ids.
sample = "let's encode a sentence"
print("raw: ", sample)
tokenized = tokenizer.tokenize(sample)[:512] # keep at most 512 tokens (BERT's maximum sequence length), even if the sequence is longer
print('tokenized: ', tokenized) 
input_inds = tokenizer.convert_tokens_to_ids(tokenized)
print('encoded: ', input_inds)
## "encode" is not in the vocabulary, so it is split into 'en' + '##code'
## same principle as BPE segmentation

raw:  let's encode a sentence
tokenized:  ['let', "'", 's', 'en', '##code', 'a', 'sentence']
encoded:  [1519, 112, 188, 4035, 13775, 170, 5650]


In [None]:
# Wrap the ids in an outer list to get a batch holding one sentence, then move
# the tensor to the selected device (GPU or CPU).
tensor_sentence = torch.tensor([input_inds]).to(device)
tensor_sentence

tensor([[ 1519,   112,   188,  4035, 13775,   170,  5650]], device='cuda:0')

In [None]:
# Forward pass used for feature extraction only: no_grad() avoids building the
# autograd graph, since nothing is back-propagated in this notebook.
with torch.no_grad():
  output = model(tensor_sentence)
hidden_states = output.hidden_states
len(hidden_states) # the first entry corresponds to the embedding layer, the rest to the encoder layers

13

In [None]:
# Shape of the first hidden state: (batch of 1 sentence, tokens, hidden size).
hidden_states[0].shape

torch.Size([1, 7, 768])

In [None]:
# Convert the embedding-layer output of the single sentence to a NumPy array:
#   detach() removes it from the autograd graph, cpu() brings it back from CUDA.
embedding_output = output.hidden_states[0][0].detach().cpu().numpy()

# Keep only the layers after the embedding layer. Slicing from `output` (rather
# than from `hidden_states` itself) keeps this cell idempotent: re-running it
# no longer drops one more layer each time.
hidden_states = output.hidden_states[1:]

In [None]:
# One vector per remaining layer: the mean over the token axis for the only sentence in the batch.
word_embeddings = []
for layer_embeddings in hidden_states:
  token_vectors = layer_embeddings[0]  # first (and only) sentence of the batch
  word_embeddings.append(token_vectors.mean(axis=0).detach().cpu().numpy())

In [None]:
word_embeddings[0].shape
# next: a function taking model, tokenizer and sentence, and returning the word_embeddings list

(768,)

# Function for encoding a sentence

In [None]:
def encode_sentence(model, tokenizer, sentence, max_length=512):
  """Encode one sentence into one mean-pooled vector per layer.

  The sentence is tokenized, truncated to `max_length` tokens and run through
  `model`. For every hidden state except the first one (the embedding layer),
  the token vectors are averaged over the token axis.

  Note: no special tokens ([CLS]/[SEP]) are added; only the tokens returned by
  `tokenizer.tokenize` are fed to the model.

  Args:
    model: a torch module whose output exposes `.hidden_states` (e.g. a
      BertModel loaded with output_hidden_states=True).
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
    sentence: text to encode.
    max_length: maximum number of tokens kept (default 512).

  Returns:
    A list of 1-D NumPy arrays, one per hidden state after the first.

  Raises:
    ValueError: if the sentence produces no tokens.
  """
  tokenized = tokenizer.tokenize(sentence)[:max_length]
  if not tokenized:
    # An empty id list would otherwise fail inside the model with an obscure error.
    raise ValueError("cannot encode a sentence that produces no tokens: %r" % (sentence,))
  input_inds = tokenizer.convert_tokens_to_ids(tokenized)

  # Put the input on the device the model actually lives on, instead of relying
  # on a notebook-level `device` variable that may be missing or out of sync.
  model_device = next(model.parameters()).device
  tensor_sentence = torch.tensor([input_inds], dtype=torch.long, device=model_device)

  # Inference only: do not build the autograd graph.
  with torch.no_grad():
    output = model(tensor_sentence)

  # Skip the first hidden state (embedding layer).
  hidden_states = output.hidden_states[1:]

  document_embeddings = [
    layer_embeddings[0].mean(dim=0).detach().cpu().numpy()
    for layer_embeddings in hidden_states
  ]
  return document_embeddings

In [None]:
# Encode a few example sentences; each one becomes a matrix with one row per layer.
sentences = ["The first sentence", "the second sentence", "the third sentence"]
encoded_sentences = [
  np.vstack(encode_sentence(model, tokenizer, sentence))  # or np.array
  for sentence in sentences
]

In [None]:
# Combine the per-sentence matrices into one array: (sentences, layers, hidden size).
concat_embeddings = np.array(encoded_sentences)
concat_embeddings.shape

(3, 12, 768)

In [None]:
# Swap the first two axes so that the layer axis comes first: (layers, sentences, hidden size).
final_embeddings = np.swapaxes(concat_embeddings, 0,1)
final_embeddings.shape

(12, 3, 768)

In [None]:
# A function for many sentences
def encode_sentences(model, tokenizer, sentences):
  """Encode several sentences and group the resulting vectors by layer.

  Each sentence is passed to `encode_sentence`; its per-layer vectors are
  stacked into one matrix, and the matrices of all sentences are combined
  into a single array.

  Returns:
    An array whose first axis indexes the layers and whose second axis
    indexes the sentences (axes 0 and 1 of the per-sentence stack, swapped).
  """
  per_sentence = [
    np.vstack(encode_sentence(model, tokenizer, sentence))  # or np.array()
    for sentence in sentences
  ]
  stacked = np.array(per_sentence)
  return np.swapaxes(stacked, 0, 1)

In [None]:
# Mount Google Drive under /content/drive so the dataset file can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Encode the whole dataset

In [None]:
# Load the dataset from the mounted Drive, using the first CSV column as the index.
# The displayed frame has a 'text' column (the document) and a 'label' column.
df = pd.read_csv("/content/drive/MyDrive/classic3.csv", index_col=0)
df

Unnamed: 0,text,label
0,Milestones in Cataloging In the case of the pr...,cisi
1,childhood psychosis. a description is given of...,med
2,neonatal hepatitis or familial neonatal obstru...,med
3,Handbook of Comparative Librarianship The firs...,cisi
4,Design and Evaluation of Information Systems T...,cisi
...,...,...
3886,modification of autistic behavior with lsd-25....,med
3887,Patterns of Evaluation in Science: Institution...,cisi
3888,The government of the American Public Library ...,cisi
3889,base pressure at subsonic speeds in the presen...,cran


In [None]:
# Documents to encode, taken from the 'text' column as an array.
texts = df['text'].values

In [None]:
# Encode every document and report the device used and the wall-clock time in seconds.
start_time = time.time()
print(device)
matrices = encode_sentences(model, tokenizer, texts)
elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

cuda
169.19515752792358


In [None]:
# Axes: (layers, documents, hidden size), as returned by encode_sentences.
matrices.shape

(12, 3891, 768)