In [None]:
!pip install datasets



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Lire le fichier 'DATA.csv' en Pandas DataFrame et séparation des données pour entrainement et teste
try:
  cv_dataset = pd.read_csv('DATA.csv')
  X_train, X_test, y_train, y_test = train_test_split(cv_dataset[' cv'], cv_dataset['section'], test_size=0.1, random_state=42, shuffle=True)
except Exception as e:
  print(e)

# Conversion des données en liste pour future tokenization
X_train = X_train.to_list()
X_test = X_test.to_list()


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Tokenization des données et sauvegarde du tokenizer et récupération des BatchEncoding
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
tokenized_train = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt")
tokenized_test = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")
tokenizer.save_pretrained("tokenizer")

# Convertir les étiquettes cibles (Profile, Éducation etc..) en représentation numérique
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
label_encoder_2 = LabelEncoder()
y_test_encoded = label_encoder_2.fit_transform(y_test)

y_train_tokenized = [[label] * len(tokens) for label, tokens in zip(y_train_encoded, tokenized_train['input_ids'])]
y_test_tokenized = [[label] * len(tokens) for label, tokens in zip(y_test_encoded, tokenized_test['input_ids'])]

# Convertir les BatchEncoding du tokenizer en Dataset pour l'entrainement
train_dataset = Dataset.from_dict({
    'input_ids': tokenized_train['input_ids'],
    'attention_mask': tokenized_train['attention_mask'],
    'labels': y_train_tokenized
})

test_dataset = Dataset.from_dict({
    'input_ids': tokenized_test['input_ids'],
    'attention_mask': tokenized_test['attention_mask'],
    'labels': y_test_tokenized
})

In [None]:
from transformers import XLMRobertaForTokenClassification

# Téléchargement du model et calculation du nombre total des étiquette cibles
num_labels = len(label_encoder.classes_)
model = XLMRobertaForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels=num_labels)

N = 1
for layer in model.roberta.encoder.layer[:N]:  # Replace N with the number of layers you want to freeze
    for param in layer.parameters():
        param.requires_grad = False

for layer in model.roberta.encoder.layer[N:]:  # Replace N with the number of layers you want to unfreeze
    for param in layer.parameters():
        param.requires_grad = True

# Ensure the classifier is trainable
for param in model.classifier.parameters():
    param.requires_grad = True

# Configurer le model pour entrainement
model.train()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Les paramètres de fine tuning
finetuned_params = TrainingArguments(
    output_dir="output",
    learning_rate=3e-5,
    per_device_train_batch_size=6,
    num_train_epochs=10,
    logging_steps=10,
    weight_decay=0.01,
    do_predict=True,
)

# Configurer l'entraineur
trainer = Trainer (
    model=model,
    args=finetuned_params,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics
)

# Entrainement
trainer.train()

# Sauvegarde du model
model.save_pretrained("model")

Step,Training Loss
10,3.2106
20,2.9337
30,2.5216
40,2.4975
50,2.1061
60,1.7281
70,1.6984
80,1.4398
90,1.2548
100,1.3432


In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)  # Get predictions for each token
    labels = p.label_ids

    # Flatten the predictions and labels to compute metrics globally
    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    precision, recall, f1, _ = precision_recall_fscore_support(labels_flat, preds_flat, average='micro') # Changed 'binary' to 'micro'
    accuracy = accuracy_score(labels_flat, preds_flat)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

model_test = XLMRobertaForTokenClassification.from_pretrained("model")
trainer = Trainer (
    model=model_test,
    args=finetuned_params,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics
)
model.eval()
results = trainer.evaluate(eval_dataset=train_dataset, metric_key_prefix="eval")
print(results)


{'eval_loss': 0.056512847542762756, 'eval_accuracy': 0.9862629060690002, 'eval_precision': 0.9862629060690002, 'eval_recall': 0.9862629060690002, 'eval_f1': 0.9862629060690002, 'eval_runtime': 4.7517, 'eval_samples_per_second': 75.973, 'eval_steps_per_second': 9.681}


In [None]:
from transformers import pipeline
import re

samples = pd.read_csv('DATA.csv')
sample_1 = samples[' cv'][4]

ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')
output = ner(sample_1)

for entity in output:
  label_num = re.sub(r'LABEL_', '', entity['entity_group'])
  label = label_encoder.inverse_transform(np.array([int(label_num)]))[0] # Changed this line
  print(f"{entity['word']} ({label})")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Faculté des Sciences Ben M’Sick, Diplôme d'études universitaires générales (DEUG), 2019 - 2021 (Education)


In [1]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [None]:
import os
from pdf2image import convert_from_bytes
from PIL import Image
import traceback
import pytesseract
import cv2

def img_to_text(image):
	image = np.array(image)
	return pytesseract.image_to_string(image)

def convert_pdf_to_png(filename=None, content=None):
    try:
        images = convert_from_bytes(content, dpi=300, grayscale=True)
        return images
    except Exception as e:
        traceback.print_exc()
        print(e)
        return None


def pdf_to_text(pdf_file, filename):
    images = convert_pdf_to_png(filename=filename, content=pdf_file)
    pages = []

    for img in images:
        pages.append(img_to_text(img))

    text = " ".join(pages)

    # Remove special characters
    text = re.sub('(?<=\n)[e|),](?= )', '', text)
    text = re.sub('[*&«¢]', '', text)

    # Remove extra spaces and new lines
    text = re.sub('[ ]{3,}', ' ', text)
    return text


def load_cv_text(idx):
        cv_files = os.listdir('CVs')
        content = None

        with open('CVs/'+cv_files[idx], "rb") as cv_file:
                content = cv_file.read()

        return pdf_to_text(pdf_file=content, filename=cv_files[idx])


idx = int(input("Which CV would you like to use ? "))
text = load_cv_text(idx)
output = ner(text)
for entity in output:
  label_num = re.sub(r'LABEL_', '', entity['entity_group'])
  label = label_encoder.inverse_transform(np.array([int(label_num)]))[0] # Changed this line
  print(f"{entity['word']} ({label})")