In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertForTokenClassification , DistilBertTokenizerFast , BertForTokenClassification
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
# Load the cleaned training set. Each row carries a `sentence` and a list of
# `aspects` dicts (term, polarity and character offsets), as shown in the preview.
df = pd.read_json("/content/train_cleaned.json")
df.head()

Unnamed: 0,sentence,aspects
0,The decor is not special at all but their food...,"[{'term': 'decor', 'polarity': 'negative', 'fr..."
1,"when tables opened up, the manager sat another...","[{'term': 'tables', 'polarity': 'neutral', 'fr..."
2,Though the menu includes some unorthodox offer...,"[{'term': 'menu', 'polarity': 'neutral', 'from..."
3,"service is good although a bit in your face, w...","[{'term': 'service', 'polarity': 'positive', '..."
4,they didn't have to change anything about the ...,"[{'term': 'menu', 'polarity': 'neutral', 'from..."


In [None]:
import re

def generate_bio_by_word(sentence, aspects):
    """Return one BIO tag per whitespace-separated word of ``sentence``.

    Args:
        sentence: Raw sentence text.
        aspects: Iterable of dicts with integer ``"from"`` / ``"to"`` character
            offsets (``to`` exclusive) delimiting each aspect term. ``None`` is
            treated as "no aspects".

    Returns:
        A list of ``"O"``, ``"B-ASP"`` or ``"I-ASP"`` strings with exactly
        ``len(sentence.split())`` entries.

    Notes:
        * Spans that contain no alphanumeric character are ignored.
        * Word boundaries come from ``\\S+`` matches, so any whitespace (tabs,
          repeated spaces, non-breaking spaces) is handled like ``str.split()``.
        * A word takes the tag of its first tagged character, so an aspect that
          starts after leading punctuation (e.g. ``"pizza``) is still labelled.
        * An ``I-ASP`` tag is never allowed to open a span; it becomes ``B-ASP``.
    """
    alnum_pattern = re.compile(r"[A-Za-z0-9À-ÖØ-öø-ÿ]")
    n_chars = len(sentence)
    char_labels = ["O"] * n_chars

    for asp in aspects or ():
        start, end = asp["from"], asp["to"]
        # Skip spans made only of punctuation / whitespace.
        if not alnum_pattern.search(sentence[start:end]):
            continue
        # Reject offsets outside the sentence (a negative start would otherwise
        # silently index from the end of the string).
        if 0 <= start < n_chars:
            char_labels[start] = "B-ASP"
            for i in range(start + 1, min(end, n_chars)):
                char_labels[i] = "I-ASP"

    bio_labels = []
    for match in re.finditer(r"\S+", sentence):
        # The first tagged character inside the word decides the word's tag.
        label = "O"
        for pos in range(match.start(), match.end()):
            if char_labels[pos] != "O":
                label = char_labels[pos]
                break
        # Keep the sequence valid BIO: a span cannot begin with I-ASP.
        if label == "I-ASP" and (not bio_labels or bio_labels[-1] == "O"):
            label = "B-ASP"
        bio_labels.append(label)

    return bio_labels


In [None]:
# Compute the word-level BIO tags of every row and store them in a new `bio_labels` column.
df["bio_labels"] = df.apply(lambda row: generate_bio_by_word(row["sentence"], row["aspects"]), axis=1)

In [None]:
# Visual check: sentences next to the tags that were just generated.
df[[ "sentence", "bio_labels"]]

Unnamed: 0,sentence,bio_labels
0,The decor is not special at all but their food...,"[O, B-ASP, O, O, O, O, O, O, O, B-ASP, O, O, B..."
1,"when tables opened up, the manager sat another...","[O, B-ASP, O, O, O, B-ASP, O, O, O, O, O]"
2,Though the menu includes some unorthodox offer...,"[O, O, B-ASP, O, O, O, O, O, B-ASP, I-ASP, I-A..."
3,"service is good although a bit in your face, w...","[B-ASP, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
4,they didn't have to change anything about the ...,"[O, O, O, O, O, O, O, O, B-ASP, O, O, O, O, O,..."
...,...,...
8288,The design of the space is good .,"[O, O, O, O, B-ASP, O, O, O]"
8289,"I was there for brunch recently , and we were ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-A..."
8290,The waiter delivered our food while holding wh...,"[O, B-ASP, O, O, O, O, O, O, O, O, O, O, O, O,..."
8291,The waitress came to check in on us every few ...,"[O, B-ASP, O, O, O, O, O, O, O, O, O, O, O, O,..."


In [None]:
import torch
import re
from transformers import BertTokenizerFast

def align_bio_with_bert_tokens(
    sentence: str,
    word_bio_labels: list,
    tokenizer: "BertTokenizerFast",
    max_len: int = 128
):
    """Project word-level BIO tags onto the tokenizer's sub-tokens.

    Args:
        sentence: Raw sentence text.
        word_bio_labels: One tag (``"O"``, ``"B-ASP"`` or ``"I-ASP"``) per
            whitespace-separated word of ``sentence``.
        tokenizer: Fast tokenizer; it is called with
            ``return_offsets_mapping=True`` and ``return_tensors="pt"``.
        max_len: Length every sequence is padded / truncated to.

    Returns:
        Dict with 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.
        ``labels`` holds 0/1/2 for O/B-ASP/I-ASP and -100 for positions that must
        be ignored by the loss: special tokens, padding, and sub-tokens without
        any alphanumeric character. Every labelled sub-token of a word receives
        that word's tag.

    Raises:
        ValueError: If the number of words differs from the number of tags.
        KeyError: If a tag is not one of the three known labels.
    """
    label_map = {"O": 0, "B-ASP": 1, "I-ASP": 2}

    # Matches any alphanumeric character (including Latin-1 accented letters).
    alnum_pattern = re.compile(r"[A-Za-z0-9À-ÖØ-öø-ÿ]")

    # Real character span of every word. Using the actual offsets (instead of
    # assuming exactly one space between words) keeps the alignment correct
    # with leading, repeated or non-space whitespace.
    word_spans = [(m.start(), m.end()) for m in re.finditer(r"\S+", sentence)]
    if len(word_spans) != len(word_bio_labels):
        raise ValueError(f"Mismatch: {len(word_spans)} mots vs {len(word_bio_labels)} labels.")

    # Tokenise with offsets to keep the character -> sub-token alignment.
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_attention_mask=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].squeeze(0)
    attention_mask = encoding["attention_mask"].squeeze(0)
    offsets = encoding["offset_mapping"].squeeze(0)

    # -100 is the value ignored when the loss is computed.
    labels = [-100] * max_len

    word_id = 0
    n_words = len(word_spans)

    for token_idx, (start, end) in enumerate(offsets.tolist()):
        # Empty span: [CLS], [SEP] or [PAD].
        if start == end:
            continue

        # Pure punctuation sub-token: leave it at -100.
        if not alnum_pattern.search(sentence[start:end]):
            continue

        # Advance to the first word that has not ended before this sub-token.
        while word_id < n_words and word_spans[word_id][1] <= start:
            word_id += 1
        if word_id == n_words:
            break  # no word left, the remaining positions stay ignored

        word_start, word_end = word_spans[word_id]
        # Label only sub-tokens fully contained in the word. Anything else is
        # skipped without consuming the word cursor, so later tokens still align.
        if word_start <= start and end <= word_end:
            labels[token_idx] = label_map[word_bio_labels[word_id]]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": torch.tensor(labels, dtype=torch.long)
    }

In [None]:
# Tokenizer shared by the preprocessing step and, later, by the trainer.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def process_row(row):
    """Encode one dataframe row into input_ids / attention_mask / labels (length 128)."""
    text, word_tags = row["sentence"], row["bio_labels"]
    return align_bio_with_bert_tokens(text, word_tags, tokenizer, 128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Encode every row; the result is a Series holding one dict of tensors per sentence.
processed = df.apply(process_row, axis=1)

In [None]:
# Inspect the encoded rows.
processed

Unnamed: 0,0
0,"{'input_ids': [tensor(101), tensor(1996), tens..."
1,"{'input_ids': [tensor(101), tensor(2043), tens..."
2,"{'input_ids': [tensor(101), tensor(2295), tens..."
3,"{'input_ids': [tensor(101), tensor(2326), tens..."
4,"{'input_ids': [tensor(101), tensor(2027), tens..."
...,...
8288,"{'input_ids': [tensor(101), tensor(1996), tens..."
8289,"{'input_ids': [tensor(101), tensor(1045), tens..."
8290,"{'input_ids': [tensor(101), tensor(1996), tens..."
8291,"{'input_ids': [tensor(101), tensor(1996), tens..."


In [None]:
# Split the per-row dicts into three parallel lists (one tensor per sentence).
input_ids = [item['input_ids'] for item in processed]
attention_masks = [item['attention_mask'] for item in processed]
labels = [item['labels'] for item in processed]

In [None]:
# Wrap the three lists in a Hugging Face Dataset, using the column names
# `input_ids`, `attention_mask` and `labels`.
hf_dataset = Dataset.from_dict({
    "input_ids": input_ids,
    "attention_mask": attention_masks,
    "labels": labels
})

In [None]:
# Show the dataset's features and row count.
hf_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8293
})

In [None]:
# Split into training and validation sets (90% / 10%).
# The seed is fixed so the same rows land in the validation split on every run;
# without it the reported metrics are not comparable between runs.
dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
# bert-base-uncased with a 3-class token-classification head.
# The id <-> tag mappings use the same ids as `label_map` in
# `align_bio_with_bert_tokens` (O=0, B-ASP=1, I-ASP=2).
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "O", 1: "B-ASP", 2: "I-ASP"},
    label2id={"O": 0, "B-ASP": 1, "I-ASP": 2}
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator built from the same tokenizer; it is handed to the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Label mapping dictionaries (id -> tag, and the inverse tag -> id).
id2label = {0: "O", 1: "B-ASP", 2: "I-ASP"}
label2id = {label: idx for idx, label in id2label.items()}

def compute_metrics(eval_pred):
    """Token-level accuracy and weighted precision / recall / F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Positions whose label is -100 are excluded from every metric.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Flatten all sequences, keeping only the positions that carry a real label.
    kept_pairs = [
        (gold, guess)
        for guess_seq, gold_seq in zip(predictions, labels)
        for guess, gold in zip(guess_seq, gold_seq)
        if gold != -100
    ]
    true_labels = [gold for gold, _ in kept_pairs]
    true_predictions = [guess for _, guess in kept_pairs]

    accuracy = accuracy_score(true_labels, true_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average='weighted', zero_division=0
    )

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)
# Training configuration: evaluate and checkpoint once per epoch, no external
# experiment tracker (report_to=[]).
training_args = TrainingArguments(
    output_dir="/content/results_final",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="/content/logs2",
    report_to=[],
)

# Build the Hugging Face Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument, which emitted
# a FutureWarning when the Trainer was created.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# Fine-tune the model with the configuration held by `trainer`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.153438,0.942608,0.946873,0.942608,0.944014
2,0.181700,0.139788,0.947419,0.950518,0.947419,0.948554
3,0.116700,0.142096,0.947815,0.949835,0.947815,0.948564


TrainOutput(global_step=1401, training_loss=0.13336899108668893, metrics={'train_runtime': 548.4877, 'train_samples_per_second': 40.82, 'train_steps_per_second': 2.554, 'total_flos': 1462556494404864.0, 'train_loss': 0.13336899108668893, 'epoch': 3.0})

In [None]:
# Evaluate on the dataset given to the Trainer as `eval_dataset` and print the metric dict.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.14209556579589844, 'eval_accuracy': 0.9478152592257189, 'eval_precision': 0.9498346881157217, 'eval_recall': 0.9478152592257189, 'eval_f1': 0.9485639142563341, 'eval_runtime': 6.0365, 'eval_samples_per_second': 137.496, 'eval_steps_per_second': 8.614, 'epoch': 3.0}


In [None]:
# Colab only: mount Google Drive so the model can be saved under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Destination folder inside Google Drive
save_path = "/content/drive/MyDrive/absa_model/aspect_extractor/"

# Save the model
model.save_pretrained(save_path)

# Save the tokenizer alongside it
tokenizer.save_pretrained(save_path)


('/content/drive/MyDrive/absa_model/aspect_extractor/tokenizer_config.json',
 '/content/drive/MyDrive/absa_model/aspect_extractor/special_tokens_map.json',
 '/content/drive/MyDrive/absa_model/aspect_extractor/vocab.txt',
 '/content/drive/MyDrive/absa_model/aspect_extractor/added_tokens.json',
 '/content/drive/MyDrive/absa_model/aspect_extractor/tokenizer.json')

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Reload the model and tokenizer from the Drive folder they were saved to.
# NOTE: this rebinds the `model` and `tokenizer` names used during training.
model_path = "/content/drive/MyDrive/absa_model/aspect_extractor/"

model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
from transformers import TokenClassificationPipeline
import torch

# Token-level inference pipeline (no built-in aggregation); GPU when available.
pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy=None,
    device=0 if torch.cuda.is_available() else -1
)


def _merge_aspect_tokens(token_predictions):
    """Group per-token predictions into ``(aspect_text, mean_score)`` tuples.

    Args:
        token_predictions: Iterable of dicts with the keys ``word``, ``entity``
            and ``score``, and optionally ``index`` (position of the token in
            the encoded sequence).

    Returns:
        List of ``(text, average score)`` tuples in order of appearance.

    A token extends the open aspect only when it directly follows the previous
    aspect token. The pipeline drops tokens labelled ``O`` by default
    (``ignore_labels``), so without this check an ``I-ASP`` token far away from
    the previous aspect would be glued onto it.
    """
    merged = []
    text, scores, last_index = "", [], None

    for pred in token_predictions:
        token, label, score = pred["word"], pred["entity"], pred["score"]
        index = pred.get("index")
        # When no index is available, fall back to treating tokens as adjacent.
        adjacent = bool(text) and (
            index is None or last_index is None or index == last_index + 1
        )

        if token.startswith("##") and adjacent:
            # Word-piece continuation of the aspect word: glue without a space.
            text += token[2:]
            scores.append(score)
            last_index = index
        elif not token.startswith("##") and label == "I-ASP" and adjacent:
            # Next word of a multi-word aspect.
            text += " " + token
            scores.append(score)
            last_index = index
        else:
            # Anything else closes the aspect that was open, if any.
            if text:
                merged.append((text, sum(scores) / len(scores)))
            text, scores, last_index = "", [], None
            # B-ASP, or an I-ASP with nothing to attach to, opens a new aspect.
            # Orphan "##" pieces and O tokens are discarded.
            if not token.startswith("##") and label in ("B-ASP", "I-ASP"):
                text, scores, last_index = token, [score], index

    # Flush the last aspect, if any.
    if text:
        merged.append((text, sum(scores) / len(scores)))
    return merged


# Test sentence
sentence = "the chiken tacos was good , but the service was bad"
predictions = pipeline(sentence)

# Rebuild the aspects together with their average score
aspects = _merge_aspect_tokens(predictions)

# Final display
print("Aspects extraits :")
for asp, sc in aspects:
    print(f"- {asp} (score : {sc:.2f})")


Device set to use cpu


Aspects extraits :
- chiken tacos (score : 0.99)
- service (score : 0.99)
