# Fine-tuning du modèle T5-small pour générer les aspects et les opinions à partir de texte

In [None]:
!pip install datasets transformers SentencePiece accelerate rouge

imports

In [None]:
from datasets import load_dataset , Dataset
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    DataCollatorForSeq2Seq
    )
import json
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize, pos_tag
from sklearn.model_selection import train_test_split
import ast
import numpy as np
from rouge import Rouge
import re
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
import ast

# NLTK Downloading

In [None]:
# Download the NLTK resources requested by this notebook. Each resource is
# fetched exactly once (the original cell asked for 'punkt' twice).
for nltk_resource in ('punkt', 'vader_lexicon', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# load the datasets

In [None]:
# Load the extracted ABSA dataset and split it into train / validation / test.
df = pd.read_csv('/content/drive/MyDrive/Master 2/absa_df_extracted.csv')

total_size = len(df)
# Round the size down to a multiple of 1000; the train and validation sizes
# are derived from this rounded figure.
adjusted_total_size = total_size - (total_size % 1000)

train_ratio = 0.85
val_test_ratio = 0.15
val_ratio_of_val_test = 0.5

train_size = int(adjusted_total_size * train_ratio)
val_test_size = adjusted_total_size - train_size
val_size = int(val_test_size * val_ratio_of_val_test)
# Nominal test size only: the test split below keeps every leftover row.
test_size = val_test_size - val_size

# Sample and drop using the ORIGINAL index labels, and reset the index only
# afterwards. Dropping `train_df.index` after `reset_index(drop=True)` removes
# rows 0..n-1 of the source frame instead of the sampled rows, which leaks
# training examples into the validation and test splits.
train_rows = df.sample(n=train_size, random_state=42)
remaining_rows = df.drop(train_rows.index)
val_rows = remaining_rows.sample(n=val_size, random_state=42)
test_rows = remaining_rows.drop(val_rows.index)

train_df = train_rows.reset_index(drop=True)
remaining_df = remaining_rows.reset_index(drop=True)
val_df = val_rows.reset_index(drop=True)
test_df = test_rows.reset_index(drop=True)

print(f"Taille totale ajustée: {adjusted_total_size}")
print(f"Taille de l'ensemble d'entraînement: {len(train_df)}")
print(f"Taille de l'ensemble de validation: {len(val_df)}")
print(f"Taille de l'ensemble de test: {len(test_df)}")

Taille totale ajustée: 28000
Taille de l'ensemble d'entraînement: 23800
Taille de l'ensemble de validation: 2100
Taille de l'ensemble de test: 2885


### Sauvegarde pour chargement ultérieur

In [None]:
# Write each split to its own CSV file on Drive (row index not saved).
for split_frame, csv_path in (
    (train_df, '/content/drive/MyDrive/Master 2/absa_train_df.csv'),
    (val_df, '/content/drive/MyDrive/Master 2/absa_val_df.csv'),
    (test_df, '/content/drive/MyDrive/Master 2/absa_test_df.csv'),
):
    split_frame.to_csv(csv_path, index=False)

### Chargement

In [None]:
# Reload the train / validation / test splits from their CSV files.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Master 2/absa_train_df.csv',
        '/content/drive/MyDrive/Master 2/absa_val_df.csv',
        '/content/drive/MyDrive/Master 2/absa_test_df.csv',
    )
)

In [None]:
# Report the number of rows in each reloaded split.
for message, split_frame in (
    ("Taille de l'ensemble d'entraînement", train_df),
    ("Taille de l'ensemble de validation", val_df),
    ("Taille de l'ensemble de test", test_df),
):
    print(f"{message}: {len(split_frame)}")

Taille de l'ensemble d'entraînement: 23800
Taille de l'ensemble de validation: 2100
Taille de l'ensemble de test: 2885


# Définition des fonctions de formatage des données

In [None]:
def clean_elements(elements):
    """Strip punctuation and surrounding whitespace from every string.

    Every character that is neither a word character nor whitespace is
    removed, then the result is trimmed.

    Args:
        elements: Iterable of strings to normalise.

    Returns:
        A new list holding one cleaned string per input element, in order.
    """
    return [re.sub(r'[^\w\s]', '', element).strip() for element in elements]
def format_data(text, main_aspect, extracted_aspects, global_polarity, extracted_opinions):
    """Build the (input, target) text pair for one review.

    Args:
        text: Raw review text.
        main_aspect: Main aspect of the review; may be NaN.
        extracted_aspects: String containing a Python list literal of aspects.
        global_polarity: Polarity label written verbatim into the target.
        extracted_opinions: String containing a Python list literal of
            opinions.

    Returns:
        Tuple ``(input_text, target_text)``. The target ends with the literal
        ``"  </s>"`` suffix, which other cells strip with ``[:-5]``.
    """
    def _unique(items):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # target is identical from one interpreter run to the next (the order
        # of list(set(...)) depends on the string hash seed). Items that are
        # empty after cleaning are dropped so they cannot produce ", ," gaps.
        return [item for item in dict.fromkeys(items) if item]

    extracted_aspects_list = clean_elements(ast.literal_eval(extracted_aspects))
    extracted_opinions_list = clean_elements(ast.literal_eval(extracted_opinions))

    if not pd.isna(main_aspect):
        # Clean the main aspect exactly like the other aspects; duplicates
        # are removed by _unique below.
        extracted_aspects_list.extend(clean_elements([main_aspect]))

    extracted_aspects_list = _unique(extracted_aspects_list)
    extracted_opinions_list = _unique(extracted_opinions_list)
    if not extracted_opinions_list:
        extracted_opinions_list = ['neutral']

    extracted_aspects_str = ', '.join(extracted_aspects_list)
    extracted_opinions_str = ', '.join(extracted_opinions_list)
    input_text = f"review: {text}"
    target_text = f"main aspect: {main_aspect} extracted aspects: {extracted_aspects_str} extracted opinions: {extracted_opinions_str} global polarity: {global_polarity}  </s>"

    return input_text, target_text

def prepare_data(df, tokenizer, max_length=150):
    """Tokenise every row of *df* into a list of seq2seq training features.

    Args:
        df: DataFrame with the columns 'text', 'aspect', 'extracted_aspects',
            'aspect_polarity' and 'extracted_opinions'.
        tokenizer: Callable tokenizer returning an encoding with
            ``input_ids`` and ``attention_mask`` tensors and exposing
            ``pad_token_id``.
        max_length: Length every input and target is padded/truncated to.

    Returns:
        List of dicts with the keys 'input_ids', 'attention_mask' and
        'labels', each a 1-D tensor of length *max_length*.
    """
    dataset = []
    for _, row in df.iterrows():
        input_text, target_text = format_data(row['text'], row['aspect'], row['extracted_aspects'], row['aspect_polarity'], row["extracted_opinions"])
        input_encoding = tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        target_encoding = tokenizer(
            target_text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        labels = target_encoding.input_ids.squeeze(0)
        # Padding positions must not contribute to the loss: -100 is the
        # index ignored by the cross-entropy loss. Leaving the pad id in
        # place trains (and scores) the model on predicting padding.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        dataset.append({
            "input_ids": input_encoding.input_ids.squeeze(0),
            "attention_mask": input_encoding.attention_mask.squeeze(0),
            "labels": labels
        })

    return dataset

In [None]:
# Print the formatted target of the first training row as a quick visual check.
for _, row in train_df.head(1).iterrows():
    input_text, target_text = format_data(
        row['text'],
        row['aspect'],
        row['extracted_aspects'],
        row['aspect_polarity'],
        row['extracted_opinions'],
    )
    print(target_text)

main aspect: games extracted aspects: music, facebook, pictures, games, email, daughter extracted opinions: neutral global polarity: neutral  </s>


# Chargement du modèle T5-small pré-entraîné

In [None]:
# Load the pre-trained 't5-small' tokenizer, then the matching seq2seq model.
tokenizer, model = (
    loader.from_pretrained('t5-small')
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Vérification de la longueur en tokens

In [None]:
# Measure the longest tokenised input and target of the training split
# (presumably to confirm they fit the max_length used when encoding).
input_lengths = []
target_lengths = []
for _, row in train_df.iterrows():
    input_text, target_text = format_data(
        row['text'],
        row['aspect'],
        row['extracted_aspects'],
        row['aspect_polarity'],
        row['extracted_opinions'],
    )
    input_lengths.append(len(tokenizer.tokenize(input_text)))
    target_lengths.append(len(tokenizer.tokenize(target_text)))

max_len_input = max(input_lengths, default=0)
max_len_target = max(target_lengths, default=0)

print(f"Nombre maximal de tokens pour input_text: {max_len_input}")
print(f"Nombre maximal de tokens pour target_text: {max_len_target}")

Nombre maximal de tokens pour input_text: 139
Nombre maximal de tokens pour target_text: 103


# Lancement du fine-tuning du modèle

In [None]:
# Tokenise the training and validation splits.
train_dataset = prepare_data(train_df, tokenizer)
val_dataset = prepare_data(val_df, tokenizer)

# Trainer hyper-parameters; evaluation runs at the end of every epoch.
training_config = {
    "output_dir": './results',
    "num_train_epochs": 40,
    "per_device_train_batch_size": 54,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "evaluation_strategy": "epoch",
}
training_args = TrainingArguments(**training_config)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer.train()

# Store the fine-tuned weights and the tokenizer files in the same directory.
save_dir = '/content/drive/MyDrive/Master 2/absa_t5_model_150L__'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



Epoch,Training Loss,Validation Loss
1,No log,0.394571
2,4.206300,0.296317
3,0.305300,0.257554
4,0.259100,0.232443
5,0.236700,0.212427
6,0.220800,0.194648
7,0.204900,0.180328
8,0.195000,0.168681
9,0.195000,0.15694
10,0.186000,0.147863


('/content/drive/MyDrive/Master 2/absa_t5_model_150L_/tokenizer_config.json',
 '/content/drive/MyDrive/Master 2/absa_t5_model_150L_/special_tokens_map.json',
 '/content/drive/MyDrive/Master 2/absa_t5_model_150L_/spiece.model',
 '/content/drive/MyDrive/Master 2/absa_t5_model_150L_/added_tokens.json')

# Chargement du modèle fine-tuné

In [None]:
# Reload a fine-tuned checkpoint: tokenizer first, then the model.
# NOTE(review): this directory name differs from the one the training cell
# saves to ('absa_t5_model_150L__') — confirm which checkpoint is intended.
model_path = '/content/drive/MyDrive/Master 2/my_t5_model_150L_'
tokenizer, model = (
    loader.from_pretrained(model_path)
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Génération des prédictions pour le calcul des performances

In [None]:
# Pick the compute device and move the model there before generating.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

test_dataset = prepare_data(test_df, tokenizer)


def test_model(model, tokenizer, test_dataset, max_length=150):
    """Generate one decoded prediction per example of *test_dataset*.

    Args:
        model: Seq2seq model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer used to decode the generated ids.
        test_dataset: Iterable of dicts with 'input_ids' and
            'attention_mask' tensors (1-D or already batched).
        max_length: Maximum length passed to ``generate``.

    Returns:
        List of decoded prediction strings, in dataset order.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level `device`
    # global, which may be undefined or stale when this function is called.
    target_device = next(model.parameters()).device
    predictions = []
    for i, batch in enumerate(test_dataset):
        if i % 700 == 0:
            print("itteration n° ", i)
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        # generate() expects a batch dimension.
        if input_ids.ndim == 1:
            input_ids = input_ids.unsqueeze(0)
        if attention_mask.ndim == 1:
            attention_mask = attention_mask.unsqueeze(0)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return predictions


test_predictions = test_model(model, tokenizer, test_dataset, max_length=150)



itteration n°  0
itteration n°  700
itteration n°  1400
itteration n°  2100
itteration n°  2800


# Sauvegarde des prédictions pour utilisation ultérieure

In [None]:
# CSV file used to persist and later reload the generated test predictions.
file_path = '/content/drive/MyDrive/Master 2/test_predictions.csv'

In [None]:
# Store the predictions in a single-column CSV (row index not written).
df_predictions = pd.DataFrame({'predictions': test_predictions})
df_predictions.to_csv(file_path, index=False)

In [None]:
# Reload the saved predictions as plain strings. With pandas' default NA
# handling an empty prediction comes back as NaN (a float), which breaks the
# later `.split()` calls; keep_default_na=False keeps it as ''.
df_loaded = pd.read_csv(file_path, dtype=str, keep_default_na=False)
test_predictions = df_loaded['predictions'].tolist()

In [None]:
# Show the first prediction as a quick visual check.
print((test_predictions[0]))

main aspect: hotel extracted aspects: importance, beauty, staff, area, nights, kindness, site, management, night extracted opinions: importance, beauty, kindness, suffering, bad global polarity: negative


# Association des valeurs générées et des valeurs du dataset

In [None]:
# Pair every generated prediction with the ground-truth target rebuilt from
# the corresponding test row.
if len(test_predictions) != len(test_df):
    # zip() would silently truncate and misalign predictions and rows.
    raise ValueError(
        f"{len(test_predictions)} predictions for {len(test_df)} test rows"
    )

truths = []
predictions = []
for i, (prediction, (_, row)) in enumerate(zip(test_predictions, test_df.iterrows()), start=1):
    _, target_text = format_data(row['text'], row['aspect'], row['extracted_aspects'], row['aspect_polarity'], row['extracted_opinions'])
    # Drop the trailing " </s>" (5 characters) appended by format_data.
    truth = target_text[:-5]
    truths.append(truth)
    predictions.append(prediction)
    print(f"Test Example {i}:")
    print(f"Prediction   : {prediction}")
    print(f"Ground truth : {truth}")
    print("-" * 50)


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Test Example 1636:
Prediction   : main aspect: hotel extracted aspects: business, hall, hotel extracted opinions: fantastic global polarity: positive
Ground truth : main aspect: transportation extracted aspects: business, transportation, hall, hotel extracted opinions: fantastic global polarity: neutral 
--------------------------------------------------
Test Example 1637:
Prediction   : main aspect: rooms extracted aspects: level, lot, rooms, food, bugs extracted opinions: desired, good global polarity: negative
Ground truth : main aspect: rooms extracted aspects: level, lot, rooms, food, bugs extracted opinions: desired, good global polarity: negative 
--------------------------------------------------
Test Example 1638:
Prediction   : main aspect: hotel extracted aspects: entrance, navigator, thing, taxi, hotel extracted opinions: bad global polarity: negative
Ground truth : main aspect: hote

In [None]:
# Both counts should match: one ground truth per prediction.
print(len(predictions))
print(len(truths))

2885
2885


# Calcul des métriques

In [None]:
def extract_component_indices(text):
    """Locate the components of a formatted target/prediction string.

    The text is split on whitespace and the marker words 'aspect:',
    'aspects:', 'opinions:' and 'polarity:' are looked up.

    Args:
        text: String shaped like "main aspect: X extracted aspects: A, B
            extracted opinions: O global polarity: P".

    Returns:
        Tuple ``(main_aspect, global_polarity, aspects_start, aspects_end,
        opinions_start, opinions_end)`` of word indices. The ``*_end`` values
        are exclusive slice bounds. A value of -1 means the marker is missing.
    """
    words = text.split()

    def _position(marker, offset):
        # -1 flags a missing marker.
        return words.index(marker) + offset if marker in words else -1

    main_aspect_index = _position('aspect:', 1)
    global_polarity_index = _position('polarity:', 1)
    aspects_start_index = _position('aspects:', 1)
    # The aspect list ends right before the word "extracted" that precedes
    # "opinions:". As an exclusive bound that is index('opinions:') - 1; the
    # former "- 2" dropped the last aspect of every list.
    aspects_end_index = _position('opinions:', -1)
    opinions_start_index = _position('opinions:', 1)
    # global_polarity_index points one past "polarity:", so "- 2" is the
    # index of "global", the exclusive end of the opinion list.
    opinions_end_index = global_polarity_index - 2 if global_polarity_index != -1 else -1

    return main_aspect_index, global_polarity_index, aspects_start_index, aspects_end_index, opinions_start_index, opinions_end_index

# Positions of each value inside the tuple returned by
# extract_component_indices.
(
    MAIN_ASPECT_IDX,
    GLOBAL_POLARITY_IDX,
    ASPECTS_START_IDX,
    ASPECTS_END_IDX,
    OPINIONS_START_IDX,
    OPINIONS_END_IDX,
) = range(6)

In [None]:
def calculate_metrics(preds, truths):
    """Score predicted element collections against the expected ones.

    Each prediction/truth pair is compared as sets. True positives, false
    positives and false negatives are accumulated over all pairs, and a
    Jaccard score is computed per pair.

    Args:
        preds: Sequence of iterables of predicted elements.
        truths: Sequence of iterables of expected elements.

    Returns:
        Tuple ``(precision, accuracy, recall, f1_score, mean_jaccard)`` where
        accuracy is ``tp / (tp + fp + fn)``. Any ratio with a zero
        denominator is reported as 0.
    """
    def _ratio(numerator, denominator):
        return numerator / denominator if denominator != 0 else 0

    true_pos = false_pos = false_neg = 0
    jaccard_scores = []
    for pred_elements, truth_elements in zip(preds, truths):
        predicted, expected = set(pred_elements), set(truth_elements)
        shared = predicted & expected
        combined = predicted | expected
        true_pos += len(shared)
        false_pos += len(predicted - expected)
        false_neg += len(expected - predicted)
        jaccard_scores.append(_ratio(len(shared), len(combined)))

    precision = _ratio(true_pos, true_pos + false_pos)
    accuracy = _ratio(true_pos, true_pos + false_pos + false_neg)
    recall = _ratio(true_pos, true_pos + false_neg)
    f1_score = _ratio(2 * (precision * recall), precision + recall)
    mean_jaccard = _ratio(sum(jaccard_scores), len(jaccard_scores))
    return precision, accuracy, recall, f1_score, mean_jaccard

def calculate_metrics_for_single_element(preds, truths):
    """Score single-valued predictions against their expected values.

    A pair counts as a true positive when both values are equal and the
    prediction is truthy, and as a true negative when they are equal and the
    prediction is falsy. A mismatch counts as a false positive when the
    prediction is truthy and as a false negative otherwise.

    Args:
        preds: Sequence of predicted values.
        truths: Sequence of expected values.

    Returns:
        Tuple ``(precision, accuracy, recall, f1_score)``. Any ratio with a
        zero denominator is reported as 0.
    """
    def _ratio(numerator, denominator):
        return numerator / denominator if denominator != 0 else 0

    tally = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    for pred, truth in zip(preds, truths):
        if pred == truth:
            tally["tp" if pred else "tn"] += 1
        else:
            tally["fp" if pred else "fn"] += 1

    tp, tn, fp, fn = tally["tp"], tally["tn"], tally["fp"], tally["fn"]
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return precision, accuracy, recall, f1_score


def separate_components(preds, truths):
    """Split formatted prediction/truth strings into their components.

    Aspect and opinion lists are returned as comma-separated items with the
    commas and surrounding whitespace removed, so the same item compares
    equal wherever it sits in the list and multi-word items stay whole.

    Args:
        preds: Sequence of predicted strings.
        truths: Sequence of ground-truth strings, aligned with *preds*.

    Returns:
        Eight lists, in this order: predicted aspects, predicted opinions,
        predicted main aspects, predicted global polarities, then the same
        four for the ground truths. Main aspect and polarity are single
        words, or '' when they cannot be located.
    """
    def _items(words, start, end):
        # A missing marker (-1) yields an empty list.
        if start == -1 or end == -1:
            return []
        pieces = ' '.join(words[start:end]).split(',')
        return [piece.strip() for piece in pieces if piece.strip()]

    def _word(words, index):
        # Guard against a missing marker (-1) or a text truncated right
        # after the marker (index == len(words)).
        return words[index] if 0 <= index < len(words) else ''

    def _components(text):
        words = text.split()
        idx = extract_component_indices(text)
        return (
            _items(words, idx[ASPECTS_START_IDX], idx[ASPECTS_END_IDX]),
            _items(words, idx[OPINIONS_START_IDX], idx[OPINIONS_END_IDX]),
            _word(words, idx[MAIN_ASPECT_IDX]),
            _word(words, idx[GLOBAL_POLARITY_IDX]),
        )

    preds_aspects, preds_opinions, preds_main_aspects, preds_global_polarities = [], [], [], []
    truths_aspects, truths_opinions, truths_main_aspects, truths_global_polarities = [], [], [], []
    for pred, truth in zip(preds, truths):
        aspects, opinions, main_aspect, polarity = _components(pred)
        preds_aspects.append(aspects)
        preds_opinions.append(opinions)
        preds_main_aspects.append(main_aspect)
        preds_global_polarities.append(polarity)

        aspects, opinions, main_aspect, polarity = _components(truth)
        truths_aspects.append(aspects)
        truths_opinions.append(opinions)
        truths_main_aspects.append(main_aspect)
        truths_global_polarities.append(polarity)
    return preds_aspects, preds_opinions, preds_main_aspects, preds_global_polarities, truths_aspects, truths_opinions, truths_main_aspects, truths_global_polarities


# Split predictions and ground truths into their four components.
(
    preds_aspects,
    preds_opinions,
    preds_main_aspects,
    preds_global_polarities,
    truths_aspects,
    truths_opinions,
    truths_main_aspects,
    truths_global_polarities,
) = separate_components(predictions, truths)

# Set-based metrics for the list-valued components.
precision_aspects, accuracy_aspects, recall_aspects, f1_aspects, jaccard_aspects = calculate_metrics(preds_aspects, truths_aspects)
precision_opinions, accuracy_opinions, recall_opinions, f1_opinions, jaccard_opinions = calculate_metrics(preds_opinions, truths_opinions)

# Exact-match metrics for the single-valued components.
precision_main_aspect, accuracy_main_aspect, recall_main_aspect, f1_main_aspect = calculate_metrics_for_single_element(preds_main_aspects, truths_main_aspects)
precision_global_polarity, accuracy_global_polarity, recall_global_polarity, f1_global_polarity = calculate_metrics_for_single_element(preds_global_polarities, truths_global_polarities)

metrics_report = (
    ("Aspects", (
        ("Precision", precision_aspects),
        ("Accuracy", accuracy_aspects),
        ("Recall", recall_aspects),
        ("F1", f1_aspects),
        ("jaccard", jaccard_aspects),
    )),
    ("Opinions", (
        ("Precision", precision_opinions),
        ("Accuracy", accuracy_opinions),
        ("Recall", recall_opinions),
        ("F1", f1_opinions),
        ("jaccard", jaccard_opinions),
    )),
    ("Main Aspect", (
        ("Precision", precision_main_aspect),
        ("Accuracy", accuracy_main_aspect),
        ("Recall", recall_main_aspect),
        ("F1", f1_main_aspect),
    )),
    ("Global Polarity", (
        ("Precision", precision_global_polarity),
        ("Accuracy", accuracy_global_polarity),
        ("Recall", recall_global_polarity),
        ("F1", f1_global_polarity),
    )),
)

# One section per component, each followed by a separator line.
print("-"*50)
for component_name, component_metrics in metrics_report:
    for metric_name, metric_value in component_metrics:
        print(f"{metric_name} {component_name}:", metric_value)
    print("-"*50)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
facilities  --  facilities
service  --  service
hotel  --  treatment
service  --  facilities
room  --  room
hotel  --  location
accommodation  --  room
hotel  --  bathroom
hotel  --  meals
service  --  service
room  --  service
hotel  --  staff
food  --  meals
service  --  hotel
rooms  --  rooms
location  --  cleanliness
hotel  --  wifi
food  --  service
location  --  location
location  --  parking
food  --  prices
location  --  location
hotel  --  views
facilities  --  bathroom
rooms  --  rooms
rooms  --  facilities
food  --  facilities
hotel  --  hotel
location  --  residence
bedding  --  meals
hotel  --  pool
staff  --  rooms
hotel  --  management
hotel  --  meals
hotel  --  employees
prices  --  prices
room  --  room
staff  --  reception
hotel  --  hotel
hotel  --  position
prices  --  prices
hotel  --  hotel
location  --  location
prices  --  hotel
internet  --  internet
hotel  --  hotel
am

# Mise en forme de la sortie du modèle

In [None]:
def parse_model_output(output):
    """Parse a generated string into its four components.

    Args:
        output: Text shaped like "main aspect: X extracted aspects: A, B
            extracted opinions: O, P global polarity: Q".

    Returns:
        Dict with the keys 'main_aspect' (str or None), 'extracted_aspects'
        (list without duplicates, first-seen order), 'extracted_opinions'
        (list, duplicates kept) and 'global_polarity' (str or None). A
        section that cannot be found yields None or an empty list.
    """
    def _capture(pattern):
        found = re.search(pattern, output)
        return found.group(1) if found else None

    def _as_items(captured):
        if captured is None:
            return []
        return [piece.strip() for piece in captured.split(",")]

    main_aspect = _capture(r"main aspect: ([^,]+?)(?= extracted aspects:)")
    aspects = _as_items(_capture(r"extracted aspects: ([^,]+?(?:, [^,]+)*)(?= extracted opinions:)"))
    opinions = _as_items(_capture(r"extracted opinions: ([^,]+?(?:, [^,]+)*)(?= global polarity:)"))
    global_polarity = _capture(r"global polarity: (\w+)")

    return {
        "main_aspect": main_aspect,
        # dict.fromkeys removes repeated aspects while keeping their order.
        "extracted_aspects": list(dict.fromkeys(aspects)),
        "extracted_opinions": opinions,
        "global_polarity": global_polarity
    }

In [None]:
def generate_single_prediction(model, tokenizer, text, max_length=150):
    """Generate the decoded model output for a single review text.

    Args:
        model: Seq2seq model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer used both to encode the input and to decode the
            generated ids.
        text: Raw review text; it is prefixed with "review: " before
            encoding.
        max_length: Used as the input padding/truncation length and as the
            maximum generation length.

    Returns:
        The decoded prediction string, special tokens removed.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level `device`
    # global, which may be undefined or stale when this function is called.
    target_device = next(model.parameters()).device
    formatted_text = f"review: {text}"
    input_encoding = tokenizer(
        formatted_text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(target_device)

    output = model.generate(
        input_ids=input_encoding.input_ids,
        attention_mask=input_encoding.attention_mask,
        max_length=max_length
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
# Use the GPU when one is available, otherwise the CPU, and move the model there.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Phrases simples de test

In [None]:
sentences = [
    "The service was exceptionally friendly and efficient.",
    "Their new dessert menu is both inventive and tasty.",
    "The ambiance of the restaurant was cozy and inviting.",
    "I found the pasta to be wonderfully flavorful and perfectly cooked.",
    "The coffee was rich in aroma and perfectly brewed.",
    "The steak was tender, juicy, and cooked to perfection.",
    "The vegetarian options were both varied and delightful.",
    "The sushi rolls were fresh and expertly prepared.",
    "The wine selection complemented our meal beautifully.",
    "The presentation of the dishes was elegant and appealing.",
    "The pizza had a perfectly crispy crust and fresh toppings.",
    "The waiter provided excellent recommendations and attentive service.",
    "The spicy dishes had just the right amount of heat.",
    "The homemade bread was warm and delicious.",
    "The view from the dining room was breathtaking.",
    "The restaurant's atmosphere was modern and chic.",
    "The salad was crisp, fresh, and well-dressed.",
    "The seafood was extremely fresh and succulent.",
    "The chocolate cake was decadently rich and moist.",
    "The portion sizes were generous and satisfying.",
    "The soups were aromatic and bursting with flavor.",
    "The buffet had a great variety of choices.",
    "The outdoor seating area was pleasant and comfortable.",
    "The chicken was tender and seasoned perfectly.",
    "The homemade pasta was a standout dish.",
    "The cocktails were creative and well-crafted.",
    "The burgers were juicy and packed with flavor.",
    "The brunch menu had excellent choices for everyone.",
    "The tea selection was impressive and well-curated.",
    "The crepes were thin, light, and delicious.",
    "The ribs were fall-off-the-bone tender.",
    "The vegan dishes were innovative and tasty.",
    "The lamb was cooked to tender perfection.",
    "The ice cream was creamy and full of flavor.",
    "The dining experience was relaxing and enjoyable.",
    "The pastries were flaky, buttery, and delightful.",
    "The staff made us feel very welcome.",
    "The smoothies were fresh and perfectly blended.",
    "The appetizers were a great start to the meal.",
    "The decor added a unique charm to the restaurant.",
    "The pancakes were fluffy and delicious.",
    "The local ingredients made a noticeable difference.",
    "The curry had a perfect balance of spices.",
    "The atmosphere was lively and engaging.",
    "The kids' menu had great options.",
    "The sandwiches were fresh and very filling.",
    "The salads were an explosion of flavors.",
    "The duck was cooked to a crispy finish.",
    "The themed decor created a fun dining experience.",
    "The restaurant's cleanliness was top-notch.",
    "The nachos were loaded and flavorful.",
    "The tapas were the perfect size for sharing.",
    "The grilled vegetables were seasoned beautifully.",
    "The risotto was creamy and perfectly cooked.",
    "The fresh juices were a refreshing treat.",
    "The vegan burger was surprisingly delicious.",
    "The omelets were fluffy and packed with ingredients.",
    "The traditional dishes were authentic and tasty.",
    "The gluten-free options were plentiful and varied.",
    "The restaurant had a great energy and vibe.",
    "The cheesecake was smooth and creamy.",
    "The hot chocolate was rich and decadent.",
    "The artisanal cheeses were a delightful treat.",
    "The fish tacos were light and flavorful.",
    "The staff's attention to detail was impressive.",
    "The French fries were crispy and well-seasoned.",
    "The cocktail list was innovative and exciting.",
    "The breakfast burrito was hearty and satisfying.",
    "The location of the restaurant was convenient and accessible.",
    "The dim sum was authentic and delicious.",
    "The craft beer selection was excellent.",
    "The fresh fruit platter was beautifully arranged.",
    "The restaurant's ambiance was romantic and serene.",
    "The quiche was light and packed with flavor.",
    "The gourmet pizza options were unique and tasty.",
    "The seafood pasta was rich and flavorful.",
    "The Korean BBQ was an interactive and fun experience.",
    "The organic salad was fresh and nutritious.",
    "The bistro setting was quaint and charming.",
    "The homemade soups were comforting and hearty.",
    "The stir-fry was vibrant and packed with veggies.",
    "The outdoor patio had a great view and atmosphere.",
    "The tap water was refreshing and had a clean taste.",
    "The fine dining experience was unforgettable.",
    "The farm-to-table concept was evident in the freshness of the food.",
    "The seasonal menu always has something new and exciting.",
    "The Greek salad was authentic and refreshing.",
    "The sushi bar offered a wide variety of fresh options.",
    "The staff went above and beyond to ensure a great experience.",
    "The paella was packed with seafood and flavor.",
    "The gelato flavors were unique and delicious.",
    "The craft cocktails were a highlight of the evening.",
    "The vegetable curry was hearty and satisfying.",
    "The French toast was sweet and perfectly cooked.",
    "The farm-fresh eggs made a difference in taste.",
    "The BBQ chicken pizza was a perfect blend of flavors.",
    "The artisan breads were a great accompaniment to the meal.",
    "The ambiance made for a perfect date night.",
    "The Thai noodles were spicy and full of flavor.",
    "The gourmet sandwiches were a delightful lunch option."
]
# Run the model on every sample sentence, print the raw generation and keep
# the parsed result.
outputs = []
separator = "-"*50
for sentence in sentences:
    prediction = generate_single_prediction(model, tokenizer, sentence, max_length=150)
    print(separator)
    print("text : ", sentence)
    print(prediction)
    outputs.append(parse_model_output(prediction))
    print(separator)

--------------------------------------------------
text :  The service was exceptionally friendly and efficient.
main aspect: service extracted aspects: service extracted opinions: friendly, efficient global polarity: positive
--------------------------------------------------
--------------------------------------------------
text :  Their new dessert menu is both inventive and tasty.
main aspect: dessert menu extracted aspects: dessert menu, menu, inventive extracted opinions: inventive global polarity: positive
--------------------------------------------------
--------------------------------------------------
text :  The ambiance of the restaurant was cozy and inviting.
main aspect: restaurant extracted aspects: ambiance, restaurant extracted opinions: inviting global polarity: positive
--------------------------------------------------
--------------------------------------------------
text :  I found the pasta to be wonderfully flavorful and perfectly cooked.
main aspect: pasta 

In [None]:
# Generate, print and parse a prediction for every review of the test split;
# `sentences` keeps the source texts aligned with `outputs`.
outputs, sentences = [], []
separator = "-"*50
for _, row in test_df.iterrows():
    review_text = row["text"]
    prediction = generate_single_prediction(model, tokenizer, review_text, max_length=150)

    print(separator)
    print("Text:", review_text)
    print("Prediction:", prediction)

    outputs.append(parse_model_output(prediction))
    sentences.append(review_text)
    print(separator)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
--------------------------------------------------
Text: the hotel is average, the business hall is fantastic - you need to update
Prediction: main aspect: hotel extracted aspects: business, hall, hotel extracted opinions: fantastic global polarity: positive
--------------------------------------------------
--------------------------------------------------
Text: the rooms weren't at the desired level, and we found a lot of bugs in the rooms, and the food level wasn't good either.
Prediction: main aspect: rooms extracted aspects: level, lot, rooms, food, bugs extracted opinions: desired, good global polarity: negative
--------------------------------------------------
--------------------------------------------------
Text: the only bad thing to try is to get to the hotel entrance by taxi, not by a navigator.
Prediction: main aspect: hotel extracted aspects: entrance, navigator, thing, taxi, ho

In [None]:
# Both counts should match: one parsed output per source text.
print(len(outputs))
print(len(sentences))

2885
2885


# Mise en forme de la sortie finale

In [None]:
# Refine the T5 generations: NLTK tokenisation to pair aspects with opinions, TextBlob for sentiment
def find_closest_opinion(segment, aspect, opinions):
    """Return the opinion located closest to *aspect* inside *segment*.

    The segment is tokenised with ``nltk.word_tokenize``. Aspect and opinions
    are matched as whole-token sequences, so multi-word phrases are
    supported. Distance is the smallest token gap between the first
    occurrence of the aspect and the first occurrence of the opinion. On a
    tie the opinion listed first wins.

    Args:
        segment: Text in which to search.
        aspect: Aspect phrase.
        opinions: Iterable of candidate opinion phrases.

    Returns:
        The closest opinion found in the segment, or None when no opinion
        occurs in it.
    """
    words = nltk.word_tokenize(segment)

    def _first_span(phrase):
        # (start, end) token indices of the first occurrence of phrase,
        # or None when it does not occur as a token sequence.
        tokens = phrase.split()
        width = len(tokens)
        if width == 0:
            return None
        for start in range(len(words) - width + 1):
            if words[start:start + width] == tokens:
                return start, start + width - 1
        return None

    aspect_span = _first_span(aspect)
    if aspect_span is None:
        # The aspect may only be a substring of a token (e.g. "room" inside
        # "rooms"): anchor on the first such token. When nothing matches,
        # keep the historical -1 anchor so results stay unchanged.
        anchor = next(
            (i for i, word in enumerate(words) if aspect and aspect in word),
            -1,
        )
        aspect_span = (anchor, anchor)

    closest_opinion = None
    min_distance = float('inf')
    for opinion in opinions:
        opinion_span = _first_span(opinion)
        if opinion_span is None:
            continue
        # Smallest gap between any end of the two spans; for single tokens
        # this is simply abs(opinion_index - aspect_index).
        distance = min(
            abs(opinion_edge - aspect_edge)
            for opinion_edge in opinion_span
            for aspect_edge in aspect_span
        )
        if distance < min_distance:
            min_distance = distance
            closest_opinion = opinion
    return closest_opinion

def analyze_sentiment(segment):
    """Label *segment* as "positive", "negative" or "neutral".

    The label follows the sign of the TextBlob polarity score. A falsy
    segment (empty string, ``None``) is reported as "neutral" without
    running TextBlob.
    """
    if not segment:
        return "neutral"
    polarity = TextBlob(segment).sentiment.polarity
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"
# Build one record per text from the whole-text predictions, print a report
# for each of them and dump all records to a JSON file.
data = []
for text, output in zip(sentences, outputs):
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = text.replace(',', '.')
    extracted_aspects = output["extracted_aspects"]
    extracted_opinions = output["extracted_opinions"]

    aspect_opinion_sentiments = []
    for phrase in sent_tokenize(text_sans_virgules):
        for aspect in extracted_aspects:
            if aspect not in phrase:
                continue
            closest_opinion = find_closest_opinion(phrase, aspect, extracted_opinions)
            # Skip aspects without a nearby opinion and terms paired with themselves.
            if not closest_opinion or aspect == closest_opinion:
                continue
            aspect_opinion_sentiments.append(
                (aspect, closest_opinion, analyze_sentiment(phrase))
            )

    print(text_sans_virgules)
    print("-"*50)
    print("Main Aspect:", output["main_aspect"])
    print("Extracted Aspects:", extracted_aspects)
    print("Extracted Opinions:", extracted_opinions)
    print("Global Polarity:", output["global_polarity"])
    print("-"*50)
    print("Aspect-Opinion Sentiments:")
    # Stable sort on the aspect only: triplets sharing an aspect keep their
    # order of discovery.
    sorted_aspect_opinion_sentiments = list(aspect_opinion_sentiments)
    sorted_aspect_opinion_sentiments.sort(key=lambda triplet: triplet[0])
    for tup in sorted_aspect_opinion_sentiments:
        print(tup)
    print("*"*50)

    data.append({
        "text": text_sans_virgules,
        "main_aspect": output["main_aspect"],
        "extracted_aspects": extracted_aspects,
        "extracted_opinions": extracted_opinions,
        "global_polarity": output["global_polarity"],
        "aspect_opinion_sentiments": sorted_aspect_opinion_sentiments,
    })

with open('/content/drive/MyDrive/Master 2/output.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [None]:
# Generate one prediction per test text (whole text at once) and keep the
# texts and their parsed predictions in two aligned lists.
outputs, sentences = [], []
for _, row in test_df.iterrows():
    text = row["text"]
    prediction = generate_single_prediction(model, tokenizer, text, max_length=150)

    print("-"*50)
    print("Text:", text)
    print("Prediction:", prediction)

    outputs.append(parse_model_output(prediction))
    sentences.append(text)
    print("-"*50)

In [None]:
# Sentence-level generation: each test text is split into phrases and the
# model is queried once per phrase. `outputs` holds, for every text, the list
# of parsed predictions of its phrases; `sentences` holds the original texts.
outputs, sentences = [], []

for _, row in test_df.iterrows():
    original_text = row["text"]
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = original_text.replace(',', '.')
    phrases = sent_tokenize(text_sans_virgules)
    print("*"*50)
    print("text :", text_sans_virgules)

    predictions_par_phrase = []
    for phrase in phrases:
        prediction = generate_single_prediction(model, tokenizer, phrase, max_length=150)
        predictions_par_phrase.append(parse_model_output(prediction))

        print("Phrase:", phrase)
        print("Prediction:", prediction)
        print("-"*50)

    outputs.append(predictions_par_phrase)
    sentences.append(original_text)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Prediction: main aspect: views extracted aspects: views, view, kaaba extracted opinions: neutral global polarity: neutral
--------------------------------------------------
Phrase: the hotel's entrance is good from the back and not directly on the haram.
Prediction: main aspect: hotel extracted aspects: back, entrance, haram, hotel extracted opinions: good global polarity: positive
--------------------------------------------------
**************************************************
text : i stayed at this hotel for three nights in september.
Phrase: i stayed at this hotel for three nights in september.
Prediction: main aspect: hotel extracted aspects: september, nights, hotel extracted opinions: neutral global polarity: positive
--------------------------------------------------
**************************************************
text : i checked out of the hotel half an hour after my arrival and

In [None]:
#
# Turn the sentence-level predictions into (aspect, opinion, sentiment)
# triplets, print them and dump one record per text to a JSON file.
data = []
for text, predictions_par_phrase in zip(sentences, outputs):
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = text.replace(',', '.')
    phrases = sent_tokenize(text_sans_virgules)

    # A set removes the duplicates produced when several predictions yield
    # the same triplet.
    unique_triplets = set()

    # NOTE(review): every phrase-level prediction is matched against every
    # phrase of the text, not only the phrase it was generated from; confirm
    # this is intended.
    for prediction in predictions_par_phrase:
        main_aspect = prediction["main_aspect"]
        # Prefer the main aspect; fall back to the extracted aspects when the
        # model produced none ("nan" or empty).
        if main_aspect and main_aspect != "nan":
            aspects_to_use = [main_aspect]
        else:
            aspects_to_use = prediction["extracted_aspects"]

        for phrase in phrases:
            for aspect in aspects_to_use:
                if aspect not in phrase:
                    continue
                closest_opinion = find_closest_opinion(phrase, aspect, prediction["extracted_opinions"])
                if closest_opinion and aspect != closest_opinion:
                    # The sentiment is computed on the phrase itself rather
                    # than taken from the generated global polarity.
                    sentiment = analyze_sentiment(phrase)
                    unique_triplets.add((aspect, closest_opinion, sentiment))

    # Sort on the whole triplet: set iteration order depends on string
    # hashing, so sorting on the aspect alone would leave triplets sharing an
    # aspect in a run-dependent order.
    sorted_aspect_opinion_sentiments = sorted(unique_triplets)

    print(text_sans_virgules)
    print("-"*50)
    print("Aspect-Opinion Sentiments:")
    for tup in sorted_aspect_opinion_sentiments:
        print(tup)
    print("*"*50)

    # The record keeps the fields of the LAST phrase-level prediction. Select
    # it explicitly instead of relying on the leaked loop variable, which was
    # stale (or undefined) when a text produced no phrase.
    if predictions_par_phrase:
        last_prediction = predictions_par_phrase[-1]
        main_aspect = last_prediction["main_aspect"]
        extracted_aspects = last_prediction["extracted_aspects"]
        extracted_opinions = last_prediction["extracted_opinions"]
        global_polarity = last_prediction["global_polarity"]
    else:
        main_aspect = None
        extracted_aspects = []
        extracted_opinions = []
        global_polarity = None

    data_dict = {
        "text": text_sans_virgules,
        "main_aspect": main_aspect,
        "extracted_aspects": extracted_aspects,
        "extracted_opinions": extracted_opinions,
        "global_polarity": global_polarity,
        "aspect_opinion_sentiments": sorted_aspect_opinion_sentiments
    }
    data.append(data_dict)

with open('/content/drive/MyDrive/Master 2/output.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
--------------------------------------------------
Aspect-Opinion Sentiments:
('hotel', 'hopes', 'positive')
('hotel', 'wrong', 'negative')
('nothing', 'wrong', 'negative')
('restaurant', 'acceptable', 'neutral')
('rooms', 'good', 'positive')
('staff', 'helpful', 'neutral')
**************************************************
there are many chairs in the sun. it was a really nice holiday!! we will definitely get back to this hotel.
--------------------------------------------------
Aspect-Opinion Sentiments:
('hotel', 'definitely', 'neutral')
**************************************************
and the design of the rooms is wonderful and spacious and modern in terms of furniture and design. in addition to equipping them with all the requirements needed by the tourist
--------------------------------------------------
Aspect-Opinion Sentiments:
('furniture', 'wonderful', 'positive')
****************

# Test de généralisation sur un dataset de triplets

In [None]:
# Fetch the 'default' configuration of the ASTE triplet dataset from the Hugging Face Hub.
dataset = load_dataset(path='NEUDM/aste-data-v2', name='default')

Downloading data:   0%|          | 0.00/2.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/694k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Only the test split is used for the generalisation experiment.
dataset_df = pd.DataFrame(data=dataset["test"])

In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output.
dataset_df

Unnamed: 0,task_type,dataset,input,output,situation,label,extra,instruction
0,generation,aste-data-v2,"[Boot time is super fast , around anywhere fro...","[['Boot time', 'fast', 'positive']]",none,,,"Task: Extracting aspect terms ,their opinion w..."
1,generation,aste-data-v2,[tech support would not fix the problem unless...,"[['tech support', 'not fix', 'negative']]",none,,,"Task: Extracting aspect terms ,their opinion w..."
2,generation,aste-data-v2,[Set up was easy .],"[['Set up', 'easy', 'positive']]",none,,,"Task: Extracting aspect terms ,their opinion w..."
3,generation,aste-data-v2,[Did not enjoy the new Windows 8 and touchscre...,"[['Windows 8', 'not enjoy', 'negative'], ['tou...",none,,,"Task: Extracting aspect terms ,their opinion w..."
4,generation,aste-data-v2,[Other than not being a fan of click pads ( in...,"[['internal speakers', 'lousy', 'negative']]",none,,,"Task: Extracting aspect terms ,their opinion w..."
...,...,...,...,...,...,...,...,...
1463,generation,aste-data-v2,[My friend enjoyed the grilled Alaskan King Sa...,"[['grilled Alaskan King Salmon', 'enjoyed', 'p...",none,,,"Task: Extracting aspect terms ,their opinion w..."
1464,generation,aste-data-v2,[I had a taste of all three items on her plate...,"[['plate', 'superb', 'positive']]",none,,,"Task: Extracting aspect terms ,their opinion w..."
1465,generation,aste-data-v2,[Our server continued to be attentive througho...,"[['server', 'attentive', 'positive'], [""Ray 's...",none,,,"Task: Extracting aspect terms ,their opinion w..."
1466,generation,aste-data-v2,[While I could have done without the youth who...,"[['server', 'wonderful', 'positive'], ['food',...",none,,,"Task: Extracting aspect terms ,their opinion w..."


In [None]:
# Run the sentence-level pipeline on a few hand-written restaurant reviews
# and print the resulting (aspect, opinion, sentiment) triplets.
input_texts = [
    "oh speaking of bathroom. the mens bathroom was disgusting.",
    "The dessert ( we had a pear torte ) was good  but. once again. the staff was unable to provide appropriate drink suggestions.",
    "The dining experience was relaxing and enjoyable.",
    "The pastries were flaky, buttery, and delightful.",
    "The staff made us feel very welcome.",
    "The smoothies were fresh and perfectly blended.",
]
for input_text in input_texts:
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = input_text.replace(',', '.')
    phrases = sent_tokenize(text_sans_virgules)
    predictions_par_phrase = []
    print("*"*50)
    print("text :", text_sans_virgules)
    print("-"*50)
    for phrase in phrases:
        prediction = generate_single_prediction(model, tokenizer, phrase, max_length=150)
        parsed_output = parse_model_output(prediction)
        predictions_par_phrase.append(parsed_output)

        print("Phrase:", phrase)
        print("Prediction:", prediction)

    # A set removes the duplicates produced when several predictions yield
    # the same triplet.
    unique_triplets = set()

    # NOTE(review): every phrase-level prediction is matched against every
    # phrase of the text, not only the phrase it was generated from; confirm
    # this is intended.
    for prediction in predictions_par_phrase:
        main_aspect = prediction["main_aspect"]
        # Prefer the main aspect; fall back to the extracted aspects when the
        # model produced none ("nan" or empty).
        if main_aspect and main_aspect != "nan":
            aspects_to_use = [main_aspect]
        else:
            aspects_to_use = prediction["extracted_aspects"]

        for phrase in phrases:
            for aspect in aspects_to_use:
                if aspect not in phrase:
                    continue
                closest_opinion = find_closest_opinion(phrase, aspect, prediction["extracted_opinions"])
                if closest_opinion and aspect != closest_opinion:
                    sentiment = analyze_sentiment(phrase)
                    unique_triplets.add((aspect, closest_opinion, sentiment))

    # Sort on the whole triplet: set iteration order depends on string
    # hashing, so sorting on the aspect alone would leave triplets sharing an
    # aspect in a run-dependent order.
    sorted_aspect_opinion_sentiments = sorted(unique_triplets)

    print("-"*50)
    print("Aspect-Opinion Sentiments:")
    for tup in sorted_aspect_opinion_sentiments:
        print(tup)
    print("-"*50)

    print("*"*50)


**************************************************
text : oh speaking of bathroom. the mens bathroom was disgusting.
--------------------------------------------------
Phrase: oh speaking of bathroom.
Prediction: main aspect: bathroom extracted aspects: bathroom extracted opinions: neutral global polarity: negative
Phrase: the mens bathroom was disgusting.
Prediction: main aspect: bathroom extracted aspects: bathroom, mens extracted opinions: disgusting global polarity: negative
--------------------------------------------------
Aspect-Opinion Sentiments:
('bathroom', 'disgusting', 'negative')
--------------------------------------------------
**************************************************
**************************************************
text : The dessert ( we had a pear torte ) was good  but. once again. the staff was unable to provide appropriate drink suggestions.
--------------------------------------------------
Phrase: The dessert ( we had a pear torte ) was good  but.
Pre

In [None]:
# Sentence-level generation over the ASTE test split. Three aligned lists are
# filled: the source texts, the per-phrase parsed predictions and the
# ground-truth triplets.
outputs, sentences, triplets = [], [], []
for _, row in dataset_df.iterrows():
    # The text is the first element of the `input` field.
    source_text = row["input"][0]
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = source_text.replace(',', '.')
    phrases = sent_tokenize(text_sans_virgules)
    print("*"*50)
    print("text :", text_sans_virgules)

    # Generate and parse one prediction per phrase of the current text.
    predictions_par_phrase = []
    for phrase in phrases:
        prediction = generate_single_prediction(model, tokenizer, phrase, max_length=150)
        predictions_par_phrase.append(parse_model_output(prediction))

        print("Phrase:", phrase)
        print("Prediction:", prediction)
        print("-"*50)

    # The `output` field stores the triplets as the string form of a Python
    # list; literal_eval parses it without executing arbitrary code.
    triplet_list = ast.literal_eval(row["output"])
    print("Triplet:", triplet_list)

    outputs.append(predictions_par_phrase)
    sentences.append(source_text)
    triplets.append(triplet_list)
    print("-"*50)



[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Phrase: filthy bathroom .
Prediction: main aspect: bathroom extracted aspects: bathroom extracted opinions: filthy global polarity: negative
--------------------------------------------------
Triplet: [['Service', 'ok', 'negative'], ['Service', 'unfriendly', 'negative'], ['bathroom', 'filthy', 'negative']]
--------------------------------------------------
**************************************************
text : The bar drinks were Eh . ok to say the least .
Phrase: The bar drinks were Eh .
Prediction: main aspect: bar drinks extracted aspects: drinks, bar drinks, Eh extracted opinions: neutral global polarity: positive
--------------------------------------------------
Phrase: ok to say the least .
Prediction: main aspect: nan extracted aspects: extracted opinions: ok global polarity: neutral
--------------------------------------------------
Triplet: [['bar drinks', 'ok', 'negative']]
-------

In [None]:
# Show the parsed ground-truth triplets (one list of triplets per text).
print(triplets)

[[['Boot time', 'fast', 'positive']], [['tech support', 'not fix', 'negative']], [['Set up', 'easy', 'positive']], [['Windows 8', 'not enjoy', 'negative'], ['touchscreen functions', 'not enjoy', 'negative']], [['internal speakers', 'lousy', 'negative']], [['use', 'fast', 'positive'], ['use', 'light', 'positive'], ['use', 'simple', 'positive']], [['Works', 'well', 'positive'], ['apple OS', 'happy', 'positive']], [['features', 'not light and slim', 'positive']], [['log on', 'pleased', 'positive'], ['log on', 'fast', 'positive'], ['WiFi connection', 'pleased', 'positive'], ['WiFi connection', 'speedy', 'positive'], ['battery life', 'pleased', 'positive'], ['battery life', 'long', 'positive']], [['delete key', 'not yet discovered', 'negative']], [['interneting', 'difficult', 'negative']], [['priced', 'right', 'positive']], [['track pad', 'not very good', 'negative']], [['graphics', 'outstanding', 'positive']], [['mountain lion', 'slow', 'negative']], [['durability', 'Strong', 'positive'], 

In [None]:

# Turn the sentence-level predictions on the ASTE test split into
# (aspect, opinion, sentiment) triplets, keep them next to the ground truth
# for evaluation and dump one record per text to a JSON file.
data = []
triplets_predicted = []
triplets_ground_truth = []
for text, predictions_par_phrase, triplet_list in zip(sentences, outputs, triplets):
    # Commas are replaced by full stops before sentence splitting.
    text_sans_virgules = text.replace(',', '.')
    phrases = sent_tokenize(text_sans_virgules)

    # A set removes the duplicates produced when several predictions yield
    # the same triplet.
    unique_triplets = set()

    # NOTE(review): every phrase-level prediction is matched against every
    # phrase of the text, not only the phrase it was generated from; confirm
    # this is intended.
    for prediction in predictions_par_phrase:
        main_aspect = prediction["main_aspect"]
        # Prefer the main aspect; fall back to the extracted aspects when the
        # model produced none ("nan" or empty).
        if main_aspect and main_aspect != "nan":
            aspects_to_use = [main_aspect]
        else:
            aspects_to_use = prediction["extracted_aspects"]

        for phrase in phrases:
            for aspect in aspects_to_use:
                if aspect not in phrase:
                    continue
                closest_opinion = find_closest_opinion(phrase, aspect, prediction["extracted_opinions"])
                if closest_opinion and aspect != closest_opinion:
                    sentiment = analyze_sentiment(phrase)
                    unique_triplets.add((aspect, closest_opinion, sentiment))

    # Sort on the whole triplet: set iteration order depends on string
    # hashing, so sorting on the aspect alone would leave triplets sharing an
    # aspect in a run-dependent order.
    sorted_aspect_opinion_sentiments = sorted(unique_triplets)

    # Report and collect the triplets.
    print(text_sans_virgules)
    print("-"*50)
    print("Aspect-Opinion Sentiments:")
    for tup in sorted_aspect_opinion_sentiments:
        print(tup)
    triplets_predicted.append(list(sorted_aspect_opinion_sentiments))
    triplets_ground_truth.append(triplet_list)
    print("-"*50)
    print("Ground truth triplet: ",triplet_list)
    print("*"*50)

    # The record keeps the fields of the LAST phrase-level prediction. Select
    # it explicitly instead of relying on the leaked loop variable, which was
    # stale (or undefined) when a text produced no phrase.
    if predictions_par_phrase:
        last_prediction = predictions_par_phrase[-1]
        main_aspect = last_prediction["main_aspect"]
        extracted_aspects = last_prediction["extracted_aspects"]
        extracted_opinions = last_prediction["extracted_opinions"]
        global_polarity = last_prediction["global_polarity"]
    else:
        main_aspect = None
        extracted_aspects = []
        extracted_opinions = []
        global_polarity = None

    data_dict = {
        "text": text_sans_virgules,
        "main_aspect": main_aspect,
        "extracted_aspects": extracted_aspects,
        "extracted_opinions": extracted_opinions,
        "global_polarity": global_polarity,
        "aspect_opinion_sentiments": sorted_aspect_opinion_sentiments
    }
    data.append(data_dict)

# Save the records to a JSON file.
with open('/content/drive/MyDrive/Master 2/output.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Aspect-Opinion Sentiments:
('room', 'sure', 'positive')
--------------------------------------------------
Ground truth triplet:  [['bill', 'big', 'negative']]
**************************************************
Not only was the sushi fresh . they also served other entrees allowed each guest something to choose from and we all left happy ( try the duck !
--------------------------------------------------
Aspect-Opinion Sentiments:
('entrees', 'happy', 'positive')
('sushi', 'fresh', 'positive')
--------------------------------------------------
Ground truth triplet:  [['sushi', 'fresh', 'positive'], ['duck', 'try', 'positive']]
**************************************************
good variety but nothing surprising .
--------------------------------------------------
Aspect-Opinion Sentiments:
('variety', 'good', 'positive')
--------------------------------------------------
Ground truth triplet:  [

In [None]:
# Print the predicted triplets, then the ground-truth triplets, for a visual check.
for triplet_collection in (triplets_predicted, triplets_ground_truth):
    print(triplet_collection)

In [None]:
# Convert every ground-truth triplet to a tuple so that it is hashable and
# can be placed in a set. The loop variables no longer shadow the builtin
# `list`.
triplets_ground_truth = [
    [tuple(triplet) for triplet in text_triplets]
    for text_triplets in triplets_ground_truth
]
print(triplets_ground_truth)

[[('Boot time', 'fast', 'positive')], [('tech support', 'not fix', 'negative')], [('Set up', 'easy', 'positive')], [('Windows 8', 'not enjoy', 'negative'), ('touchscreen functions', 'not enjoy', 'negative')], [('internal speakers', 'lousy', 'negative')], [('use', 'fast', 'positive'), ('use', 'light', 'positive'), ('use', 'simple', 'positive')], [('Works', 'well', 'positive'), ('apple OS', 'happy', 'positive')], [('features', 'not light and slim', 'positive')], [('log on', 'pleased', 'positive'), ('log on', 'fast', 'positive'), ('WiFi connection', 'pleased', 'positive'), ('WiFi connection', 'speedy', 'positive'), ('battery life', 'pleased', 'positive'), ('battery life', 'long', 'positive')], [('delete key', 'not yet discovered', 'negative')], [('interneting', 'difficult', 'negative')], [('priced', 'right', 'positive')], [('track pad', 'not very good', 'negative')], [('graphics', 'outstanding', 'positive')], [('mountain lion', 'slow', 'negative')], [('durability', 'Strong', 'positive'), 

In [None]:
def calculate_tp_fp_fn(predicted, ground_truth):
    """Count exact-match true positives, false positives and false negatives.

    Both arguments are iterables of hashable items; they are compared as
    sets, so duplicates count once and an item only matches when it is
    strictly equal to a ground-truth item. The counts are also printed.

    Returns:
        A ``(tp, fp, fn)`` tuple of integers.
    """
    predicted_items = set(predicted)
    reference_items = set(ground_truth)
    tp = len(predicted_items & reference_items)
    fp = len(predicted_items - reference_items)
    fn = len(reference_items - predicted_items)
    print(" predicted :",predicted," -- ground_truth :", ground_truth," --- TP:",tp," - FP:", fp," - FN:", fn)
    print("*"*50)
    return tp, fp, fn

def calculate_jaccard(predicted, ground_truth):
    """Return the Jaccard index (intersection over union) of the two collections.

    Both arguments are compared as sets. When both are empty the union is
    empty and 0 is returned.
    """
    predicted_items = set(predicted)
    reference_items = set(ground_truth)
    union_size = len(predicted_items | reference_items)
    if union_size == 0:
        return 0
    return len(predicted_items & reference_items) / union_size

# Compute precision, recall, F1 and Jaccard for each text, then average each
# metric over all texts (every text has the same weight).
metrics = []
for predicted, ground_truth in zip(triplets_predicted, triplets_ground_truth):
    tp, fp, fn = calculate_tp_fp_fn(predicted, ground_truth)
    # Each ratio falls back to 0 when its denominator is empty.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    jaccard_score = calculate_jaccard(predicted, ground_truth)
    metrics.append((precision, recall, f1_score, jaccard_score))

sample_count = len(metrics)
if sample_count:
    average_precision = sum(m[0] for m in metrics) / sample_count
    average_recall = sum(m[1] for m in metrics) / sample_count
    average_f1_score = sum(m[2] for m in metrics) / sample_count
    average_jaccard_score = sum(m[3] for m in metrics) / sample_count
else:
    # Nothing to evaluate: report zeros instead of dividing by zero.
    average_precision = average_recall = average_f1_score = average_jaccard_score = 0

print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print(f"Average Jaccard Score: {average_jaccard_score}")


 predicted : [('Boot time', 'super', 'positive')]  -- ground_truth : [('Boot time', 'fast', 'positive')]  --- TP: 0  - FP: 1  - FN: 1
**************************************************
 predicted : [('tech support', 'support', 'neutral')]  -- ground_truth : [('tech support', 'not fix', 'negative')]  --- TP: 0  - FP: 1  - FN: 1
**************************************************
 predicted : [('Set up', 'easy', 'positive')]  -- ground_truth : [('Set up', 'easy', 'positive')]  --- TP: 1  - FP: 0  - FN: 0
**************************************************
 predicted : [('Windows 8', 'enjoy', 'negative')]  -- ground_truth : [('Windows 8', 'not enjoy', 'negative'), ('touchscreen functions', 'not enjoy', 'negative')]  --- TP: 0  - FP: 1  - FN: 2
**************************************************
 predicted : [('notebook', 'like', 'negative'), ('speakers', 'lousy', 'negative'), ('things', 'hard', 'negative')]  -- ground_truth : [('internal speakers', 'lousy', 'negative')]  --- TP: 0  - FP: 3  

# Conclusion

Cette généralisation met en évidence les faiblesses de la méthode, notamment sa difficulté à identifier des aspects inédits. De plus, l'utilisation de NLTK pour le traitement du langage naturel présente des limitations, étant moins adaptée pour des tâches complexes ou spécifiques. Les imperfections dans la génération de triplets, particulièrement dans l'identification précise des aspects et des opinions, sont amplifiées par les contraintes des outils et méthodes utilisés. Ces défauts deviennent plus apparents lors des tentatives d'affinement ou d'optimisation du processus.

Les problèmes liés aux données de base sont également critiques : la qualité et la diversité des données influencent grandement la performance des modèles ABSA. Les données limitées ou biaisées peuvent mener à des prédictions peu fiables, et les modèles pourraient peiner à généraliser à partir de ces données.

L'approche adoptée pour généraliser à partir des listes d'aspects et d'opinions de chaque texte est certes audacieuse mais peut être ambiguë, compte tenu de la complexité et de la subtilité du langage naturel. Cela peut conduire à des interprétations erronées des sentiments exprimés.

Pour améliorer cette approche, il serait judicieux d'envisager des techniques plus sophistiquées, voire des modèles de langage pré-entraînés comme GPT. Ces technologies sont plus aptes à gérer la complexité du langage naturel et à identifier des aspects et opinions nouveaux.
En outre, un ensemble de données plus riche et diversifié pourrait favoriser une meilleure généralisation des modèles. Partir d'un dataset pauvre en relations, en raison de la nécessité de disposer d'une quantité suffisante de données, réduit drastiquement la qualité de ce dernier.