In [None]:
import pandas as pd

# Load the tab-separated dataset
dataset_path = '/content/MentalQA_500_data.csv'
data = pd.read_csv(dataset_path, sep='\t')

# Build one "question answer" string per row and join all rows into a single text.
# Values are cast to str first, so missing cells contribute the literal text 'nan'.
all_text = ' '.join(
    f"{question} {answer}"
    for question, answer in zip(data['question'].astype(str), data['answer'].astype(str))
)

# Whitespace-separated tokens, de-duplicated; no lowercasing or punctuation stripping is applied
vocabulary = set(all_text.split())
vocab_size = len(vocabulary)

# Report the vocabulary size
print(f"Taille du vocabulaire unique : {vocab_size}")


In [None]:
import pandas as pd

# Load the tab-separated dataset (the path and separator must match the uploaded file)
dataset_path = '/content/MentalQA_500_data.csv'
data = pd.read_csv(dataset_path, sep='\t')

# Plain-text preview of the first rows
preview = data.head()
print(preview)


In [None]:
# Pull the question and answer columns out of the DataFrame as plain Python lists
questions = data['question'].to_list()
answers = data['answer'].to_list()

# Zero-based class index taken from the third character of each 'final_AS' value.
# NOTE(review): assumes every value is a string shaped like "['1']" whose character at
# index 2 is a single digit; any further labels in the value are ignored — TODO confirm.
labels = [int(raw_label[2]) - 1 for raw_label in data['final_AS']]

In [None]:
!pip install sentence_transformers

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Load the pretrained XLM-RoBERTa tokenizer and a sequence-classification model
# with a 3-way output head, both from the same checkpoint
pretrained_checkpoint = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_checkpoint)
model = XLMRobertaForSequenceClassification.from_pretrained(
    pretrained_checkpoint,
    num_labels=3,
)

In [None]:
# Load the Sentence-BERT model used to score how relevant each sentence is to the question
semantic_model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
semantic_model = SentenceTransformer(semantic_model_name)

# Dataset that reorders each text by semantic relevance and splits it into model-sized segments
class SegmentedDataset(Dataset):
    """Token-segment dataset for sequence classification.

    Each example is reordered sentence-by-sentence according to its cosine
    similarity with the question (see ``analyse_semantique``), tokenized, and
    cut into segments of at most ``max_length`` token ids. Every segment
    inherits the label of the example it came from, so ``len(dataset)`` is the
    number of segments, not the number of input texts.

    Args:
        texts: Sequence of question strings.
        labels: Sequence of integer class labels, one per text.
        tokenizer: Hugging Face tokenizer used to encode the text.
        max_length: Maximum number of token ids per segment, special tokens
            included. Shorter segments are padded up to this length.
        answers: Optional sequence of answer strings, one per text. When
            omitted, each text is used as both question and answer, which is
            what the dataset did before this parameter existed.

    Raises:
        ValueError: If ``answers`` is given with a different length than
            ``texts``, or if ``max_length`` leaves no room for content tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512, answers=None):
        if answers is not None and len(answers) != len(texts):
            raise ValueError(
                f"answers has {len(answers)} items but texts has {len(texts)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.answers = answers
        self.segments = []
        self.segment_labels = []
        self.prepare_data()

    def prepare_data(self):
        """Fill ``self.segments`` / ``self.segment_labels`` from the raw texts."""
        # Reserve room for the tokenizer's special tokens so that every segment,
        # not only the first and last one, is a well-formed model input.
        n_special = self.tokenizer.num_special_tokens_to_add(pair=False)
        body_length = self.max_length - n_special
        if body_length < 1:
            raise ValueError(
                f"max_length={self.max_length} leaves no room for content tokens"
            )

        answers = self.texts if self.answers is None else self.answers
        for question, answer, label in zip(self.texts, answers, self.labels):
            combined_text = self.analyse_semantique(question, answer)

            tokens = self.tokenizer.encode(
                combined_text, add_special_tokens=False, truncation=False
            )
            # max(..., 1) keeps an example whose text encodes to zero tokens:
            # it still yields one segment holding only the special tokens.
            for start in range(0, max(len(tokens), 1), body_length):
                chunk = tokens[start:start + body_length]
                self.segments.append(
                    self.tokenizer.build_inputs_with_special_tokens(chunk)
                )
                self.segment_labels.append(label)

    def analyse_semantique(self, question, answer):
        """Return the sentences of ``answer`` ordered by similarity to ``question``.

        The answer is split on '.', empty pieces are dropped, and the remaining
        sentences are joined with single spaces, most similar first. If the
        answer contains no non-empty sentence it is returned unchanged.
        """
        # Strip before filtering so whitespace-only pieces are dropped as well
        segments = [piece.strip() for piece in answer.split('.')]
        segments = [piece for piece in segments if piece]

        if not segments:
            return answer
        if len(segments) == 1:
            # Nothing to rank; skip the embedding computation
            return segments[0]

        # Encode the question and the sentences with Sentence-BERT
        question_embedding = semantic_model.encode(question, convert_to_tensor=True)
        segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

        # Row 0 holds the similarity of the single question to every sentence
        similarities = util.pytorch_cos_sim(question_embedding, segment_embeddings)[0].tolist()

        # Most relevant sentences first
        ranked = sorted(zip(similarities, segments), reverse=True)
        return ' '.join(sentence for _, sentence in ranked)

    def __len__(self):
        """Number of segments (not number of input texts)."""
        return len(self.segments)

    def __getitem__(self, idx):
        """Return one padded segment as a dict of ``input_ids``, ``attention_mask``, ``labels``."""
        input_ids = torch.tensor(self.segments[idx], dtype=torch.long)
        label = torch.tensor(self.segment_labels[idx])

        attention_mask = torch.ones_like(input_ids)

        padding_length = self.max_length - input_ids.size(0)
        if padding_length > 0:
            # Pad with the tokenizer's own pad id; a hard-coded 0 is a real token
            # in some vocabularies. Fall back to 0 only if no pad token is defined.
            pad_id = self.tokenizer.pad_token_id
            if pad_id is None:
                pad_id = 0
            input_ids = torch.cat(
                [input_ids, torch.full((padding_length,), pad_id, dtype=torch.long)], dim=0
            )
            attention_mask = torch.cat(
                [attention_mask, torch.zeros(padding_length, dtype=torch.long)], dim=0
            )

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }

In [None]:
# Build the segmented dataset from the question texts and their labels,
# then wrap it in a shuffling loader that yields batches of 8 segments
dataset = SegmentedDataset(
    texts=questions,
    labels=labels,
    tokenizer=tokenizer,
    max_length=512,
)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

In [None]:
# Optimizer: PyTorch's AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the defaults of the transformers
# implementation (1e-6 and 0.0) so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Linear decay to zero without warmup, sized for 3 passes over the dataloader.
# Training for more epochs than that runs the extra epochs at a learning rate of 0.
total_steps = len(dataloader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Use the GPU when one is available. Assigning the result keeps the notebook
# from printing the full model repr as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
from sklearn.metrics import f1_score, recall_score, accuracy_score

In [None]:
# Training loop.
# The learning-rate schedule is sized for 3 epochs (total_steps = len(dataloader) * 3);
# running more epochs than that would train the extra epochs with a learning rate of 0.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list is not overwritten
        batch_labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Collect predictions and targets as plain Python ints for the epoch metrics
        preds = torch.argmax(outputs.logits.detach(), dim=-1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())

    # Metrics over all training segments seen in this epoch (no held-out data is involved)
    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    avg_loss = total_loss / len(dataloader)
    print(f"Époque {epoch+1}/{num_epochs} - Perte Moyenne: {avg_loss:.4f} - Précision: {accuracy:.4f} - Rappel: {recall:.4f} - F1-score: {f1:.4f}")