In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
transcription_sex_df = pd.read_csv('data/transcriptions_with_sex.csv')
firstname_sex_df = pd.read_csv('data/firstname_with_sex.csv', sep=";")

## Créations de nouvelles instances à partir du fichier de prénoms

In [4]:
def common_sex(row):
    sex = str(np.where(row['male'] > row['female'], 'masculin', 'feminin'))
    if sex == 'masculin':
        proba = row['male'] / (row['female'] + row['male'])
    else:
        proba = row['female'] / (row['female'] + row['male'])
    return sex, proba
    
firstname_sex_df['sex'], firstname_sex_df['proba'] = tuple(pd.Series(data=[t[i] for t in firstname_sex_df.apply(common_sex, axis=1)]) for i in range(len(firstname_sex_df.apply(common_sex, axis=1).iloc[0])))

In [5]:
firstname_sex_df['sentence'] = firstname_sex_df.apply(lambda row : f"{row['firstname']} est un prénom porté à {round(100*row['proba'],2)}% par des personnes de sexe", axis=1)

In [6]:
firstname_sex_df['sentence'][0]

'marie est un prénom porté à 99.58% par des personnes de sexe'

## Entraînement sans utiliser les exemples ambigus

In [33]:
class TextDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length, label_encoder=None):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.targets_encoded = self.label_encoder.fit_transform(targets)
        else:
            self.label_encoder = label_encoder
            self.targets_encoded = self.label_encoder.transform(targets)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        target_encoded = self.targets_encoded[idx]
        
        inputs = self.tokenizer(text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        
        return input_ids, attention_mask, torch.tensor(target_encoded) 


mapping = {'homme': 'masculin', 'femme': 'feminin', 'ambigu': 'ambigu'}
X_train, X_test, y_train, y_test = train_test_split(transcription_sex_df['prediction'].values, transcription_sex_df['sex'].map(mapping).values, test_size=0.2)
indices = np.where(y_train != 'ambigu')[0]
print(len(y_train) - len(indices))
X_train = X_train[indices]
y_train = y_train[indices]
X_train = list(X_train) + list(firstname_sex_df['sentence'].values)
y_train = list(y_train) + list(firstname_sex_df['sex'].values)

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased') 
max_length = 50

label_encoder = LabelEncoder() 
label_encoder.fit(y_test) 

train_dataset = TextDataset(X_train, y_train, tokenizer, max_length, label_encoder=label_encoder)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TextDataset(X_test, y_test, tokenizer, max_length, label_encoder=label_encoder)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)


8


In [36]:
model_name = "bert-base-multilingual-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [37]:
num_epochs = 6
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fct = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()  
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()

        _, predicted = torch.max(logits, 1)  
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        
        loss.backward()
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss} / Accuracy on the train set: {accuracy}")


Epoch 1/6, Average Training Loss: 0.4286242856669152 / Accuracy on the train set: 0.8004207573632539
Epoch 2/6, Average Training Loss: 0.25552917063348995 / Accuracy on the train set: 0.8925666199158485
Epoch 3/6, Average Training Loss: 0.18678646446560665 / Accuracy on the train set: 0.9251051893408134
Epoch 4/6, Average Training Loss: 0.13518956766229942 / Accuracy on the train set: 0.9510518934081347
Epoch 5/6, Average Training Loss: 0.09688296085333184 / Accuracy on the train set: 0.9652173913043478
Epoch 6/6, Average Training Loss: 0.07139564866386458 / Accuracy on the train set: 0.9746143057503507


In [38]:
model.eval()
total_correct = 0
total_samples = 0
total_loss = 0

loss_fct = torch.nn.CrossEntropyLoss()  

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        labels = labels.to(torch.long)
        
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()
        
        _, predicted = torch.max(logits, 1) 
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
average_loss = total_loss / len(test_dataloader)
print(f"Accuracy on the test set: {accuracy}")
print(f"Average Loss on the test set: {average_loss}")


Accuracy on the test set: 0.9387755102040817
Average Loss on the test set: 0.23971509115238274


### En gelant les poids initialisés du BERT (sauf le classifier évidemment)

In [16]:
model_name = "bert-base-multilingual-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
num_epochs = 8
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fct = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train() 
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()

        _, predicted = torch.max(logits, 1)  
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        
        loss.backward()
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss} / Accuracy on the train set: {accuracy}")


Epoch 1/8, Average Training Loss: 0.894007900637896 / Accuracy on the train set: 0.48835904628330995
Epoch 2/8, Average Training Loss: 0.7601979710996953 / Accuracy on the train set: 0.49663394109396913
Epoch 3/8, Average Training Loss: 0.7304985786499999 / Accuracy on the train set: 0.5046283309957924
Epoch 4/8, Average Training Loss: 0.7210308641462583 / Accuracy on the train set: 0.5074333800841515
Epoch 5/8, Average Training Loss: 0.7166003268796767 / Accuracy on the train set: 0.520617110799439
Epoch 6/8, Average Training Loss: 0.7108411114044788 / Accuracy on the train set: 0.5230014025245442
Epoch 7/8, Average Training Loss: 0.709268730384352 / Accuracy on the train set: 0.514586255259467
Epoch 8/8, Average Training Loss: 0.7078663926621723 / Accuracy on the train set: 0.5200561009817671


In [18]:
model.eval()  
total_correct = 0
total_samples = 0
total_loss = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()
        
        _, predicted = torch.max(logits, 1)
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
average_loss = total_loss / len(test_dataloader)
print(f"Accuracy on the test set: {accuracy}")
print(f"Average Loss on the test set: {average_loss}")


Accuracy on the test set: 0.5102040816326531
Average Loss on the test set: 0.7926105090550014


## Entraînement avec les exemples ambigus

In [25]:
class TextDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length, label_encoder=None):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.targets_encoded = self.label_encoder.fit_transform(targets)
        else:
            self.label_encoder = label_encoder
            self.targets_encoded = self.label_encoder.transform(targets)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        target_encoded = self.targets_encoded[idx]
        
        inputs = self.tokenizer(text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        
        return input_ids, attention_mask, torch.tensor(target_encoded)  
    

mapping = {'homme': 'masculin', 'femme': 'feminin', 'ambigu': 'ambigu'}
X_train, X_test, y_train, y_test = train_test_split(transcription_sex_df['prediction'].values, transcription_sex_df['sex'].map(mapping).values, test_size=0.2)
X_train = list(X_train) + list(firstname_sex_df['sentence'].values)
y_train = list(y_train) + list(firstname_sex_df['sex'].values)

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased') 
max_length = 50

label_encoder = LabelEncoder() 
label_encoder.fit(y_test)  

train_dataset = TextDataset(X_train, y_train, tokenizer, max_length, label_encoder=label_encoder)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TextDataset(X_test, y_test, tokenizer, max_length, label_encoder=label_encoder)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)


In [26]:
model_name = "bert-base-multilingual-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [27]:
num_epochs = 6
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fct = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()  
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()

        _, predicted = torch.max(logits, 1)
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        
        loss.backward()
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss} / Accuracy on the train set: {accuracy}")


Epoch 1/6, Average Training Loss: 0.4077452677974742 / Accuracy on the train set: 0.8260016811431774
Epoch 2/6, Average Training Loss: 0.26845205455501164 / Accuracy on the train set: 0.891846455589801
Epoch 3/6, Average Training Loss: 0.21105752439339293 / Accuracy on the train set: 0.9159428411319698
Epoch 4/6, Average Training Loss: 0.15650675323070104 / Accuracy on the train set: 0.9380778929672177
Epoch 5/6, Average Training Loss: 0.11234018444928391 / Accuracy on the train set: 0.9564303726534044
Epoch 6/6, Average Training Loss: 0.090800859912065 / Accuracy on the train set: 0.9662370411880078


In [28]:
model.eval() 
total_correct = 0
total_samples = 0
total_loss = 0

loss_fct = torch.nn.CrossEntropyLoss() 

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        labels = labels.to(torch.long)
        
        loss = loss_fct(logits, labels)
        
        total_loss += loss.item()
        
        _, predicted = torch.max(logits, 1)
        
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
average_loss = total_loss / len(test_dataloader)
print(f"Accuracy on the test set: {accuracy}")
print(f"Average Loss on the test set: {average_loss}")


Accuracy on the test set: 0.9795918367346939
Average Loss on the test set: 0.08515478331329566
