In [26]:
## !pip install transformers

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
from transformers import pipeline

In [41]:
## !pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [17]:
transcription_sex_df = pd.read_csv('transcriptions_with_sex.csv')

In [52]:
model_name = "bert-base-multilingual-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  
tokenizer = AutoTokenizer.from_pretrained(model_name)


classifier = pipeline("zero-shot-classification", model="flaubert/flaubert_large_cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_large_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [24]:
def predict(input_text):
    class_labels = ["homme", "femme",]
    prompt = f"La personne décrite par : '{input_text}',  est de sexe : {', '.join(class_labels)}."
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    
    with torch.no_grad():
        logits = model(input_ids).logits
    probabilities = torch.softmax(logits, dim=-1).squeeze()
    return probabilities, class_labels

In [None]:
def predict_gender(row):
    probabilities, class_labels = predict(row['prediction'])
    return class_labels[torch.argmax(probabilities)]

In [None]:
transcription_sex_df['prediction_sex'] = transcription_sex_df.apply(predict_gender, axis=1)

In [53]:
def predict_sklearn(input_text, classifier):
    class_labels = ["masculin", "feminin"]
    result = classifier(input_text, candidate_labels=class_labels, hypothesis_template="Cette personne est de sexe {}.")
    predicted_label = result['labels'][0]
    mapping = {'masculin': 'homme', 'feminin': 'femme'}
    return mapping[predicted_label]

In [56]:
transcription_sex_df['prediction_sex'] = transcription_sex_df['prediction'].apply(predict_sklearn, classifier=classifier)

In [49]:
accuracy = sum(1 for x, y in zip(transcription_sex_df['prediction_sex'], transcription_sex_df['sex']) if x == y) / len(transcription_sex_df)

print(f"La précision du zero-shot classification est de : {round(100*accuracy,2)} %")

La précision du zero-shot classification est de : 51.04 %


In [59]:
models_to_test = [
    "bert-base-multilingual-cased",
    "flaubert/flaubert_large_cased",
    "facebook/bart-large",
    "camembert/camembert-large",
    "xlm-roberta-base"
]

for model_name in models_to_test:
    classifier = pipeline("zero-shot-classification", model=model_name)
    transcription_sex_df['prediction_sex'] = transcription_sex_df['prediction'].apply(predict_sklearn, classifier=classifier)
    accuracy = sum(1 for x, y in zip(transcription_sex_df['prediction_sex'], transcription_sex_df['sex']) if x == y) / len(transcription_sex_df)
    print(f"La précision du zero-shot classification pour {model_name} est de : {round(100*accuracy,2)} %")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


La précision du zero-shot classification pour bert-base-multilingual-cased est de : 56.85 %


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_large_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


La précision du zero-shot classification pour flaubert/flaubert_large_cased est de : 48.55 %


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


La précision du zero-shot classification pour facebook/bart-large est de : 45.64 %


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


La précision du zero-shot classification pour camembert/camembert-large est de : 50.62 %


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


La précision du zero-shot classification pour xlm-roberta-base est de : 50.21 %


In [7]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, classification_report

classifier = pipeline("zero-shot-classification", model="flaubert/flaubert_large_cased")

labels =  ["masculin", "feminin"]

predicted_sexes = []
for text in transcription_sex_df['prediction']:
    # Effectuer la classification
    result = classifier(text, candidate_labels=labels, hypothesis_template="Cette personne est de sexe {}.")
    predicted_label = result['labels'][0]    
    predicted_sexes.append(predicted_label)

transcription_sex_df['predicted_sex'] = predicted_sexes

label_mapping = {
    'masculin': 'homme',
    'feminin': 'femme'
}

transcription_sex_filtered = transcription_sex_df[transcription_sex_df['sex'] != 'ambigu']

# Appliquer le mapping pour harmoniser les labels prédits avec les labels originaux
transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)

accuracy = accuracy_score(transcription_sex_filtered['sex'], transcription_sex_filtered['predicted_sex_mapped'])
print(f"Précision du modèle: {accuracy}")
print(classification_report(transcription_sex_filtered['sex'], transcription_sex_filtered['predicted_sex_mapped'], target_names=["masculin", "feminin"]))

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_large_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Précision du modèle: 0.6077586206896551
              precision    recall  f1-score   support

    masculin       0.58      0.57      0.57       107
     feminin       0.63      0.64      0.64       125

    accuracy                           0.61       232
   macro avg       0.61      0.61      0.61       232
weighted avg       0.61      0.61      0.61       232



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)


In [8]:
## !pip install sacremoses

In [10]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


# Liste des modèles BERT à tester
models_to_test = [
    "bert-base-multilingual-cased",
    "camembert/camembert-base",
    "flaubert/flaubert_large_cased",
    "facebook/bart-large",
    "camembert/camembert-large"
]

# Labels pour la classification
labels = ["masculin", "feminin"]

# Mapping des labels prédits vers les labels originaux
label_mapping = {
    'masculin': 'homme',
    'feminin': 'femme'
}

# Itérer sur chaque modèle
for model_name in models_to_test:
    print(f"Évaluation du modèle: {model_name}")

    # Initialiser le pipeline de classification zero-shot
    classifier = pipeline("zero-shot-classification", model=model_name)

    # Classifier les textes
    predicted_sexes = []
    for text in transcription_sex_df['prediction']:
        result = classifier(text, candidate_labels=labels, hypothesis_template="Cette personne est de sexe {}.")
        predicted_label = result['labels'][0]
        predicted_sexes.append(predicted_label)

    # Ajouter les résultats au DataFrame
    transcription_sex_df['predicted_sex'] = predicted_sexes

    # Filtrer pour exclure les entrées 'ambigu'
    transcription_sex_filtered = transcription_sex_df[transcription_sex_df['sex'] != 'ambigu']

    # Appliquer le mapping des labels prédits
    transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)

    # Calculer la précision
    accuracy = accuracy_score(transcription_sex_filtered['sex'], transcription_sex_filtered['predicted_sex_mapped'])

    # Afficher les résultats
    print(f"Précision du modèle: {accuracy:.4f}")
    print(classification_report(transcription_sex_filtered['sex'], transcription_sex_filtered['predicted_sex_mapped'], target_names=list(label_mapping.values())))

    # Pour la lisibilité des résultats
    print("\n" + "="*50 + "\n")

Évaluation du modèle: bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)


Précision du modèle: 0.7069
              precision    recall  f1-score   support

       homme       0.71      0.62      0.66       107
       femme       0.71      0.78      0.74       125

    accuracy                           0.71       232
   macro avg       0.71      0.70      0.70       232
weighted avg       0.71      0.71      0.70       232



Évaluation du modèle: camembert/camembert-base


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['pr

Précision du modèle: 0.5388
              precision    recall  f1-score   support

       homme       0.00      0.00      0.00       107
       femme       0.54      1.00      0.70       125

    accuracy                           0.54       232
   macro avg       0.27      0.50      0.35       232
weighted avg       0.29      0.54      0.38       232



Évaluation du modèle: flaubert/flaubert_large_cased


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_large_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)


Précision du modèle: 0.4612
              precision    recall  f1-score   support

       homme       0.45      0.81      0.58       107
       femme       0.50      0.16      0.24       125

    accuracy                           0.46       232
   macro avg       0.48      0.49      0.41       232
weighted avg       0.48      0.46      0.40       232



Évaluation du modèle: facebook/bart-large


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)


Précision du modèle: 0.5431
              precision    recall  f1-score   support

       homme       0.57      0.04      0.07       107
       femme       0.54      0.98      0.70       125

    accuracy                           0.54       232
   macro avg       0.56      0.51      0.38       232
weighted avg       0.56      0.54      0.41       232



Évaluation du modèle: camembert/camembert-large


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Précision du modèle: 0.5302
              precision    recall  f1-score   support

       homme       0.49      0.79      0.61       107
       femme       0.63      0.31      0.42       125

    accuracy                           0.53       232
   macro avg       0.56      0.55      0.51       232
weighted avg       0.57      0.53      0.50       232





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcription_sex_filtered['predicted_sex_mapped'] = transcription_sex_filtered['predicted_sex'].map(label_mapping)
