In [89]:
import pandas as pd
import numpy as np

In [90]:
firstname = pd.read_csv('data/firstname_with_sex.csv', sep=';')
transcriptions = pd.read_csv('data/transcriptions_with_sex.csv', sep=',')

In [91]:
transcriptions = transcriptions[['prediction','sex']]

## Prénom seulement

### En utilisant seulement le fichier des prénoms

In [92]:
import re
regex_prenom = r"prénom:\s*([\wéèêëîïôöûüàâäÉÈÊËÎÏÔÖÛÜÀÂÄ'-]+(?:\s+[\wéèêëîïôöûüàâäÉÈÊËÎÏÔÖÛÜÀÂÄ'-]+)*)(?=\s+date|\s+lieux| )"

transcriptions['prenom_pred'] = transcriptions['prediction'].apply(lambda x: re.search(regex_prenom, x).group(1).lower() if re.search(regex_prenom, x) else '')

In [93]:
transcriptions = pd.merge(transcriptions, firstname, left_on='prenom_pred', right_on='firstname', how='left')

transcriptions['prediction_sex'] = np.where(transcriptions['male'] > 3*transcriptions['female'], 'homme', np.where(3*transcriptions['male'] < transcriptions['female'], 'femme', 'ambigu'))

In [94]:
accuracy = sum(1 for x, y in zip(transcriptions['prediction_sex'], transcriptions['sex']) if x == y) / len(transcriptions)

print(f"La précision du modèle se basant sur le fichier des prénoms est de : {round(100*accuracy,2)} %")

La précision du modèle se basant sur le fichier des prénoms est de : 81.33 %


### En ajoutant un détecteur de genre

In [95]:
## !pip install gender-guesser

In [96]:
import gender_guesser.detector as gender

In [97]:
d = gender.Detector(case_sensitive=False)
def predict_gender(row):
    if pd.isna(row['firstname']):
        return d.get_gender(row['prenom_pred'], u'france')
    else:
        return row['prediction_sex']

transcriptions['prediction_sex'] = transcriptions.apply(predict_gender, axis=1)

transcriptions['temp_prediction_sex'] = transcriptions['prediction_sex'].replace(['unknown','andy'], 'ambigu')
transcriptions['temp_prediction_sex'] = transcriptions['temp_prediction_sex'].replace('male', 'homme')
transcriptions['temp_prediction_sex'] = transcriptions['temp_prediction_sex'].replace('female', 'femme')

transcriptions['prediction_sex'] = transcriptions['prediction_sex'].replace('andy', 'ambigu')
transcriptions['prediction_sex'] = transcriptions['prediction_sex'].replace('male', 'homme')
transcriptions['prediction_sex'] = transcriptions['prediction_sex'].replace('female', 'femme')

In [98]:
accuracy = sum(1 for x, y in zip(transcriptions['temp_prediction_sex'], transcriptions['sex']) if x == y) / len(transcriptions)

print(f"La précision du modèle en rajoutant le package gender-guesser est de : {round(100*accuracy,2)} %")

La précision du modèle en rajoutant le package gender-guesser est de : 81.74 %


In [99]:
transcriptions.drop(columns=['temp_prediction_sex'], inplace=True)

## En cherchant le prénom le plus proche dans le fichier des prénoms

In [101]:
## !pip install fuzzywuzzy

In [102]:
from fuzzywuzzy import fuzz

In [103]:
choices = list(firstname['firstname'])

def pick_closest_firstname(target_name, choices):
    max_score = -1
    closest_name = None

    for name in choices:
        score = fuzz.ratio(target_name, name)
        if score > max_score:
            max_score = score
            closest_name = name

    return closest_name

def change_firstname(row):
    if row['prediction_sex'] == 'unknown':
        return pick_closest_firstname(row['prenom_pred'], choices)
    else:
        return row['prenom_pred']
    
transcriptions['prenom_pred'] = transcriptions.apply(change_firstname, axis=1)


In [104]:
transcriptions = pd.merge(transcriptions[['prediction', 'sex', 'prenom_pred','prediction_sex']], firstname, left_on='prenom_pred', right_on='firstname', how='left')

def change_prediction(row):
    if row['prediction_sex'] == 'unknown':
        return np.where(row['male'] > 3*row['female'], 'homme', np.where(3*row['male'] < row['female'], 'femme', 'ambigu'))
    else:
        return row['prediction_sex']


transcriptions['prediction_sex'] = transcriptions.apply(change_prediction, axis=1)

In [105]:
accuracy = sum(1 for x, y in zip(transcriptions['prediction_sex'], transcriptions['sex']) if x == y) / len(transcriptions)

print(f"La précision du modèle en rajoutant le package fuzzywuzzy est de : {round(100*accuracy,2)} %")

La précision du modèle en rajoutant le package fuzzywuzzy est de : 89.21 %


## Avec la relation également

In [106]:
regex_relation = r"relation:\s*([\w\s'-]+)(?=\s+employeur|\s+profession|\s+date_naissance|\s+lieux_naissance|$)"

transcriptions['relation'] = transcriptions['prediction'].apply(lambda x: re.search(regex_relation, x).group(1) if re.search(regex_relation, x) else '')

In [107]:
choices = ['fils', 'fille', 'femme', 'belle-mère']

def pick_closest_relation(target_name, choices):
    max_score = -1
    closest_relation = None

    for name in choices:
        score = fuzz.ratio(target_name, name)
        if score > max_score:
            max_score = score
            closest_relation = name

    return closest_relation, max_score

def change_relation(row):
    closest_relation, max_score = pick_closest_relation(row['relation'], choices)
    if max_score > 70:
        if closest_relation == 'fils':
            return 'homme'
        else:
            return 'femme'
    else:
        return row['prenom_pred']
    
transcriptions['prenom_pred'] = transcriptions.apply(change_relation, axis=1)



In [108]:
accuracy = sum(1 for x, y in zip(transcriptions['prediction_sex'], transcriptions['sex']) if x == y) / len(transcriptions)

print(f"La précision du modèle en rajoutant la relation est de : {round(100*accuracy,2)} %")

La précision du modèle en rajoutant la relation est de : 89.21 %
