In [1]:
import os

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot drift apart when the model is swapped.
MODEL_NAME = "Jean-Baptiste/camembert-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# NER pipeline shared by the rest of the notebook. Downstream code reads the
# "entity_group", "start" and "end" keys of each returned span.
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Device set to use mps:0


In [27]:
# Location of the TSV export to anonymize. Set the ZLV_EXPORT_PATH environment
# variable to run the notebook on another machine; without it the original
# local path is used, so existing behavior is unchanged.
path = os.environ.get("ZLV_EXPORT_PATH", '/Users/raphaelcourivaud/dev/zlv/export.tsv')

In [32]:
import pandas as pd
import re
from tqdm import tqdm
# Register tqdm's pandas integration; the notebook later calls
# `progress_apply` on a DataFrame column to get a progress bar.
tqdm.pandas()

In [33]:
# Load the tab-separated export into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path, delimiter="\t")

In [38]:
def _predict_and_replace(x: str):
    results = nlp(x)
    anonymized_text = x
    for span in reversed(results):
        end_at = span["end"]
        start_at = span["start"]
        tag = span["entity_group"]
        if tag == 'PER':
            replacement = "<NOM>" # fake.name()
        elif tag == 'LOC':
            replacement = "<VILLE>" # fake.city()
        elif tag == 'ORG':
            replacement = "<ENTREPRISE>" # fake.city()
        else:
            continue
        anonymized_text = anonymized_text[:start_at] +" " +  replacement + " " + anonymized_text[end_at:]
            
    return anonymized_text.strip().replace("  ", " ")

In [39]:
def _clean_phone_and_email(x: str):
    phone_regex = r"(?:(?:(?:\+|00)33[ ]?(?:\(0\)[ ]?)?)|0){1}[1-9]{1}([ .-]?)(?:\d{2}\1?){3}\d{2}"
    email_regex = r"[\w\.]+@([\w-]+\.)+[\w-]{2,4}"
    
    result = re.sub(phone_regex, '<TELEPHONE>', x)
    result = re.sub(email_regex, '<EMAIL>', result)
    return result


In [42]:
def process(x: str):
    """Anonymize a single note.

    Phone numbers and e-mail addresses are masked first with regular
    expressions, then the result goes through the NER-based replacement.

    Args:
        x: Raw note text. Non-string values (for example the NaN that pandas
            uses for empty cells) are returned unchanged instead of raising
            a ``TypeError`` in the regex step.

    Returns:
        The anonymized text, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    return _predict_and_replace(_clean_phone_and_email(x))

# Quick manual check: the phone number and the e-mail address should both
# come back replaced by their placeholders.
process("test 0615408741 r.courivaud@gmail.com")

'test <TELEPHONE> <EMAIL>'

In [43]:
# Anonymize every note with a progress bar; the recorded run took about
# 14 minutes for 50,000 rows.
df["anonymized_text"] = df["Notes__content"].progress_apply(process)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [13:56<00:00, 59.80it/s]


In [45]:
# Copy the whole frame to the system clipboard as tab-separated text.
# NOTE(review): this exports every column of `df`, including the raw
# `Notes__content` next to `anonymized_text` — confirm that is intended.
df.to_clipboard(sep="\t")

In [59]:
# Notes starting with one of these prefixes are dropped when they are shorter
# than 200 characters; presumably they are system-generated log entries rather
# than free text — TODO confirm.
excluded_prefixes = (
    "Changement de statut",
    "Ajout dans une campagne",
    "Changement de propriétaire",
)

df_filtered = df
for prefix in excluded_prefixes:
    notes = df_filtered["Notes__content"]
    is_short_boilerplate = notes.str.startswith(prefix) & (notes.str.len() < 200)
    df_filtered = df_filtered[~is_short_boilerplate]



In [62]:
# Copy the filtered frame to the system clipboard as tab-separated text.
# NOTE(review): `df_filtered` only removes rows, so the raw `Notes__content`
# column is still exported here — confirm that is intended.
df_filtered.to_clipboard(sep="\t")

In [81]:
# Export only the anonymized notes, in random order and without the index.
# No random_state is passed, so the order differs on every run.
shuffled_notes = df_filtered["anonymized_text"].sample(frac=1)
shuffled_notes.to_clipboard(index=False, sep="\n")