# ENRON Dataset

In [1]:
import glob
from datasets import Dataset

datadir = '../Data/raw/maildir'
email_paths = glob.glob(f"{datadir}/**/all_documents/*")

print(len(email_paths))
print(email_paths[0])

dataset = Dataset.from_dict({'email_path':email_paths})

print(dataset)

128103
../Data/raw/maildir/scott-s/all_documents/628.
Dataset({
    features: ['email_path'],
    num_rows: 128103
})


### 处理数据

In [2]:
import os
os.environ['https_proxy'] = 'http://10.14.30.39:7890'
import email
from email import policy
from email.parser import BytesParser

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorResult, OperatorConfig
from presidio_anonymizer.operators import Decrypt
from presidio_anonymizer.entities import (
    ConflictResolutionStrategy,
    EngineResult,
    OperatorConfig,
    RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpEngineProvider

import sys
sys.path.append('..')
from Data.utils import save_json

import copy

configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": "en_core_web_lg"}],
}

provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine_with_spanish = provider.create_engine()

analyzer = AnalyzerEngine(default_score_threshold=0.85,nlp_engine=nlp_engine_with_spanish, 
    supported_languages=["en", "es"])
anonymizer = AnonymizerEngine()



def presidio_pii(seq,analyzer,anonymizer):
    # print(example)
    # ,entities=['PERSON', 'PHONE_NUMBER', 'DATE_TIME', 'LOCATION', 'EMAIL_ADDRESS', 'NRP']
    analyze_result = analyzer.analyze(text=seq,language='en',entities=['PERSON', 'PHONE_NUMBER', 'DATE_TIME', 'LOCATION', 'EMAIL_ADDRESS', 'NRP'])

    op_dict = {}
    pii_mask_idx = []
    pii_mask = []
    pii_dict = {} # pii_type:[pii1,pii2]
    masked_seq = copy.deepcopy(seq)
    for az in analyze_result:
        pii = seq[az.start:az.end]
        pii_type = az.entity_type
        if pii_type not in pii_dict.keys():
            pii_dict[pii_type] = [pii]
        else:
            if pii not in pii_dict[pii_type]:
                pii_dict[pii_type].append(pii)
        idx = pii_dict[pii_type].index(pii)
        pii_type_new = pii_type+'-'+str(idx)
        masked_seq = masked_seq.replace(pii,f'[{pii_type_new}]')
        
        # print(az)
        pii_mask_idx.append({'value':pii,'label':pii_type_new,'start':az.start,'end':az.end})
        pii_mask.append((pii,pii_type_new))
        # operators={az.entity_type: OperatorConfig("replace", {az.entity_type:az.entity_type})}
        # op_dict[pii_type_new] = OperatorConfig("replace", {"new_value": f'[{pii_type_new}]'})
    # print(analyze_result)
    # anonymizer_result =  anonymizer.anonymize(text=seq,analyzer_results=analyze_result,operators=op_dict)
    pii_mask = set(pii_mask)
    pii_mask = [{'value':pm[0],'label':pm[1]} for pm in pii_mask]
    # return anonymizer_result.text,pii_mask
    return masked_seq,pii_mask,pii_mask_idx
    # print(anonymizer_result)


# 读取一个邮件文件
def read_email(file_path):
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    return msg

# 解析邮件内容
def parse_email(msg):
    email_data = {
        "Message-ID": msg.get("Message-ID"),
        "Date": msg.get("Date"),
        "From": msg.get("From"),
        "To": msg.get("To"),
        "Subject": msg.get("Subject"),
        "Body": msg.get_body(preferencelist=('plain')).get_content() if msg.get_body(preferencelist=('plain')) else None
    }
    return email_data

def proc_email(example):
    path = example['email_path']
    msg = read_email(path)
    data = parse_email(msg)
    body = data['Body']
    data['len'] = len(body)
    return data

def proc_pii(example):
    body = example['Body']
    data = {}
    data['unmask_seq'] = body
    masked_seq,pii_mask,pii_mask_idx = presidio_pii(body,analyzer,anonymizer)
    # print(masked_seq)
    data['masked_seq'] = masked_seq
    data['pii_mask_idx'] = pii_mask_idx
    data['pii_mask'] = pii_mask
    return data

def filter_conflict(example):
    pii_mask = example['pii_mask']
    flag = 0
    pii_dict = {}
    for pm in pii_mask:
        pii_type = pm['label']
        pii = pm['value']
        if pii_type not in pii_dict:
            pii_dict[pii_type] = pii
        else:
            if pii_dict[pii_type] != pii:
                flag=1
    return flag==0

def proc_pii_privacy(example):
    pii_mask = example['pii_mask']
    new_pii_mask = []
    pii_dict = {} # pii_type:[pii1,pii2]
    for pm in pii_mask:
        pii_type = pm['label']
        pii = pm['value']
        if pii_type not in pii_dict.keys():
            pii_dict[pii_type] = [pii]
        else:
            if pii not in pii_dict[pii_type]:
                pii_dict[pii_type].append(pii)

        idx = pii_dict[pii_type].index(pii)
        pm['label'] = pm['label']+'-'+str(idx)
    
    return example

def fliter_pii_too_much(example):
    # 句子中90%单词不能是pii
    len_unmask_seq = len(example['masked_seq'].split(' '))
    len_pii_mask = len(example['pii_mask_idx'])
    # 一句话中一种PII不应该超过三种以上
    return len_pii_mask <= 0.1*len_unmask_seq


# 处理获得email数据
dataset_proc = dataset.map(proc_email,num_proc=48)
print(dataset_proc)
# 过滤email字符串长度超过1500的
dataset_proc = dataset_proc.filter(lambda x: x['len']<1500,num_proc=48)
print(dataset_proc)
# 处理经过presidio_pii处理得到的masked_seq unmask_seq pii_mask
dataset_proc = dataset_proc.map(proc_pii,num_proc=48)
print(dataset_proc)
# 过滤没有pii的邮件
dataset_proc = dataset_proc.filter(lambda x: len(x['pii_mask'])>0, num_proc=48)
print(dataset_proc)
# 过滤pii有歧义的邮件
# dataset_proc = dataset_proc.filter(filter_conflict, num_proc=48)
# print(dataset_proc)
# 为句子中每个不同的pii都分配一个类别
# dataset_proc = dataset_proc.map(proc_pii_privacy,num_proc=48)
# print(dataset_proc)
# 过滤句子中90%单词不能是pii
dataset_proc = dataset_proc.filter(fliter_pii_too_much,num_proc=48)
print(dataset_proc)

# 保存原始数据
# path = '../Data/raw/enron.json'
# dataset_proc.to_json(path)


Map (num_proc=48):   0%|          | 0/128103 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len'],
    num_rows: 128103
})


Filter (num_proc=48):   0%|          | 0/128103 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len'],
    num_rows: 91672
})


Map (num_proc=48):   0%|          | 0/91672 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len', 'unmask_seq', 'masked_seq', 'pii_mask_idx', 'pii_mask'],
    num_rows: 91672
})


Filter (num_proc=48):   0%|          | 0/91672 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len', 'unmask_seq', 'masked_seq', 'pii_mask_idx', 'pii_mask'],
    num_rows: 82137
})


Filter (num_proc=48):   0%|          | 0/82137 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len', 'unmask_seq', 'masked_seq', 'pii_mask_idx', 'pii_mask'],
    num_rows: 45098
})


In [3]:
def filter_pii_type_too_much(example):
    flag = 0
    pii_mask=example['pii_mask']
    # print(set(pii_mask))
    for pm in pii_mask:
        label = pm['label']
        if '4' in label:
            flag=1
    return flag==0 and len(pii_mask)<7

dataset_proc1 = dataset_proc.filter(filter_pii_type_too_much,num_proc=48)
print(dataset_proc1)



Filter (num_proc=48):   0%|          | 0/45098 [00:00<?, ? examples/s]

Dataset({
    features: ['email_path', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Body', 'len', 'unmask_seq', 'masked_seq', 'pii_mask_idx', 'pii_mask'],
    num_rows: 31946
})


In [4]:
pii_type_dict ={}
pii_num={}
for dp in dataset_proc1:
    pii_mask = dp['pii_mask']
    len_pii_mask = len(pii_mask)
    if len_pii_mask not in pii_num.keys():
        pii_num[len_pii_mask]=1
    else:
        pii_num[len_pii_mask]+=1


    for pm in pii_mask:
        label = pm['label'].split('-')[0]
        if label not in pii_type_dict.keys():
            pii_type_dict[label]=1
        else:
            pii_type_dict[label]+=1
        


In [5]:
pii_num

{1: 8441, 2: 6908, 3: 5600, 5: 3557, 4: 4903, 6: 2537}

In [6]:
pii_type_dict

{'PERSON': 42979,
 'DATE_TIME': 31564,
 'LOCATION': 10696,
 'EMAIL_ADDRESS': 3823,
 'NRP': 2614}

### 划分A和B并保存数据

In [7]:
import sys
sys.path.append('..')
from Data.utils import save_json
dataset_AB = dataset_proc1.train_test_split(test_size=0.5,seed=42)

path = '../Data/raw/phishing/enron.json'

save_json({'A':dataset_AB['train'].to_list(),'B':dataset_AB['test'].to_list()},path)

In [10]:
pii_dict = {}
for pm in dataset_proc1['pii_mask']:
    for p in pm:
        label = p['label'].split("-")[0]
        value = p['value']
        if label not in pii_dict.keys():
            pii_dict[label] = 1
        else:
            pii_dict[label] +=1
pii_dict

{'PERSON': 42979,
 'DATE_TIME': 31564,
 'LOCATION': 10696,
 'EMAIL_ADDRESS': 3823,
 'NRP': 2614}

## Persidio Try


In [1]:
import os
os.environ['https_proxy'] = 'http://10.14.30.39:7890'

import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

transformers_model = "obi/deid_roberta_i2b2" # e.g. "obi/deid_roberta_i2b2"

snapshot_download(repo_id=transformers_model)

# Instantiate to make sure it's downloaded during installation and not runtime
AutoTokenizer.from_pretrained(transformers_model)
AutoModelForTokenClassification.from_pretrained(transformers_model)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]



RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
    

In [20]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

# Create configuration containing engine name and models
conf_file = './parser.yaml'

# Create NLP engine based on configuration
provider = NlpEngineProvider(conf_file=conf_file)
nlp_engine = provider.create_engine()

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine, 
    supported_languages=["en"]
)

results_english = analyzer.analyze(text="My name is Morris", language="en")
print(results_english)


[38;5;1m✘ No compatible package found for 'urchade/gliner_multi_pii-v1' (spaCy
v3.7.6)[0m



SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## GLiNER 

In [19]:
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

text = """
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaldu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's Player of the Year Awards, and four European Golden Shoes, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, goals in the European Championship (14), international goals (128) and international appearances (205). He is one of the few players to have made over 1,200 professional career appearances, the most by an outfield player, and has scored over 850 official senior career goals for club and country, making him the top goalscorer of all time.
"""

labels = ["person", "award", "date", "competitions", "teams"]

entities = model.predict_entities(text,labels)

for entity in entities:
    # print(entity["text"], "=>", entity["label"])
    print(entities)
    break


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'start': 1, 'end': 18, 'text': 'Cristiano Ronaldo', 'label': 'person', 'score': 0.8923935294151306}, {'start': 92, 'end': 107, 'text': '5 February 1985', 'label': 'date', 'score': 0.9758335947990417}, {'start': 233, 'end': 255, 'text': 'Portugal national team', 'label': 'teams', 'score': 0.7006867527961731}, {'start': 317, 'end': 324, 'text': 'Ronaldo', 'label': 'person', 'score': 0.5729479193687439}, {'start': 338, 'end': 356, 'text': "Ballon d'Or awards", 'label': 'award', 'score': 0.6336824893951416}, {'start': 381, 'end': 417, 'text': "UEFA Men's Player of the Year Awards", 'label': 'award', 'score': 0.8929407000541687}, {'start': 428, 'end': 449, 'text': 'European Golden Shoes', 'label': 'award', 'score': 0.8832416534423828}, {'start': 556, 'end': 578, 'text': 'UEFA Champions Leagues', 'label': 'competitions', 'score': 0.7971668243408203}, {'start': 584, 'end': 610, 'text': 'UEFA European Championship', 'label': 'competitions', 'score': 0.9217649698257446}, {'start': 619, 'end':

In [1]:
dd = 'PERSON, organization, phone number, address, passport number, email, credit card number, social security number, health insurance id number, date of birth, mobile phone number, bank account number, medication, cpf, driver\'s license number, tax identification number, medical condition, identity card number, national id number, ip address, email address, iban, credit card expiration date, username, health insurance number, registration number, student id number, insurance number, flight number, landline phone number, blood type, cvv, reservation number, digital signature, social media handle, license plate number, cnpj, postal code, passport_number, serial number, vehicle registration number, credit card brand, fax number, visa number, insurance company, identity document number, transaction number, national health insurance number, cvc, birth certificate number, train ticket number, passport expiration date,social_security_number'.replace(', ','|').split('|')
dd
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

text = ["""
Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est harilala.rasoanaivo@telma.mg. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
""","""
Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est harilala.rasoanaivo@telma.mg. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
""",
]
labels = dd
# labels = ["work", "booking number", "personally identifiable information", "driver licence", "person", "book", "full address", "company", "actor", "character", "email", "passport number", "Social Security Number", "phone number"]
# entities = model.predict_entities(text, labels)
entities = model.batch_predict_entities(text, labels)
print(entities)
for entity in entities:
    print(entity["text"], "=>", entity["label"])
    

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[[{'start': 1, 'end': 20, 'text': 'Harilala Rasoanaivo', 'label': 'PERSON', 'score': 0.9991864562034607}, {'start': 106, 'end': 128, 'text': 'Rasoanaivo Enterprises', 'label': 'organization', 'score': 0.9995230436325073}, {'start': 133, 'end': 159, 'text': 'Lot II M 92 Antohomadinika', 'label': 'address', 'score': 0.9411612749099731}, {'start': 179, 'end': 196, 'text': '+261 32 22 345 67', 'label': 'phone number', 'score': 0.9696452021598816}, {'start': 230, 'end': 258, 'text': 'harilala.rasoanaivo@telma.mg', 'label': 'email', 'score': 0.9944571256637573}, {'start': 291, 'end': 302, 'text': '501-02-1234', 'label': 'social security number', 'score': 0.9772676825523376}], [{'start': 1, 'end': 20, 'text': 'Harilala Rasoanaivo', 'label': 'PERSON', 'score': 0.9991864562034607}, {'start': 106, 'end': 128, 'text': 'Rasoanaivo Enterprises', 'label': 'organization', 'score': 0.9995230436325073}, {'start': 133, 'end': 159, 'text': 'Lot II M 92 Antohomadinika', 'label': 'address', 'score': 0.9411

TypeError: list indices must be integers or slices, not str

In [7]:
import spacy
from gliner_spacy.pipeline import GlinerSpacy

# Configuration for GLiNER integration
custom_spacy_config = {
    "gliner_model": "urchade/gliner_multi_pii-v1",
    # "chunk_size": 250,
    "labels": ["person", "organization", "email"],
    "style": "ent",
    "threshold": 0.8,
    "map_location": "cuda" # only available in v.0.0.7
}

# Initialize a blank English spaCy pipeline and add GLiNER
nlp = spacy.blank("en")
nlp.add_pipe("gliner_spacy", config=custom_spacy_config)


# Example text for entity detection
text = '''Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est harilala.rasoanaivo@telma.mg. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
'''

# Process the text with the pipeline
doc = nlp(text)

# Output detected entities
for ent in doc.ents:
    print(ent.text, ent.label_, ent._.score) # ent._.score only available in v. 0.0.7

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Harilala Rasoanaivo person 0.9999170303344727
Rasoanaivo Enterprises organization 0.9993711113929749
harilala.rasoanaivo@telma.mg email 0.999891996383667
