In [2]:
pip install faker

Collecting faker
  Downloading Faker-33.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.1.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.9 MB[0m [31m23.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.1.0


In [7]:
from faker import Faker
from faker.providers import BaseProvider
from datetime import datetime, timedelta
import csv
import random
import string

# Custom Providers for all PHI elements
class NameProvider(BaseProvider):
    def person_name(self):
        patterns = [
            lambda: fake.name(),
            lambda: f"Dr. {fake.name()}",
            lambda: f"Patient: {fake.name()}",
            lambda: f"Name: {fake.name()}"
        ]
        return random.choice(patterns)()

class EmailProvider(BaseProvider):
    def email_address(self):
        patterns = [
            lambda: fake.free_email(),
            lambda: f"{fake.free_email()}; {fake.free_email()}",
            lambda: f"Email: {fake.free_email()}",
            lambda: f"Contact: {fake.free_email()}",
            lambda: f"Primary: {fake.free_email()}; Secondary: {fake.free_email()}"
        ]
        return random.choice(patterns)()

class SSNProvider(BaseProvider):
    def ssn_number(self):
        patterns = [
            lambda: fake.ssn(),
            lambda: f"SSN: {fake.ssn()}",
            lambda: f"Social Security: {fake.ssn()}",
            lambda: f"SSN: XXX-XX-{fake.numerify('####')}"
        ]
        return random.choice(patterns)()

class MRNProvider(BaseProvider):
    def medical_record_number(self):
        patterns = [
            lambda: f"MR{random.randint(100000000, 999999999)}",
            lambda: f"{random.randint(100000, 999999)}MRN",
            lambda: f"MRN{str(random.randint(1, 9999999)).zfill(7)}",
            lambda: f"H-{random.randint(100, 999)}-{random.randint(10, 99)}-{random.randint(1000, 9999)}",
            lambda: f"{random.randint(2000, 2025)}A{random.randint(100000, 999999)}",
            lambda: f"R{random.randint(100000000, 999999999)}",
            lambda: f"Medical Record: {random.randint(100000000, 999999999)}",
        ]
        return random.choice(patterns)()

class MBIProvider(BaseProvider):
    def medicare_beneficiary_identifier(self):
        valid_alpha = 'ACDEFGHJKMNPQRTUVWXY'
        valid_alphanum = '0123456789' + valid_alpha

        p1 = str(random.randint(1, 9))
        p2 = random.choice(valid_alpha)
        p3 = random.choice(valid_alphanum)
        p4 = str(random.randint(0, 9))
        p5 = random.choice(valid_alpha)
        p6 = random.choice(valid_alphanum)
        p7 = str(random.randint(0, 9))
        p8 = random.choice(valid_alpha)
        p9 = random.choice(valid_alpha)
        p10 = str(random.randint(0, 9))
        p11 = str(random.randint(0, 9))

        mbi = f"{p1}{p2}{p3}{p4}{p5}{p6}{p7}{p8}{p9}{p10}{p11}"
        patterns = [
            lambda: mbi,
            lambda: f"{mbi[:4]}-{mbi[4:7]}-{mbi[7:]}",
            lambda: f"MBI: {mbi}",
            lambda: f"Medicare ID: {mbi}",
            lambda: f"Beneficiary ID: {mbi[:4]}-{mbi[4:7]}-{mbi[7:]}"
        ]
        return random.choice(patterns)()

class PhoneProvider(BaseProvider):
    def phone_number(self):
        patterns = [
            lambda: fake.numerify("###-###-####"),
            lambda: fake.numerify("+1 (###) ###-####"),
            lambda: fake.numerify("(###) ###-####"),
            lambda: f"Mobile: {fake.numerify('###-###-####')}",
            lambda: f"Phone: {fake.numerify('+1 (###) ###-####')}",
            lambda: f"Work: {fake.numerify('(###) ###-####')}",
            lambda: f"Home: {fake.numerify('###-###-####')}",
            lambda: f"Mobile: {fake.numerify('+1 (###) ###-####')}; Work: {fake.numerify('(###) ###-####')}"
        ]
        return random.choice(patterns)()

class GeographicalProvider(BaseProvider):
    def geographical_identifier(self):
        patterns = [
            lambda: f"{random.randint(10000, 99999)}",
            lambda: f"{random.randint(10000, 99999)}-{random.randint(1000, 9999)}",
            lambda: f"ZIP: {random.randint(10000, 99999)}",
            lambda: f"{fake.city()} County",
            lambda: f"County: {fake.city()} County",
            lambda: f"{random.randint(1, 9999)} {fake.street_name()}",
            lambda: f"{random.randint(1, 9999)} {fake.street_name()}, {fake.city()}",
            lambda: f"Address: {random.randint(1, 9999)} {fake.street_name()}",
            lambda: f"Address: {random.randint(1, 9999)} {fake.street_name()}, {fake.city()}"
        ]
        return random.choice(patterns)()

class AccountNumberProvider(BaseProvider):
    def account_number(self):
        patterns = [
            lambda: f"ACCT#{random.randint(10000000, 99999999)}",
            lambda: f"{random.randint(100000000000, 999999999999)}",
            lambda: f"BANK-{random.randint(1000, 9999)}-{random.randint(10000, 99999)}",
            lambda: f"HAC{random.randint(100000, 999999)}",
            lambda: f"PATIENT#{random.randint(10000, 99999)}",
            lambda: f"INS-{random.choice(string.ascii_uppercase)}{random.randint(1000000, 9999999)}",
            lambda: f"Account: {random.randint(100000, 999999)}"
        ]
        return random.choice(patterns)()

class DateProvider(BaseProvider):
    def date(self):
        # Generate dates from 1950 to 2024
        start_date = datetime(1950, 1, 1)
        end_date = datetime(2024, 12, 31)
        time_between_dates = end_date - start_date
        days_between_dates = time_between_dates.days
        random_number_of_days = random.randrange(days_between_dates)
        random_date = start_date + timedelta(days=random_number_of_days)

        patterns = [
            lambda: random_date.strftime("%Y-%m-%d"),
            lambda: random_date.strftime("%m/%d/%Y"),
            lambda: f"DOB: {random_date.strftime('%Y-%m-%d')}",
            lambda: f"Date: {random_date.strftime('%m/%d/%Y')}",
            lambda: random_date.strftime("%B %d, %Y"),
            lambda: f"{random.choice(['Birth', 'Record', 'Event', 'Admission'])} Date: {random_date.strftime('%Y-%m-%d')}"
        ]
        return random.choice(patterns)()
class MedicalTermProvider(BaseProvider):
    def medical_context(self):
        # Various medical-related contexts and phrases
        contexts = [
            # Diagnoses
            lambda: f"{random.choice(['Primary', 'Secondary', 'Preliminary'])} Diagnosis: {self._generate_diagnosis()}",

            # Conditions
            lambda: f"Condition: {self._generate_condition()}",

            # Treatment notes
            lambda: f"{random.choice(['Treatment', 'Procedure', 'Intervention'])}: {self._generate_treatment()}",

            # Medication references
            lambda: f"Medication: {self._generate_medication()}",

            # Vital signs
            lambda: f"{random.choice(['Blood Pressure', 'Heart Rate', 'Temperature'])}: {self._generate_vital_sign()}",

            # Medical status
            lambda: f"Status: {self._generate_status()}",

            # Department/Ward
            lambda: f"Department: {self._generate_department()}",

            # Severity or stage
            lambda: f"{random.choice(['Severity', 'Stage'])}: {self._generate_severity()}"
        ]
        return random.choice(contexts)()

    def _generate_diagnosis(self):
        prefixes = ['Acute', 'Chronic', 'Recurrent', 'Suspected', 'Confirmed']
        systems = ['Cardiovascular', 'Respiratory', 'Neurological', 'Gastrointestinal', 'Endocrine']
        conditions = ['Disorder', 'Syndrome', 'Disease', 'Condition']
        return f"{random.choice(prefixes)} {random.choice(systems)} {random.choice(conditions)}"

    def _generate_condition(self):
        conditions = [
            'Hypertension', 'Diabetes', 'Arthritis', 'Asthma', 'Pneumonia',
            'Heart Failure', 'Kidney Disease', 'Liver Cirrhosis', 'Multiple Sclerosis',
            'Parkinson\'s Disease', 'Alzheimer\'s', 'Epilepsy', 'Cancer Stage II',
            'Chronic Obstructive Pulmonary Disease', 'Coronary Artery Disease'
        ]
        return random.choice(conditions)

    def _generate_treatment(self):
        treatments = [
            'Chemotherapy', 'Surgical Intervention', 'Physical Therapy',
            'Radiation Treatment', 'Dialysis', 'Cardiac Rehabilitation',
            'Cognitive Behavioral Therapy', 'Orthopedic Surgery',
            'Endoscopic Procedure', 'MRI Scan', 'CT Scan', 'Biopsy'
        ]
        return random.choice(treatments)

    def _generate_medication(self):
        medications = [
            'Metformin 500mg', 'Lisinopril 10mg', 'Atorvastatin 40mg',
            'Levothyroxine 75mcg', 'Metoprolol 25mg', 'Insulin Glargine',
            'Aspirin 81mg', 'Warfarin 5mg', 'Omeprazole 20mg',
            'Albuterol Inhaler', 'Prednisone 10mg'
        ]
        return random.choice(medications)

    def _generate_vital_sign(self):
        def blood_pressure():
            return f"{random.randint(90, 180)}/{random.randint(60, 120)} mmHg"

        def heart_rate():
            return f"{random.randint(60, 100)} bpm"

        def temperature():
            return f"{round(random.uniform(97.0, 99.5), 1)}°F"

        return random.choice([blood_pressure, heart_rate, temperature])()

    def _generate_status(self):
        statuses = [
            'Stable', 'Critical', 'Improving', 'Deteriorating', 'Recovering',
            'Observation', 'Emergency', 'Outpatient', 'Inpatient'
        ]
        return random.choice(statuses)

    def _generate_department(self):
        departments = [
            'Cardiology', 'Neurology', 'Oncology', 'Emergency', 'Pediatrics',
            'Orthopedics', 'Intensive Care', 'Psychiatry', 'Endocrinology',
            'Gastroenterology', 'Pulmonology'
        ]
        return random.choice(departments)

    def _generate_severity(self):
        severities = [
            'Mild', 'Moderate', 'Severe', 'Critical', 'Stage I',
            'Stage II', 'Stage III', 'Early Onset', 'Advanced'
        ]
        return random.choice(severities)

# Initialize Faker and add all providers
fake = Faker()
fake.add_provider(NameProvider)
fake.add_provider(EmailProvider)
fake.add_provider(SSNProvider)
fake.add_provider(MRNProvider)
fake.add_provider(MBIProvider)
fake.add_provider(PhoneProvider)
fake.add_provider(GeographicalProvider)
fake.add_provider(AccountNumberProvider)
fake.add_provider(DateProvider)
fake.add_provider(MedicalTermProvider)

# Number of examples to generate
n = 10000

# Generate all data
data = {
    "Names": [fake.person_name() for _ in range(n)],
    "Email Addresses": [fake.email_address() for _ in range(n)],
    "Social Security Numbers": [fake.ssn_number() for _ in range(n)],
    "Medical Record Numbers": [fake.medical_record_number() for _ in range(n)],
    "Medicare Beneficiary IDs": [fake.medicare_beneficiary_identifier() for _ in range(n)],
    "Phone Numbers": [fake.phone_number() for _ in range(n)],
    "Geographic Identifiers": [fake.geographical_identifier() for _ in range(n)],
    "Account Numbers": [fake.account_number() for _ in range(n)],
    "Dates": [fake.date() for _ in range(n)],
    "Medical Contexts": [fake.medical_context() for _ in range(n)]
}

# Write to CSV
with open("synthetic_phi.csv", "w", newline="", encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=data.keys())
    writer.writeheader()
    writer.writerows([dict(zip(data.keys(), values)) for values in zip(*data.values())])

print("Generated 10,000 synthetic PHI records and saved to synthetic_phi.csv!")

Generated 10,000 synthetic PHI records and saved to synthetic_phi.csv!


In [8]:
import pandas as pd

synthetic_data = pd.read_csv('synthetic_phi.csv')
synthetic_data.head()

Unnamed: 0,Names,Email Addresses,Social Security Numbers,Medical Record Numbers,Medicare Beneficiary IDs,Phone Numbers,Geographic Identifiers,Account Numbers,Dates,Medical Contexts
0,Patient: Jesse Cain,Email: davidhale@yahoo.com,Social Security: 551-50-9970,R579632766,9P74KD1VN41,Phone: +1 (571) 796-4556,County: North Bobtown County,HAC516134,DOB: 2010-01-18,Heart Rate: 86 bpm
1,Dr. Joshua Cummings,Contact: annemiller@yahoo.com,Social Security: 311-93-3439,647161MRN,Beneficiary ID: 9HA7-KC8-NG92,824-962-0545,59223-3892,INS-H9451640,Birth Date: 2002-05-29,Heart Rate: 97.0°F
2,Krista Melton,Contact: pcarlson@yahoo.com,758-11-2574,MR557100350,MBI: 8JW6CU9PC41,Home: 715-983-3774,"5086 Lewis Grove, New Jameschester",ACCT#72971707,Date: 06/24/1958,Preliminary Diagnosis: Suspected Gastrointesti...
3,Name: Pamela Allen,Primary: timothyramirez@gmail.com; Secondary: ...,Social Security: 718-07-0975,2019A228520,Medicare ID: 4DU3T05XF71,(994) 241-0313,Address: 7133 Jennifer Wall,PATIENT#23162,1974-06-26,Procedure: Surgical Intervention
4,Patient: Daniel Rodriguez,tammywalker@hotmail.com; xtate@hotmail.com,SSN: XXX-XX-3311,2002A169797,Beneficiary ID: 2DU2-D39-DN30,+1 (634) 008-4243,ZIP: 93071,218634445376,05/22/1955,Status: Observation


In [9]:
merged_data = pd.melt(
    synthetic_data,
    var_name="Tag",  # Column to store the original column names (NER tags)
    value_name="Identifier"  # Column to store the identifier values
)

In [10]:
merged_data

Unnamed: 0,Tag,Identifier
0,Names,Patient: Jesse Cain
1,Names,Dr. Joshua Cummings
2,Names,Krista Melton
3,Names,Name: Pamela Allen
4,Names,Patient: Daniel Rodriguez
...,...,...
99995,Medical Contexts,Department: Pulmonology
99996,Medical Contexts,Department: Psychiatry
99997,Medical Contexts,Status: Observation
99998,Medical Contexts,Status: Observation


In [11]:
# Define a mapping of old column names to new NER tags
tag_mapping = {
    "Names": "PERSON",
    "Email Addresses": "EMAIL",
    "Social Security Numbers": "SSN",
    "Medical Record Numbers": "MRN",
    "Medicare Beneficiary IDs": "MBI",
    "Phone Numbers": "PHONE",
    "Geographic Identifiers": "ADDRESS",
    "Account Numbers": "ACCOUNT",
    "Dates": "DATE",
    "Medical Contexts": "OTHER"
}

# Replace the tags in the transformed dataset
merged_data["Tag"] = merged_data["Tag"].map(tag_mapping)

In [12]:
merged_data

Unnamed: 0,Tag,Identifier
0,PERSON,Patient: Jesse Cain
1,PERSON,Dr. Joshua Cummings
2,PERSON,Krista Melton
3,PERSON,Name: Pamela Allen
4,PERSON,Patient: Daniel Rodriguez
...,...,...
99995,OTHER,Department: Pulmonology
99996,OTHER,Department: Psychiatry
99997,OTHER,Status: Observation
99998,OTHER,Status: Observation


In [13]:
# Shuffle the dataset to randomize the rows
shuffled_data = merged_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the shuffled dataset
shuffled_data.head()


Unnamed: 0,Tag,Identifier
0,ACCOUNT,PATIENT#92450
1,DATE,1968-07-09
2,EMAIL,Email: brenda37@hotmail.com
3,ACCOUNT,Account: 300288
4,OTHER,Primary Diagnosis: Acute Neurological Condition


In [14]:
# Save the shuffled data to a new CSV file
shuffled_data.to_csv("synthetic_phi_NER.csv", index=False)
print("Shuffled data saved to synthetic_phi_NER.csv")

Shuffled data saved to synthetic_phi_NER.csv
