In [13]:
import random
import csv
import json

# Australian suburbs by state
LOCATIONS = {
    'VIC': ['Melbourne', 'Geelong', 'Ballarat', 'Bendigo', 'Shepparton', 'Ringwood', 'Frankston', 'Dandenong', 'Box Hill', 'Preston'],
    'NSW': ['Sydney', 'Newcastle', 'Wollongong', 'Parramatta', 'Penrith', 'Liverpool', 'Blacktown', 'Bankstown', 'Campbelltown', 'Gosford'],
    'QLD': ['Brisbane', 'Gold Coast', 'Townsville', 'Cairns', 'Toowoomba', 'Rockhampton', 'Mackay', 'Bundaberg', 'Hervey Bay', 'Southport'],
    'SA': ['Adelaide', 'Mount Gambier', 'Whyalla', 'Mount Barker', 'Murray Bridge', 'Port Adelaide', 'Glenelg', 'Modbury', 'Elizabeth', 'Salisbury'],
    'WA': ['Perth', 'Mandurah', 'Bunbury', 'Kalgoorlie', 'Geraldton', 'Albany', 'Fremantle', 'Joondalup', 'Rockingham', 'Armadale'],
    'TAS': ['Hobart', 'Launceston', 'Devonport', 'Burnie', 'Kingston', 'Ulverstone', 'Glenorchy', 'Clarence', 'Sorell', 'Brighton'],
    'ACT': ['Canberra', 'Belconnen', 'Tuggeranong', 'Woden', 'Gungahlin', 'Weston Creek', 'Civic', 'Braddon', 'Kingston', 'Manuka'],
    'NT': ['Darwin', 'Alice Springs', 'Palmerston', 'Katherine', 'Nhulunbuy', 'Casuarina', 'Nightcliff', 'Stuart Park', 'Parap', 'Rapid Creek']
}

PROFESSIONS = [
    {
        'name': 'Medical Practitioner',
        'divisions': [''],
        'registration_types': ['General', 'Specialist', 'Limited'],
        'specialties': ['General practice', 'Surgery', 'Paediatrics', 'Psychiatry', 'Anaesthetics', 
                       'Emergency medicine', 'Obstetrics and gynaecology', 'Cardiology', 'Radiology', 
                       'Dermatology', 'Oncology', 'Ophthalmology', 'Orthopaedic surgery'],
        'weight': 0.15
    },
    {
        'name': 'Nurse',
        'divisions': ['Registered nurse (Division 1)', 'Enrolled nurse (Division 2)', 'Nurse practitioner'],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.557
    },
    {
        'name': 'Pharmacist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.04
    },
    {
        'name': 'Dentist',
        'divisions': [''],
        'registration_types': ['General', 'Specialist', 'Limited'],
        'specialties': ['General dentistry', 'Oral and maxillofacial surgery', 'Orthodontics', 
                       'Periodontics', 'Prosthodontics', 'Paediatric dentistry', 'Endodontics'],
        'weight': 0.03
    },
    {
        'name': 'Physiotherapist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.05
    },
    {
        'name': 'Psychologist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': ['Clinical psychology', 'Counselling psychology', 'Educational and developmental psychology', 
                       'Organisational psychology', 'Forensic psychology', 'Health psychology'],
        'weight': 0.05
    },
    {
        'name': 'Occupational Therapist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.03
    },
    {
        'name': 'Midwife',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.02
    },
    {
        'name': 'Chiropractor',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.02
    },
    {
        'name': 'Optometrist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.02
    },
    {
        'name': 'Podiatrist',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.02
    },
    {
        'name': 'Osteopath',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.01
    },
    {
        'name': 'Paramedic',
        'divisions': [''],
        'registration_types': ['General', 'Non-practising'],
        'specialties': [''],
        'weight': 0.02
    }
]

FIRST_NAMES = {
    'male': ['James', 'Oliver', 'Jack', 'William', 'Thomas', 'Noah', 'Henry', 'Lucas', 'Liam', 
             'Alexander', 'George', 'Samuel', 'Benjamin', 'Matthew', 'Daniel', 'David', 'Andrew', 
             'Michael', 'Peter', 'John'],
    'female': ['Charlotte', 'Olivia', 'Amelia', 'Isla', 'Mia', 'Ava', 'Grace', 'Chloe', 'Sophie', 
               'Emily', 'Emma', 'Sarah', 'Jessica', 'Rebecca', 'Rachel', 'Jennifer', 'Lisa', 
               'Michelle', 'Amanda', 'Nicole']
}

LAST_NAMES = ['Smith', 'Jones', 'Williams', 'Brown', 'Wilson', 'Taylor', 'Johnson', 'White', 
              'Martin', 'Anderson', 'Thompson', 'Nguyen', 'Thomas', 'Walker', 'Harris', 'Lee', 
              'Ryan', 'Robinson', 'Kelly', 'King', 'Wong', 'Singh', 'Patel', 'Kumar', 'Chen', 
              "O'Brien", 'Murphy', 'Clarke', 'Davis', 'Miller']

TITLES = {
    'Medical Practitioner': ['Dr'],
    'Dentist': ['Dr'],
    'default': ['Mr', 'Mrs', 'Ms', 'Miss', 'Dr']
}


def select_profession():
    """Select a profession based on weights"""
    rand = random.random()
    cumulative = 0
    for profession in PROFESSIONS:
        cumulative += profession['weight']
        if rand <= cumulative:
            return profession
    return PROFESSIONS[0]


def generate_practitioner():
    """Generate a single practitioner record"""
    profession = select_profession()
    
    # Generate name
    gender = random.choice(['male', 'female'])
    first_name = random.choice(FIRST_NAMES[gender])
    last_name = random.choice(LAST_NAMES)
    
    profession_titles = TITLES.get(profession['name'], TITLES['default'])
    title = random.choice(profession_titles)
    
    # Generate professional details
    division = random.choice(profession['divisions'])
    registration_type = random.choice(profession['registration_types'])
    
    specialty = ''
    if profession['specialties'][0] != '' and (registration_type == 'Specialist' or random.random() > 0.7):
        specialty = random.choice(profession['specialties'])
    
    # Generate location
    state = random.choice(list(LOCATIONS.keys()))
    suburb = random.choice(LOCATIONS[state])
    postcode = random.randint(2000, 7999)
    location = f"{suburb}, {state}, {postcode}"
    
    return {
        'Practitioner Name': f"{title} {first_name} {last_name}",
        'Profession': profession['name'],
        'Division': division,
        'Registration Type': registration_type,
        'Specialty': specialty,
        'Location': location
    }


def generate_dataset(num_records=2000):
    """Generate a dataset of practitioners"""
    return [generate_practitioner() for _ in range(num_records)]


def save_to_csv(data, filename='ahpra_pseudo_data.csv'):
    """Save dataset to CSV"""
    if not data:
        return
    
    fieldnames = ['Practitioner Name', 'Profession', 'Division', 'Registration Type', 'Specialty', 'Location']
    
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    
    print(f"CSV saved to {filename}")


def save_to_json(data, filename='ahpra_pseudo_data.json'):
    """Save dataset to JSON"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    
    print(f"JSON saved to {filename}")

In [14]:
# Generate 100 records by default (change this number as needed)
num_records = 2000

print(f"Generating {num_records} pseudo AHPRA practitioner records...")
data = generate_dataset(num_records)

# Save to both formats
save_to_csv(data)
save_to_json(data)

# Print first 5 records as preview
print("\nPreview (first 5 records):")
print("-" * 100)
for i, record in enumerate(data[:5], 1):
    print(f"\n{i}. {record['Practitioner Name']}")
    print(f"   Profession: {record['Profession']}")
    print(f"   Division: {record['Division'] if record['Division'] else 'N/A'}")
    print(f"   Registration Type: {record['Registration Type']}")
    print(f"   Specialty: {record['Specialty'] if record['Specialty'] else 'N/A'}")
    print(f"   Location: {record['Location']}")

print(f"\n\nTotal records generated: {len(data)}")

Generating 2000 pseudo AHPRA practitioner records...
CSV saved to ahpra_pseudo_data.csv
JSON saved to ahpra_pseudo_data.json

Preview (first 5 records):
----------------------------------------------------------------------------------------------------

1. Dr Andrew Kelly
   Profession: Occupational Therapist
   Division: N/A
   Registration Type: Non-practising
   Specialty: N/A
   Location: Newcastle, NSW, 2138

2. Mr Daniel O'Brien
   Profession: Nurse
   Division: Registered nurse (Division 1)
   Registration Type: Non-practising
   Specialty: N/A
   Location: Port Adelaide, SA, 3617

3. Dr Lisa Chen
   Profession: Nurse
   Division: Registered nurse (Division 1)
   Registration Type: Non-practising
   Specialty: N/A
   Location: Melbourne, VIC, 2513

4. Ms Isla Murphy
   Profession: Physiotherapist
   Division: N/A
   Registration Type: Non-practising
   Specialty: N/A
   Location: Ballarat, VIC, 7902

5. Dr James Nguyen
   Profession: Nurse
   Division: Registered nurse (Divisio

In [21]:
# import pandas as pd
df = pd.read_csv("ahpra_pseudo_data.csv")
df["Specialty"].unique()

array([nan, 'Paediatrics', 'Surgery', 'Orthopaedic surgery',
       'General dentistry', 'Health psychology',
       'Educational and developmental psychology', 'Cardiology',
       'Forensic psychology', 'Emergency medicine', 'Oncology',
       'Psychiatry', 'Dermatology', 'General practice', 'Anaesthetics',
       'Clinical psychology', 'Ophthalmology', 'Orthodontics',
       'Obstetrics and gynaecology', 'Periodontics',
       'Counselling psychology', 'Radiology', 'Paediatric dentistry',
       'Endodontics', 'Organisational psychology', 'Prosthodontics',
       'Oral and maxillofacial surgery'], dtype=object)

In [28]:
df[df["Specialty"]=="Surgery"]

Unnamed: 0,Practitioner Name,Profession,Division,Registration Type,Specialty,Location
14,Dr Emily Davis,Medical Practitioner,,Specialist,Surgery,"Joondalup, WA, 6467"
404,Dr Lisa Williams,Medical Practitioner,,Limited,Surgery,"Bendigo, VIC, 7473"
433,Dr Noah Thompson,Medical Practitioner,,Specialist,Surgery,"Armadale, WA, 3850"
539,Dr Nicole Taylor,Medical Practitioner,,Limited,Surgery,"Gungahlin, ACT, 2512"
738,Dr Lisa Chen,Medical Practitioner,,Specialist,Surgery,"Campbelltown, NSW, 3606"
1146,Dr Michael Thompson,Medical Practitioner,,Specialist,Surgery,"Devonport, TAS, 6997"
1332,Dr Grace Harris,Medical Practitioner,,Specialist,Surgery,"Rockhampton, QLD, 3783"
1454,Dr Olivia Johnson,Medical Practitioner,,Limited,Surgery,"Salisbury, SA, 7030"
1729,Dr Sophie Anderson,Medical Practitioner,,Specialist,Surgery,"Frankston, VIC, 3500"
1776,Dr David Martin,Medical Practitioner,,Limited,Surgery,"Armadale, WA, 2867"


In [None]:
import json
import re

# Function to load keywords from the keywords JSON file
def load_keywords():
    with open('keywords.json', 'r') as file:
        data = json.load(file)
    return data["specialties"]

# Function to load the transcript from a text file
def load_transcript():
    with open('transcript.txt', 'r') as file:
        transcript = file.read()
    return transcript

# Function to highlight the keywords in the transcript
def highlight_keywords(transcript, keywords):
    # Go through each keyword and highlight it in the transcript
    for keyword in keywords:
        # Use regex to match the keyword case-insensitively and wrap it with a marker (e.g., **keyword**)
        transcript = re.sub(r'\b' + re.escape(keyword) + r'\b', f'**{keyword}**', transcript, flags=re.IGNORECASE)
    return transcript

# Function to save the highlighted transcript to a new file
def save_highlighted_transcript(highlighted_transcript):
    with open('highlighted_transcript.txt', 'w') as file:
        file.write(highlighted_transcript)
    print("Highlighted transcript saved to 'highlighted_transcript.txt'")

# Main function to run the process
def main():
    keywords = load_keywords()  # Load the list of specialties
    transcript = load_transcript()  # Load the transcript text
    
    # Highlight the keywords in the transcript
    highlighted_transcript = highlight_keywords(transcript, keywords)
    
    # Save the highlighted transcript
    save_highlighted_transcript(highlighted_transcript)

main()

: 