In [2]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import openai
import time

#pd.set_option('display.max_colwidth', None)

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file
# openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
# Function for getting response from ChatGPT

def get_completion(prompt, model="gpt-3.5-turbo", retries=3):
    for _ in range(retries):
        try:
            messages = [{"role": "user", "content": prompt}]
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0, # this is the degree of randomness of the model's output
            )
            return response.choices[0].message["content"]
        except openai.error.ServiceUnavailableError as e:
            print(f"Service Unavailable. Retrying after 5 seconds... Error: {e}")
            time.sleep(5)
        except openai.error.APIError as e:
            print(f"API Error. Retrying after 5 seconds... Error: {e}")
            time.sleep(5)
    raise Exception("API call failed after multiple retries.")

In [6]:
df = pd.read_csv("Unbalanced_data.csv")
df.head()

Unnamed: 0,description,label
0,A 23-year-old white female presents with compl...,Allergy / Immunology
1,"Acute allergic reaction, etiology uncertain, h...",Allergy / Immunology
2,Mother states he has been wheezing and coughing.,Allergy / Immunology
3,Patient having severe sinusitis about two to t...,Allergy / Immunology
4,Functional endoscopic sinus surgery with left ...,Allergy / Immunology


In [7]:
np.unique(df['label'])

array(['Allergy / Immunology', 'Autopsy', 'Bariatrics',
       'Cardiovascular / Pulmonary', 'Chiropractic',
       'Consult - History and Phy.', 'Cosmetic / Plastic Surgery',
       'Dentistry', 'Dermatology', 'Diets and Nutritions',
       'Discharge Summary', 'ENT - Otolaryngology',
       'Emergency Room Reports', 'Endocrinology', 'Gastroenterology',
       'General Medicine', 'Hematology - Oncology',
       'Hospice - Palliative Care', 'IME-QME-Work Comp etc.',
       'Lab Medicine - Pathology', 'Letters', 'Nephrology', 'Neurology',
       'Neurosurgery', 'Obstetrics / Gynecology', 'Office Notes',
       'Ophthalmology', 'Orthopedic', 'Pain Management',
       'Pediatrics - Neonatal', 'Physical Medicine - Rehab', 'Podiatry',
       'Psychiatry / Psychology', 'Radiology', 'Rheumatology',
       'SOAP / Chart / Progress Notes', 'Sleep Medicine',
       'Speech - Language', 'Surgery', 'Urology'], dtype=object)

In [9]:
# Function for text classification using few-shot prompting technique

def classifier(content):
    text = content
    prompt = f"""
From the provided list of description, you need to classify it into correct medical_speciality. Examples of description and medical specialty are shown below

description: A 23-year-old white female presents with complaint of allergies.
medical_speciality: Allergy / Immunology

description: Consult for laparoscopic gastric bypass.
medical_specialty: Bariatrics

You need to classify each description into only one of the 40 medical specialties given in below list and display just the medical specialty, with nothing else

[' Allergy / Immunology', ' Autopsy', ' Bariatrics',
       ' Cardiovascular / Pulmonary', ' Chiropractic',
       ' Consult - History and Phy.', ' Cosmetic / Plastic Surgery',
       ' Dentistry', ' Dermatology', ' Diets and Nutritions',
       ' Discharge Summary', ' ENT - Otolaryngology',
       ' Emergency Room Reports', ' Endocrinology', ' Gastroenterology',
       ' General Medicine', ' Hematology - Oncology',
       ' Hospice - Palliative Care', ' IME-QME-Work Comp etc.',
       ' Lab Medicine - Pathology', ' Letters', ' Nephrology',
       ' Neurology', ' Neurosurgery', ' Obstetrics / Gynecology',
       ' Office Notes', ' Ophthalmology', ' Orthopedic',
       ' Pain Management', ' Pediatrics - Neonatal',
       ' Physical Medicine - Rehab', ' Podiatry',
       ' Psychiatry / Psychology', ' Radiology', ' Rheumatology',
       ' SOAP / Chart / Progress Notes', ' Sleep Medicine',
       ' Speech - Language', ' Surgery', ' Urology']
       
\"\"\"{text}\"\"\"
"""
    response = get_completion(prompt)
    return response
    #return response.replace("medical_speciality:", "").strip()  # Remove the word "medical_speciality"

In [None]:
# For the whole dataset

def main():
    csv_file = "Unbalanced_data.csv" 
    df = pd.read_csv(csv_file)

    # Create an empty list to store predictions
    predictions = []

    for _, row in df.iterrows():
        description = row['description']
        prediction = classifier(description)
        predictions.append(prediction)

    # Add predictions as a new column to the DataFrame
    df['predicted_medical_specialty'] = predictions
    #print(predictions)
    display(df)

    # Save the updated DataFrame back to the CSV file
    df.to_csv('Predicted_medical_specialty.csv', index=False)

if __name__ == "__main__":
    main()

In [33]:
df = pd.read_csv('Predicted_medical_specialty.csv')
df.head()

Unnamed: 0,description,label,predicted_medical_specialty
0,A 23-year-old white female presents with compl...,Allergy / Immunology,Allergy / Immunology
1,"Acute allergic reaction, etiology uncertain, h...",Allergy / Immunology,Allergy / Immunology
2,Mother states he has been wheezing and coughing.,Allergy / Immunology,Allergy / Immunology
3,Patient having severe sinusitis about two to t...,Allergy / Immunology,ENT - Otolaryngology
4,Functional endoscopic sinus surgery with left ...,Allergy / Immunology,ENT - Otolaryngology


In [39]:
## Acuuracy for the whole dataset

same_values_mask = df['label'] == df['predicted_medical_specialty']
num_same_values = len(df[same_values_mask])
print("Accuracy", num_same_values*100/len(df))

Accuracy 36.40534610013964


In [35]:
df['predicted_medical_specialty'].nunique() # Check the number of categories

78

In [37]:
category_counts = df['predicted_medical_specialty'].value_counts()
for category, count in category_counts.items():
    print(f"{category}: {count}")

Cardiovascular / Pulmonary: 717
Orthopedic: 663
Gastroenterology: 359
Neurology: 286
Surgery: 279
Urology: 275
Obstetrics / Gynecology: 256
ENT - Otolaryngology: 204
Radiology: 183
Ophthalmology: 151
General Medicine: 148
Pediatrics - Neonatal: 148
Neurosurgery: 123
Hematology - Oncology: 114
Psychiatry / Psychology: 96
Cosmetic / Plastic Surgery: 90
Pain Management: 81
Dermatology: 75
Physical Medicine - Rehab: 72
Podiatry: 71
Dentistry: 58
Pulmonary / Cardiovascular: 54
Endocrinology: 44
Bariatrics: 43
Nephrology: 37
Rheumatology: 31
Allergy / Immunology: 30
Diets and Nutritions: 29
Emergency Room Reports: 25
IME-QME-Work Comp etc.: 21
Sleep Medicine: 19
Hospice - Palliative Care: 18
Internal Medicine: 18
Plastic Surgery: 14
Lab Medicine - Pathology: 13
Oncology: 13
Otolaryngology: 13
Speech - Language: 11
Chiropractic: 11
Gynecology: 10
Gynecology / Obstetrics: 8
Infectious Disease: 7
medical_speciality: General Medicine: 6
Autopsy: 6
Breast Surgery: 6
Pediatric Gastroenterology: 6


In [41]:
# Predicting for first 20 samples

def main():
    csv_file = "Unbalanced_data.csv" 
    df = pd.read_csv(csv_file)
    df = df[:20]

    # Create an empty list to store predictions
    predictions = []

    for _, row in df.iterrows():
        description = row['description']
        prediction = classifier(description)
        predictions.append(prediction)

    # Add predictions as a new column to the DataFrame
    df['predicted_medical_specialty'] = predictions
    #print(predictions)
    display(df)
    
    # Measuring Accuracy
    same_values_mask = df['label'] == df['predicted_medical_specialty']
    num_same_values = len(df[same_values_mask])
    print("Accuracy", num_same_values*100/len(df))


if __name__ == "__main__":
    main()

Unnamed: 0,description,label,predicted_medical_specialty
0,A 23-year-old white female presents with compl...,Allergy / Immunology,Allergy / Immunology
1,"Acute allergic reaction, etiology uncertain, h...",Allergy / Immunology,Allergy / Immunology
2,Mother states he has been wheezing and coughing.,Allergy / Immunology,Allergy / Immunology
3,Patient having severe sinusitis about two to t...,Allergy / Immunology,ENT - Otolaryngology
4,Functional endoscopic sinus surgery with left ...,Allergy / Immunology,ENT - Otolaryngology
5,"Chronic glossitis, xerostomia, probable enviro...",Allergy / Immunology,Allergy / Immunology
6,A female for a complete physical and follow up...,Allergy / Immunology,Allergy / Immunology
7,This is a 14-month-old baby boy Caucasian who ...,Allergy / Immunology,Pediatrics - Neonatal
8,Autopsy - Homicide - Blunt force cranial trauma,Autopsy,Autopsy
9,Autopsy of a white female who died of acute co...,Autopsy,Autopsy


Accuracy 75.0


In [42]:
# Predicting for random 20 samples

def main():
    csv_file = "Unbalanced_data.csv" 
    df = pd.read_csv(csv_file)
    df = df.sample(n=20)

    # Create an empty list to store predictions
    predictions = []

    for _, row in df.iterrows():
        description = row['description']
        prediction = classifier(description)
        predictions.append(prediction)

    # Add predictions as a new column to the DataFrame
    df['predicted_medical_specialty'] = predictions
    #print(predictions)
    display(df)
    
    # Measuring Accuracy
    same_values_mask = df['label'] == df['predicted_medical_specialty']
    num_same_values = len(df[same_values_mask])
    print("Accuracy", num_same_values*100/len(df))


if __name__ == "__main__":
    main()

Service Unavailable. Retrying after 5 seconds... Error: The server is overloaded or not ready yet.


Unnamed: 0,description,label,predicted_medical_specialty
444,Patient complaining of cough and blood mixed w...,Consult - History and Phy.,Pulmonary / Cardiovascular
3587,He is a 67-year-old man who suffers from chron...,SOAP / Chart / Progress Notes,Cardiovascular / Pulmonary
524,Patient with mid-epigastric abdominal pain. So...,Consult - History and Phy.,Gastroenterology
3490,Right sacral alar notch and sacroiliac joint/p...,Radiology,Pain Management
3163,"Patient was referred to Physical Therapy, seco...",Physical Medicine - Rehab,Physical Medicine - Rehab
925,A 9-month well-child check.,Consult - History and Phy.,Pediatrics - Neonatal
1005,Evaluation and recommendations regarding facia...,Dermatology,Cosmetic / Plastic Surgery
2315,CT-guided frameless stereotactic radiosurgery ...,Neurosurgery,Radiology
4845,Vitrectomy opening. A limited conjunctival per...,Surgery,Ophthalmology
4860,Bladder instillation for chronic interstitial ...,Urology,Urology


Accuracy 40.0


In [10]:
# Predicting for random 20% of the samples

def main():
    csv_file = "Unbalanced_data.csv" 
    df = pd.read_csv(csv_file)
    df = df.sample(n=1000)

    # Create an empty list to store predictions
    predictions = []

    for _, row in df.iterrows():
        description = row['description']
        prediction = classifier(description)
        predictions.append(prediction)

    # Add predictions as a new column to the DataFrame
    df['predicted_medical_specialty'] = predictions
    #print(predictions)
    display(df)
    
    # Measuring Accuracy
    same_values_mask = df['label'] == df['predicted_medical_specialty']
    num_same_values = len(df[same_values_mask])
    print("Accuracy", num_same_values*100/len(df))


if __name__ == "__main__":
    main()

Service Unavailable. Retrying after 5 seconds... Error: The server is overloaded or not ready yet.
Service Unavailable. Retrying after 5 seconds... Error: The server is overloaded or not ready yet.
Service Unavailable. Retrying after 5 seconds... Error: The server is overloaded or not ready yet.
Service Unavailable. Retrying after 5 seconds... Error: The server is overloaded or not ready yet.
API Error. Retrying after 5 seconds... Error: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Sun, 30 Jul 2023 20:53:45 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudfla

Unnamed: 0,description,label,predicted_medical_specialty
929,A 3-month well-child check.,Consult - History and Phy.,Pediatrics - Neonatal
2298,Anterior cranial vault reconstruction with fro...,Neurosurgery,Plastic Surgery
2496,"Postoperative day #1, total abdominal hysterec...",Obstetrics / Gynecology,Obstetrics / Gynecology
1121,"Fever, otitis media, and possible sepsis.",Discharge Summary,Pediatrics - Neonatal
3211,Right foot series after a foot injury.,Podiatry,Orthopedic
...,...,...,...
3631,Patient with complaint of a very painful left ...,SOAP / Chart / Progress Notes,Podiatry
1898,Sepsis. The patient was found to have a CT sca...,Hematology - Oncology,Urology
4617,Placement of a Port-A-Cath under fluoroscopic ...,Surgery,Surgery
4587,Visually significant nuclear sclerotic catarac...,Surgery,Ophthalmology


Accuracy 37.2
