# 1. Gender Classifier

## 1.1 Load Dataset

In [1]:
import os
import pandas as pd

# Build the location of the raw gender dataset: it sits in the `data` folder
# one level above the directory this notebook runs from.
file_name = 'gender_dataset.csv'
data_path = os.path.join('..', 'data', file_name)
print("Raw data path:", data_path)

# Read the CSV into a DataFrame and preview only its first record.
data_df = pd.read_csv(filepath_or_buffer=data_path)
data_df.head(n=1)

Raw data path: ../data/gender_dataset.csv


Unnamed: 0,index,text,Gender
0,0,she is on oxygen 24/7 over the past 3 years .n...,Female


## 1.2 Prepare Dataset

In [2]:
# Number of random samples drawn PER gender for prediction; the resulting frame
# holds up to 2 * number_of_samples_for_prediction rows.
number_of_samples_for_prediction = 10
# Fixed seed so a fresh "Restart & Run All" selects the same rows (and therefore
# sends the same texts to the model) on every run.
random_seed = 42

# Work on a shuffled copy so the raw frame loaded above is left untouched.
df = data_df.copy()
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
# Drop duplicates, if any
# NOTE(review): this compares every column, so rows that repeat the same text
# under a different `index` value are kept — confirm whether that is intended.
df = df.drop_duplicates()

# Build a balanced frame: the first N shuffled "Male" rows followed by the first
# N shuffled "Female" rows (fewer if a gender has less than N rows available).
filtered_gender_df = pd.concat([
    df[df["Gender"] == "Male"].head(number_of_samples_for_prediction),
    df[df["Gender"] == "Female"].head(number_of_samples_for_prediction),
])

# Shuffle again so the two genders are interleaved, then reset the index
filtered_gender_df = filtered_gender_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# !!! Save the final data set to a CSV file to get predictions on JSL models
#filtered_gender_df.to_csv("gender_100_rows.csv", sep="\t", index=False)
# Placeholder value for every row; overwritten later by the prediction loop.
filtered_gender_df["predictions"] = "NA"

filtered_gender_df.head(5)

Unnamed: 0,index,text,Gender,predictions
0,1736,she denies any chest pain .she has had a 20-po...,Female,
1,101,elsee from sound ** addr hospitalist team for ...,Female,
2,34,﻿sample type / medical specialty: letters samp...,Male,
3,614,she is doing well .she feels good .no complain...,Female,
4,1804,﻿sample type / medical specialty: general medi...,Male,


# 2. GPT Predictions 

## 2.1 Load Prompt and Initiate Classifier

In [3]:
from modules import GPTClassifier 

In [4]:

# Prompt template for the gender classification task, stored under ./prompts.
prompt_file = "09_03_Gender_Classifier.txt"

prompt_file_path = os.path.join('.', 'prompts', prompt_file)
print(f"Prompt for [Gender Classification] task: ({prompt_file_path})")

# Show the prompt as it is stored on disk. Printing line by line with a bare
# print() appended a second newline to every line (each line already ends with
# one), which double-spaced the displayed prompt; print the content once instead.
with open(prompt_file_path, 'r') as file:
    print("PROMPT:\n")
    print(file.read())

# The classifier receives the prompt *path*, not the text printed above.
# NOTE(review): presumably it loads the prompt from that path itself — confirm in modules/GPTClassifier.
gender_classifier = GPTClassifier.Classifier(prompt_file_path)

Prompt for [Gender Classification] task: (./prompts/09_03_Gender_Classifier.txt)
PROMPT:

You are a highly experienced, skilled and helpful medical annotator who have been working on medical and clinical texts.



I will provide you a a clinical text belonging to a patient, determine the gender of the patient.



Example Clinical Text:



    history of present illness : ms . tad arganda is resting in the bed .she reports that she received her chemotherapy on ** day and then began having shortness of breath on ** day .during that hospitalization , she was again reported to have shortness of breath the day following her chemotherapy .general : this is a well-nourished 78-year-old female who appears to be dyspneic .the patient is alert and oriented times 4 .50 . she remains very dyspneic and has difficult time completing 1 sentence .



    What is the gender of the patient mentioned in the clinical text?



    'Female'



Another Example Clinical Text:



    critical care medicine con

## 2.2 Get Predictions

In [5]:
import datetime

# Assign an automatic, timestamped name under which the prediction data is saved as CSV
now = datetime.datetime.now()
file_name = f"Gender_Classifier_preds_{now.strftime('%m%d_%H%M')}"
processed_data_dir = os.path.join('.', 'processed_data')
# NOTE(review): the path carries no ".csv" extension although CSV content is
# written to it — kept as-is so existing consumers of the file still find it.
processed_data_path = os.path.join(processed_data_dir, file_name)

# Create the output folder before the (slow) prediction loop: to_csv() does not
# create missing parent directories, so without this every prediction would be
# lost at the very last step on a fresh checkout.
os.makedirs(processed_data_dir, exist_ok=True)

# Query the classifier once per row and store the cleaned-up answer.
for idx, row in filtered_gender_df.iterrows():
    sentence = row['text']
    result = gender_classifier.do_query(sentence)
    # The answer text is read from the first choice of the response.
    predicted_gender = result['choices'][0]['message']['content']
    # Store the output of gender_fix_output_typo; the recorded run shows raw answers
    # such as "'Male'" (with quotes) ending up as plain "Male" in the frame.
    filtered_gender_df.at[idx, "predictions"] = gender_classifier.gender_fix_output_typo(predicted_gender)
    # The raw (unfixed) model answer is printed next to the ground truth.
    print(idx, "predicted_gender:", predicted_gender, "gt:", row["Gender"])

filtered_gender_df.to_csv(processed_data_path)

0 predicted_gender: Female gt: Female
1 predicted_gender: Female gt: Female
2 predicted_gender: Male gt: Male
3 predicted_gender: Female gt: Female
4 predicted_gender: Male gt: Male
5 predicted_gender: Male gt: Male
6 predicted_gender: Male gt: Male
7 predicted_gender: 'Male' gt: Male
8 predicted_gender: Female gt: Female
9 predicted_gender: Male gt: Male
10 predicted_gender: 'Male' gt: Male
11 predicted_gender: Female gt: Female
12 predicted_gender: Female gt: Female
13 predicted_gender: Male gt: Male
14 predicted_gender: Female gt: Female
15 predicted_gender: Male gt: Male
16 predicted_gender: Female gt: Female
17 predicted_gender: Female gt: Female
18 predicted_gender: Male gt: Male
19 predicted_gender: Female gt: Female


In [6]:
# Display the sampled frame now that the prediction loop has filled the `predictions` column.
filtered_gender_df

Unnamed: 0,index,text,Gender,predictions
0,1736,she denies any chest pain .she has had a 20-po...,Female,Female
1,101,elsee from sound ** addr hospitalist team for ...,Female,Female
2,34,﻿sample type / medical specialty: letters samp...,Male,Male
3,614,she is doing well .she feels good .no complain...,Female,Female
4,1804,﻿sample type / medical specialty: general medi...,Male,Male
5,470,the patient is doing quite healthy .he lives a...,Male,Male
6,647,he presented with hoarseness of voice and diff...,Male,Male
7,1862,﻿sample type / medical specialty: urology samp...,Male,Male
8,35,﻿sample type / medical specialty: surgery samp...,Female,Female
9,872,he was noted to have a rapid decline in his en...,Male,Male


# 3. Evaluation

In [7]:
# Pair every ground-truth label with its prediction, row by row, as a list of
# (Gender, predictions) tuples.
evaluation_data = [
    (true_label, predicted_label)
    for true_label, predicted_label in zip(
        filtered_gender_df['Gender'], filtered_gender_df['predictions']
    )
]

In [8]:
# Score the (ground truth, prediction) pairs with the classifier's metrics helper.
# `file_name` is the timestamped predictions name assigned in section 2.2 (it no
# longer holds the dataset file name from section 1.1); in the recorded output it
# appears under the 'Version' key of the result.
evaluation_result = gender_classifier.gender_metrics(evaluation_data, file_name)
# Bare expression so the notebook renders the returned metrics.
evaluation_result

{'Version': 'Gender_Classifier_preds_0404_2134',
 'Male Accuracy': 1.0,
 'Male Not Found': 0,
 'Male False Positive': 0,
 'Female Accuracy': 1.0,
 'Female Not Found': 0,
 'Female False Positive': 0,
 'Data Distribution': '10F-10M'}

In [9]:
import json
# Create the results folder if it doesn't exist (no error when it already does)
os.makedirs('eval_results', exist_ok=True)

# Append this run's result to the shared results file, one JSON object per line.
# json.dump() writes no trailing newline, so without the explicit "\n" successive
# runs would be glued together as `{...}{...}` and could not be parsed back.
with open('eval_results/classifier_eval_result.json', 'a') as f:
    json.dump(evaluation_result, f)
    f.write("\n")
    print("Results appended to file: eval_results/classifier_eval_result.json")

Results appended to file: eval_results/classifier_eval_result.json
