In [3]:
import pandas as pd

# Read the SOAP dataset CSV (columns: input, output, generated_summary).
df_SOAP = pd.read_csv(
    filepath_or_buffer="/lustre/hl/users/4283/agentic_paper/CHIL/Dataset/soap_generated_summaries.csv"
)

# Preview the first five rows.
df_SOAP.head(n=5)

Unnamed: 0,input,output,generated_summary
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...",The patient is a 75-year-old man who has been ...
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,", but then it cleared up. Um, but I never expe..."
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...,"The patient, a 30-year-old male, presents with..."
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...,"that you could have contracted the illness, bu..."
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...,into everything and figure out why this is hap...


In [8]:
# Read the 100-row MIMIC sample CSV; its column layout
# (note_id, input, target, input_tokens, target_tokens) is the target schema.
df_MIMIC = pd.read_csv(
    filepath_or_buffer="/lustre/hl/users/4283/agentic_paper/CHIL/Dataset/sample_data_100.csv"
)

# Preview the first five rows.
df_MIMIC.head(n=5)

Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,1195,75
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,Mr. ___ is a ___ yo man with CAD with prior MI...,3496,1143
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...,5591,1098
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,"On ___, Ms. ___ was admitted to the gynecology...",1119,221
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,Mr. ___ underwent an angiogram on ___ which sh...,3307,439


In [7]:
# Summarize df_SOAP: row count, column names, non-null counts, dtypes and memory usage.
df_SOAP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   input              100 non-null    object
 1   output             100 non-null    object
 2   generated_summary  100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [10]:
import pandas as pd
from transformers import AutoTokenizer

# 1. Tokenizer used for every token count below. It has to be the same
#    tokenizer as the model the counts are intended for.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", use_fast=True)

# 2-3. Work on a copy so df_SOAP itself stays untouched, and rename the
#      "output" column to "target" in the same step.
df_soap_mimic = df_SOAP.copy().rename(columns={"output": "target"})

# 4. Attach a synthetic, MIMIC-style identifier to every row. IDs are
#    zero-based and zero-padded to six digits: SOAP-000000, SOAP-000001, ...
df_soap_mimic = df_soap_mimic.assign(
    note_id=[f"SOAP-{i:06d}" for i in range(df_soap_mimic.shape[0])]
)

# 5. Token counting function
def count_tokens(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    The text is encoded with ``add_special_tokens=False``, so only tokens that
    come from the text itself are counted.

    Parameters
    ----------
    text : str or scalar
        Cell value to tokenize. Missing values (None, NaN, pd.NA), e.g. from
        empty CSV fields, are counted as 0 tokens. Any other non-string value
        (such as a cell pandas parsed as a number) is converted with ``str()``
        before tokenizing.

    Returns
    -------
    int
        Number of tokens in `text`.
    """
    if not isinstance(text, str):
        # pd.isna only yields a plain bool for scalars, hence the is_scalar guard.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return 0
        text = str(text)
    return len(tokenizer.encode(text, add_special_tokens=False))

# 6. Token counts for both text columns, computed row by row with count_tokens.
df_soap_mimic = df_soap_mimic.assign(
    input_tokens=df_soap_mimic["input"].apply(count_tokens),
    target_tokens=df_soap_mimic["target"].apply(count_tokens),
)

# 7. Keep only the MIMIC columns, in the MIMIC column order.
df_soap_mimic = df_soap_mimic.loc[
    :, ["note_id", "input", "target", "input_tokens", "target_tokens"]
]

# Preview the first ten converted rows.
df_soap_mimic.head(n=10)

Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,SOAP-000000,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...",427,317
1,SOAP-000001,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,2281,438
2,SOAP-000002,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...,12,68
3,SOAP-000003,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...,1173,260
4,SOAP-000004,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...,1630,420
5,SOAP-000005,Hi there! What brings you in today? Guest_fami...,Subjective:\n- Concern that the baby may have ...,94,111
6,SOAP-000006,"Are you allergic to anything? No, I am not all...",Subjective:\n- No allergies reported (includin...,26,67
7,SOAP-000007,"What brought you in today? Sure, I'm I'm just ...",Subjective:\n- Chest pain\n- Duration: Started...,1390,322
8,SOAP-000008,"What brings you in? Yeah, I'm just coming in w...",Subjective:\n- 4 yo M presents with right ear ...,1867,432
9,SOAP-000009,"What brings you in today? Yeah, so I've had th...",Subjective:\n- Presenting with cough that star...,2220,448


In [11]:
# Write the converted frame to the current working directory, without the index column.
df_soap_mimic.to_csv(path_or_buf="df_soap_mimic.csv", index=False)


In [13]:
# Write the converted frame next to the source datasets, without the index column.
df_soap_mimic.to_csv(
    path_or_buf="/lustre/hl/users/4283/agentic_paper/CHIL/Dataset/df_soap_mimic.csv",
    index=False,
)