In [None]:
import pandas as pd
import time
import os
from backend.lib.formula_cal_LLM import get_formulation_label_LLM

data_path = "backend/data/data.csv"
data_checkpoint_path = "backend/data/data_checkpoint.csv"

# Load the original data
df = pd.read_csv(data_path)
df['eventdate'] = pd.to_datetime(df['eventdate'])
df['year'] = df['eventdate'].dt.year

# Filter the DataFrame
df = df[df['year'] == 2018].reset_index(drop=True)
df = df[df["TreatingUnitDesc"].apply(lambda x: "rbwh" in x.lower().strip())].reset_index(drop=True)
df = df.sort_values(by='eventdate', ascending=True).reset_index(drop=True)
# Keys we expect from the LLM function
keys = ['integrated', 'perpetuating', 'precipitating', 'predisposing', 'presentation', 'protective']

# Determine how many rows have already been processed
start_index = 0
if os.path.exists(data_checkpoint_path):
    # We only need the count of processed rows from the checkpoint
    checkpoint_df = pd.read_csv(data_checkpoint_path)
    # The number of processed rows is just how many rows in the checkpoint
    start_index = len(checkpoint_df)
    print(f"Found checkpoint with {start_index} rows already processed. Resuming...")

total_rows = len(df)
remaining = total_rows - start_index
if remaining <= 0:
    print("All rows appear to be processed already.")
    exit()

batch_size = 100
batches = (remaining // batch_size) + (1 if remaining % batch_size != 0 else 0)

print(f"Starting processing from row {start_index} of {total_rows}.")
print(f"Total batches to process: {batches}")

start_time = time.time()
all_iter = list(range(start_index, total_rows, batch_size))
for i in all_iter:
    batch_end = min(i + batch_size, total_rows)
    current_batch_size = batch_end - i
    print(f"Processing batch from row {i} to {batch_end - 1}")

    # We'll store results for this batch in a temporary DataFrame
    batch_df = df.iloc[i:batch_end].copy()

    for idx in range(i, batch_end):
        print (f"main iter {i}/{len(all_iter)} current batch iter {idx}/{batch_size}")
        text = df["formulationOverallClinicalImpression"].iloc[idx]

        # Run the LLM function
        try:
            result = get_formulation_label_LLM(text)
            dict_of_5ps = result[-1]  # Assuming dictionary is last element returned

            # Update the batch_df with the values
            # idx_in_batch is the relative index inside this batch
            idx_in_batch = idx - i
            for k in keys:
                batch_df.at[batch_df.index[idx_in_batch], k] = dict_of_5ps.get(k, None)

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            # If there's an error, just set them to None
            idx_in_batch = idx - i
            for k in keys:
                batch_df.at[batch_df.index[idx_in_batch], k] = None

    # Now we have a batch_df that includes the processed data
    # Append this to the checkpoint file
    if not os.path.exists(data_checkpoint_path) or i == 0:
        # If file doesn't exist or this is the very first batch, write with header
        batch_df.to_csv(data_checkpoint_path, index=False, mode='w')
    else:
        # Append without header
        batch_df.to_csv(data_checkpoint_path, index=False, mode='a', header=False)

    # Time estimation
    elapsed = time.time() - start_time
    processed = batch_end - start_index
    total_to_process = total_rows - start_index
    avg_time_per_row = elapsed / processed
    est_total_time = avg_time_per_row * total_to_process
    est_remaining_time = est_total_time - elapsed

    print(f"Completed batch ending at row {batch_end - 1}.")
    print(f"Elapsed time: {elapsed:.2f} seconds.")
    print(f"Estimated total time: {est_total_time/60:.2f} minutes.")
    print(f"Estimated remaining time: {est_remaining_time/60:.2f} minutes.")

print("Processing complete.")

main iter 600/38 current batch iter 639/100
main iter 600/38 current batch iter 640/100
main iter 600/38 current batch iter 641/100
main iter 600/38 current batch iter 642/100
main iter 600/38 current batch iter 643/100
main iter 600/38 current batch iter 644/100
main iter 600/38 current batch iter 645/100
main iter 600/38 current batch iter 646/100
main iter 600/38 current batch iter 647/100
main iter 600/38 current batch iter 648/100
main iter 600/38 current batch iter 649/100
main iter 600/38 current batch iter 650/100
main iter 600/38 current batch iter 651/100
main iter 600/38 current batch iter 652/100
main iter 600/38 current batch iter 653/100
main iter 600/38 current batch iter 654/100
main iter 600/38 current batch iter 655/100
main iter 600/38 current batch iter 656/100
main iter 600/38 current batch iter 657/100
main iter 600/38 current batch iter 658/100
main iter 600/38 current batch iter 659/100
main iter 600/38 current batch iter 660/100
main iter 600/38 current batch i

In [7]:
d.copy()

{1}

year
2017        4
2018    16160
2019    27165
2020    37343
dtype: int64

In [8]:
df.columns

Index(['Template', 'eventdate', 'formulationOverallClinicalImpression',
       'TreatingUnitDesc', 'TUSpecialServiceType', 'char_len',
       'grouped_char_len', 'year'],
      dtype='object')

"Anthony is a 40 year old single man; a voluntary patient, who receives the DSP, and who lives alone in a one bedroom department of housing unit in New Farm. Anthony was initially diagnosed with Tourettes in Grade seven which presented themselves in the form of vocal tics which progressed to motor head tics during high school years. Anthony reports that his Tourettes progressed to a diagnosis of Paranoid Schizophrenia and Obsessive Compulsive disorder. Collateral from father suggests that Anthony's diagnosis was in line with Anthony's marijuana use in his teens. Anthony has only ever been admitted to hospital on one occasion in 1996, when he was 19 years of age. He was commenced on Clozapine during this admission and has been managed on this regime ever since. The Clinical notes indicate that when Anthony is unwell he experienced ideas of reference and a believed that he was receiving messagesabout the future. He would ask co-patients and others around him about his future. Anthony is 

(("Inclusive 5 P's Formulation", 'Absent Integrated Formulation', []),
 {},
 {'perpetuating': 1,
  'precipitating': 2,
  'predisposing': 3,
  'presentation': 4,
  'protective': 5})