In [None]:
from pathlib import Path

import numpy as np # We'll use np.nan for empty cells
import pandas as pd


In [None]:
# Load the three raw parquet files used by the rest of the notebook:
#   DS.parquet -> ds_df, EC.parquet -> ec_df, IE.parquet -> ie_df.
# NOTE(review): the folder is spelled "Raw_Data" for DS but "Raw_data" for
# EC/IE. On a case-sensitive filesystem only one spelling can exist - confirm
# the real folder name and make the paths agree.
try:
    ds_df = pd.read_parquet("Raw_Data/DS.parquet")
    ec_df = pd.read_parquet("Raw_data/EC.parquet")
    ie_df = pd.read_parquet("Raw_data/IE.parquet")
except FileNotFoundError as exc:
    # Later cells read ds_df / ec_df / ie_df directly, so report which file is
    # missing and stop here instead of failing later with a NameError.
    # Any other error (corrupt file, missing parquet engine, ...) is no longer
    # swallowed and propagates with its own traceback.
    print(f"Data not found: {exc}")
    raise

# Randomization Scheme

In [None]:
# --- Read the randomization scheme (wide layout: one treatment column per period) ---
rand_df = pd.read_parquet("Raw_Data/randomization_scheme.parquet")
print("Randomization scheme loaded:")
print(rand_df.head())
print("-" * 50)

# --- Use 'SUBJID' as the key name in place of 'Randomization Number' ---
rand_df = rand_df.rename(columns={'Randomization Number': 'SUBJID'})

# --- Wide -> long: one output row per (subject, period column) ---
# Columns repeated unchanged on every output row.
id_vars = ['SUBJID', 'Block Number', 'Sequence']
# The six per-period treatment columns that are stacked into rows.
value_vars = [f'Treatment in Period {period_no}' for period_no in range(1, 7)]

rand_long_df = rand_df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='Period_Raw',    # holds the source column name, e.g. 'Treatment in Period 3'
    value_name='Treatment',   # holds the cell value of that column
)

print("Randomization data after melting (long format):")
print(rand_long_df.head())
print("-" * 50)

# --- Turn the column label into a numeric period ---
# Pull the first run of digits out of 'Treatment in Period X', store it as an
# integer 'Period' column, and discard the no-longer-needed label column.
rand_long_df = (
    rand_long_df
    .assign(Period=lambda frame: frame['Period_Raw'].str.extract('(\\d+)', expand=False).astype(int))
    .drop(columns=['Period_Raw'])
)
print("Final long format randomization data:")
print(rand_long_df.head())

# Calculation of Initial Disposition Counts

In [None]:
# Screened: number of distinct SubjectID values present in IE.
num_screened = ie_df['SubjectID'].nunique()
print(f"Total participants screened: {num_screened}")

# Screen failures: IE rows whose SubjectStatus is 'SCREEN FAILURE'
# (compared case-insensitively; a missing status is treated as '').
screen_failures_df = ie_df.loc[
    ie_df['SubjectStatus'].fillna('').str.upper().eq('SCREEN FAILURE')
]
num_screen_failures = screen_failures_df['SubjectID'].nunique()
print(f"Number of screen failures: {num_screen_failures}")

# Randomized: derived here as screened minus screen failures.
num_randomized = num_screened - num_screen_failures
print(f"Number of randomized participants: {num_randomized}")


# Calculation of Dosed and Population Counts

## Find product each participant received for each period

In [None]:
# --- SubjectID -> SUBJID lookup built from IE ---
# Take the two ID columns, collapse repeated pairs, discard pairs that have no
# SUBJID (it is the merge key later on) and store SUBJID as an integer so it
# lines up with rand_long_df.
subject_map_df = (
    ie_df.loc[:, ['SubjectID', 'SUBJID']]
    .drop_duplicates()
    .dropna(subset=['SUBJID'])
    .assign(SUBJID=lambda pairs: pairs['SUBJID'].astype(int))
)
print("SubjectID to SUBJID map prepared.")
print(subject_map_df.head())
print("-" * 50)

In [None]:
# --- Prepare EC data (keep dosed events and derive Period from VISIT) ---
# Keep rows where ECYN is 'yes' (case-insensitive; missing treated as '').
# .copy() so the column assignment below works on an independent frame.
dosed_df = ec_df[ec_df['ECYN'].fillna('').str.lower() == 'yes'].copy()

# Pull the digits that follow 'Period ' out of VISIT. expand=False returns a
# Series; rows whose VISIT has no 'Period N' label come back as missing.
dosed_df['Period'] = dosed_df['VISIT'].str.extract(r'Period (\d+)', expand=False)

# Casting a missing value to int raises, so a single dosed row without a
# 'Period N' label would abort the cell. Such rows can never match the later
# merge on Period, so report and drop them before converting.
has_period = dosed_df['Period'].notna()
num_without_period = int((~has_period).sum())
if num_without_period:
    print(f"Warning: ignoring {num_without_period} dosed record(s) whose VISIT has no 'Period N' label.")

# One row per subject per dosed period, with Period stored as an integer.
dosed_df = (
    dosed_df.loc[has_period, ['SubjectID', 'Period']]
    .astype({'Period': int})
    .drop_duplicates()
)
print("Dosed events (EC) prepared with correct Period:")
print(dosed_df.head())
print("-" * 50)

In [None]:
# --- Attach SUBJID to every dosed (SubjectID, Period) row ---
# Left join: each dosed row is kept even when the map has no entry for its
# SubjectID (SUBJID is then missing on that row).
dosed_with_subjid = dosed_df.merge(subject_map_df, on='SubjectID', how='left')
print("Dosed data merged with SUBJID:")
print(dosed_with_subjid.head())
print("-" * 50)

In [None]:
# --- Align the SUBJID key on both sides, then join dosing to randomization ---

print("--- Preparing for Merge ---")
print(f"dosed_with_subjid['SUBJID'] dtype before: {dosed_with_subjid['SUBJID'].dtype}")
print(f"rand_long_df['SUBJID'] dtype before: {rand_long_df['SUBJID'].dtype}")

# Step 1: rows without a SUBJID cannot take part in the join - discard them.
dosed_with_subjid = dosed_with_subjid.dropna(subset=['SUBJID'])
rand_long_df = rand_long_df.dropna(subset=['SUBJID'])

# Step 2: make SUBJID an integer in both frames so the keys compare equal.
try:
    dosed_with_subjid = dosed_with_subjid.assign(SUBJID=dosed_with_subjid['SUBJID'].astype(int))
    rand_long_df = rand_long_df.assign(SUBJID=rand_long_df['SUBJID'].astype(int))
    print(f"dosed_with_subjid['SUBJID'] dtype after: {dosed_with_subjid['SUBJID'].dtype}")
    print(f"rand_long_df['SUBJID'] dtype after: {rand_long_df['SUBJID'].dtype}")
    print("SUBJID types standardized. Proceeding with merge.")
except Exception as e:
    # Explain the likely cause, then re-raise so the notebook stops here.
    print(f"Error converting SUBJID to int: {e}")
    print("Please check the SUBJID columns in both DataFrames for non-numeric values.")
    raise

# Step 3: inner join on (SUBJID, Period) - keeps only dosed rows that have a
# matching randomization row, and brings in the 'Treatment' for that period.
final_dosed_df = dosed_with_subjid.merge(rand_long_df, on=['SUBJID', 'Period'], how='inner')

## Count dosed per treatment

In [None]:
# --- Distinct dosed participants per treatment ---
# Result is a Series indexed by 'Treatment' whose values are the number of
# unique SubjectIDs seen under that treatment.
dosed_per_treatment = (
    final_dosed_df
    .groupby('Treatment')['SubjectID']
    .nunique()
)

print("### Number of Participants Dosed Per Treatment ###")
print(dosed_per_treatment)

# Calculating Overall Completion and Discontinuation Counts


In [None]:
# --- Completed participants ---
# DS rows whose SubjectStatus is 'COMPLETED' (case-insensitive; missing
# status treated as ''), counted as distinct non-missing SubjectIDs.
completed_df = ds_df.loc[
    ds_df['SubjectStatus'].fillna('').str.upper().eq('COMPLETED')
]
num_completed = completed_df['SubjectID'].dropna().nunique()
print(f"Number of completed participants: {num_completed}")

In [None]:
# --- Participants lost to follow-up ---
# DS rows whose SubjectStatus is 'LOST TO FOLLOW-UP' (case-insensitive;
# missing status treated as ''), counted as distinct non-missing SubjectIDs.
lost_df = ds_df.loc[
    ds_df['SubjectStatus'].fillna('').str.upper().eq('LOST TO FOLLOW-UP')
]
num_lost_to_follow_up = lost_df['SubjectID'].dropna().nunique()
print(f"Number Lost To Follow-Up: {num_lost_to_follow_up}")

In [None]:
# --- Total discontinued ---
# Derived, not read from the data: everyone counted as randomized who is not
# counted as completed. Relies on num_randomized and num_completed having been
# computed by the earlier cells.
num_discontinued = num_randomized - num_completed
print(f"Total number discontinued: {num_discontinued}")

# Assemble Data into the Final Dataframe

In [None]:
# --- Inputs expected from earlier cells ---
#   num_screened, num_screen_failures, num_randomized   (scalars)
#   num_completed, num_discontinued, num_lost_to_follow_up (scalars)
#   dosed_per_treatment  (Series: treatment name -> participants dosed)

# --- Table layout: row labels, then treatment columns followed by 'Overall' ---
index_names = [
    'Screened', 'Screen Failure', 'Randomized',
    'Included in Safety Population', 'Included in PK Population',
    'Completed', 'Discontinued from study', '  Lost To Follow-Up',
]
treatment_names = [*dosed_per_treatment.index]
column_names = [*treatment_names, 'Overall']

# --- 'Overall' column: every row has a value ---
# The Safety and PK rows both reuse the randomized count in this column.
overall_N = num_randomized
data = {
    'Overall': [
        num_screened,
        num_screen_failures,
        num_randomized,
        num_randomized,
        num_randomized,
        num_completed,
        num_discontinued,
        num_lost_to_follow_up,
    ],
}

# --- Treatment columns: only the two population rows carry a value ---
# Both population rows get the dosed count for that treatment; every other
# row is left as NaN (rendered blank later).
population_rows = ('Included in Safety Population', 'Included in PK Population')
for t_name in treatment_names:
    n_t = dosed_per_treatment.get(t_name, 0)
    data[t_name] = [n_t if label in population_rows else np.nan for label in index_names]

# --- Build the frame with columns ordered treatments first, 'Overall' last ---
summary_df = pd.DataFrame(data, index=index_names)[column_names]

print("### Participant Disposition Table (Raw Numbers) ###")
print(summary_df)

# Formatting the summary table

In [None]:
# --- Percentage denominators, one per column ---
# Treatment columns use their dosed count; 'Overall' uses num_randomized.
N_map = dosed_per_treatment.copy()
N_map['Overall'] = num_randomized

# --- Rows shown as "count (pct%)"; all other rows show the bare count ---
rows_with_pct = ['Included in Safety Population', 'Included in PK Population',
                 'Completed', 'Discontinued from study', '  Lost To Follow-Up']

# --- Text version of summary_df, filled column by column ---
formatted_df = pd.DataFrame(index=summary_df.index, columns=summary_df.columns)
for col_name, column_counts in summary_df.items():
    N = N_map.get(col_name, 0)
    for row_name, count in column_counts.items():
        if pd.isna(count):
            # No value for this cell -> blank.
            cell_value = ""
        elif row_name in rows_with_pct and N > 0:
            cell_value = f"{int(count)} ({int(count) / N * 100:.1f}%)"
        else:
            # Count-only row, or no usable denominator.
            cell_value = f"{int(count)}"
        formatted_df.loc[row_name, col_name] = cell_value

# --- Column headers: name on the first line, "(N=...)" on the second ---
new_column_names = [f"{col}\n(N={N_map.get(col, 0)})" for col in summary_df.columns]
formatted_df.columns = new_column_names

print("### Final Participant Disposition Table ###")
print(formatted_df.to_string()) # to_string() prints every row and column


# Generate Word Document

In [None]:
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

# --- Function to set specific borders (from DV.ipynb) ---
def set_cell_borders(cell, **kwargs):
    """
    Apply border settings to one table cell.

    Each keyword names an edge (top, left, bottom, right, insideH, insideV)
    and maps to a dict of border attributes; edges that are not passed, or
    passed as an empty dict, are left untouched.

    Pass {'val': 'nil'} to remove a border.
    Pass {"sz": 6, "val": "single", "color": "000000"} for a 0.75pt black line.
    """
    tcPr = cell._tc.get_or_add_tcPr()

    # Reuse the cell's existing <w:tcBorders> element, or create one.
    tcBorders = tcPr.first_child_found_in("w:tcBorders")
    if tcBorders is None:
        tcBorders = OxmlElement('w:tcBorders')
        tcPr.append(tcBorders)

    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        spec = kwargs.get(edge)
        if not spec:
            continue

        # Reuse the element for this edge if present, otherwise add it.
        tag = f'w:{edge}'
        border = tcBorders.find(qn(tag))
        if border is None:
            border = OxmlElement(tag)
            tcBorders.append(border)

        if spec.get("val") != "nil":
            # Regular border: copy every given attribute, stringified.
            for key, value in spec.items():
                border.set(qn(f'w:{key}'), str(value))
            continue

        # Removing the border: mark it 'nil' and strip leftover styling
        # attributes from any earlier setting.
        border.set(qn('w:val'), "nil")
        for att in ("sz", "color", "space", "shadow"):
            att_key = qn(f'w:{att}')
            if att_key in border.attrib:
                del border.attrib[att_key]

# --- Check if formatted_df exists ---
if 'formatted_df' in locals() and not formatted_df.empty:
    try:
        document = Document()
        document.add_paragraph('Participant Disposition Table')
        document.add_paragraph()

        # --- Define Styles ---
        font_name = 'Calibri'
        font_size = Pt(10)
        border_style = {"sz": 6, "val": "single", "color": "000000"}
        no_border = {"val": "nil"}

        # --- Add the Table (formatted_df.shape[1] + 1 for the new first column) ---
        num_cols = formatted_df.shape[1] + 1
        table = document.add_table(rows=formatted_df.shape[0] + 1, cols=num_cols)

        # --- Header Row ---
        column_headers = formatted_df.columns.tolist()
        for j in range(num_cols):
            cell = table.cell(0, j)
            if j == 0:
                # First column header is blank
                header_text = ""
                paragraph = cell.paragraphs[0]
                run = paragraph.add_run(header_text)
            else:
                # Other headers (j-1 because column_headers doesn't include the first blank one)
                header = column_headers[j-1]
                # Handle headers with newlines (from N=xx)
                if '\\n' in header:
                     parts = header.split('\\n')
                     cell.text = parts[0]
                     for part in parts[1:]:
                         cell.add_paragraph(part)
                else:
                     cell.text = header

                # Format all paragraphs in the cell
                for p in cell.paragraphs:
                     run = p.runs[0]
                     run.font.bold = True
                     run.font.name = font_name
                     run.font.size = font_size
                     p.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Set header borders for all header cells
            set_cell_borders(cell,
                             top=border_style,
                             bottom=border_style,
                             left=no_border,
                             right=no_border)

        # --- Data Rows ---
        num_data_rows = formatted_df.shape[0]
        for i, row_name in enumerate(formatted_df.index):
            for j in range(num_cols):
                cell = table.cell(i + 1, j)
                if j == 0:
                    # First column: Row names (Category)
                    cell_text = row_name.strip() # Use strip() to remove leading spaces for bold check
                    paragraph = cell.paragraphs[0]
                    run = paragraph.add_run(row_name) # Use original for spaces
                    run.font.name = font_name
                    run.font.size = font_size
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    # Handle indentation for 'Lost To Follow-Up'
                    if row_name.strip().startswith("Lost"):
                         paragraph.paragraph_format.left_indent = Inches(0.25)

                else:
                    # Other columns: Data (j-1 to access formatted_df)
                    cell_text = str(formatted_df.iloc[i, j-1])
                    paragraph = cell.paragraphs[0]
                    run = paragraph.add_run(cell_text)
                    run.font.name = font_name
                    run.font.size = font_size
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                # Set data row borders (Only bottom border for the *last* row)
                border_args = {"left": no_border, "right": no_border, "top": no_border, "bottom": no_border}
                if (i + 1) == num_data_rows: # If it's the last data row
                    border_args["bottom"] = border_style

                set_cell_borders(cell, **border_args)

        # --- Save the Document ---
        file_path = "outputs/Summary of Participant Disposition.docx"
        document.save(file_path)
        print(f"Word document '{file_path}' created successfully!")

    except Exception as e:
        print(f"An error occurred while creating the Word document: {e}")
        import traceback
        traceback.print_exc()
else:
    print("Formatted DataFrame ('formatted_df') is empty or not defined. Cannot create Word document.")