<a href="https://colab.research.google.com/github/Majids-Hamm/Pharmacovigillance/blob/main/CIOMS_summary_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Task 2: Extract PTs and Causality Assessments from CIOMS Reports ---

# Install dependencies
!pip install python-docx rarfile pandas

import os
import re
import pandas as pd
import rarfile
from docx import Document
from google.colab import files



In [4]:
# --- Step 1: Extract RAR file ---
# Source archive (uploaded to the Colab session) and the directory it is unpacked into.
rar_path = "/content/CIOMS_files.rar"
extract_path = "/content/cioms_docs"

# Create the destination first; exist_ok keeps a re-run of this cell from failing.
os.makedirs(extract_path, exist_ok=True)

# The context manager closes the archive handle even if extraction raises.
with rarfile.RarFile(rar_path) as archive:
    archive.extractall(path=extract_path)

print("RAR extracted successfully.")

RAR extracted successfully.


In [5]:
# --- Step 2: Helpers from Task 1 (patient code + date) ---

def parse_filename_for_info(filename):
    """Pull the patient code and the 8-digit date out of a report file name.

    The extension is removed, then the name is searched for
    ``<UPPERCASE/DIGITS>PEM-<3 digits>-<8 digits>`` anywhere in the stem.

    Args:
        filename: File name (with or without extension) to inspect.

    Returns:
        ``(patient_code, raw_date)`` as strings when the pattern is found,
        where ``raw_date`` is the 8-digit run exactly as written in the name;
        ``(None, None)`` otherwise.
    """
    stem, _extension = os.path.splitext(filename)
    found = re.search(r'([A-Z0-9]+PEM-\d{3})-(\d{8})', stem)
    if found is None:
        return None, None
    patient_code, filename_date_raw = found.groups()
    return patient_code, filename_date_raw



In [6]:
def extract_report_date(doc, fallback_filename, filename_report_date):
    """Extract 'Date Received by Sponsor' or fall back to the filename date.

    Only the document's paragraph text is searched (paragraphs are joined with
    newlines). The date pattern tolerates arbitrary whitespace around the
    slashes, so the captured text may contain spaces, tabs or line breaks;
    all of that whitespace is removed before the date is returned.

    Args:
        doc: Object exposing ``paragraphs``, each item having a ``text`` attribute.
        fallback_filename: File name used to build the placeholder when no date
            is available from either source.
        filename_report_date: Date string parsed from the file name, or a falsy
            value when none was found.

    Returns:
        The date found in the document as ``D/M/YYYY``-shaped text with no
        whitespace, else ``filename_report_date`` if truthy, else
        ``"UNKNOWN_REPORT_DATE_<fallback_filename>"``.
    """
    full_text = "\n".join(p.text for p in doc.paragraphs)
    sponsor_date_match = re.search(
        r"Date\s*Received\s*by\s*Sponsor\s*(\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4})",
        full_text,
    )
    if sponsor_date_match:
        # \s in the pattern also matches tabs/newlines, so strip every kind of
        # whitespace here, not just the plain space character.
        return re.sub(r"\s+", "", sponsor_date_match.group(1))
    if filename_report_date:
        return filename_report_date
    return f"UNKNOWN_REPORT_DATE_{fallback_filename}"


In [7]:
# --- Step 3: Main extraction function ---

def extract_cioms_summary(docx_path):
    """Build one record per data row of the last table in a CIOMS .docx file.

    The patient code and a fallback date come from the file name; the report
    date comes from ``extract_report_date``. The first row of the last table
    is treated as a header and skipped, as are rows with fewer than three
    cells. The second cell is taken as the PT and the third as the causality
    assessment; empty cells are replaced with ``UNKNOWN_*`` placeholders.

    Args:
        docx_path: Path to the .docx file to read.

    Returns:
        A list of dicts with keys "Patient ID", "Report Date", "Source File",
        "PT" and "Causality Assessment". The list is empty when the file
        cannot be opened or contains no tables (a message is printed in both
        cases).
    """
    filename = os.path.basename(docx_path)
    patient_code, filename_report_date = parse_filename_for_info(filename)

    try:
        document = Document(docx_path)
    except Exception as err:
        print(f"Error reading {filename}: {err}")
        return []

    # --- Report Date ---
    report_date = extract_report_date(document, filename, filename_report_date)

    # --- Find last table (the code assumes the PT table is the last one) ---
    if not document.tables:
        print(f"No tables in {filename}")
        return []
    pt_table = document.tables[-1]

    # The patient identifier is the same for every row of this file.
    patient_id = patient_code or f"UNKNOWN_CODE_{filename}"

    records = []
    for position, table_row in enumerate(pt_table.rows):
        texts = [cell.text.strip() for cell in table_row.cells]
        # Row 0 is the header; rows with fewer than 3 cells cannot hold PT + causality.
        if position == 0 or len(texts) < 3:
            continue

        records.append({
            "Patient ID": patient_id,
            "Report Date": report_date,
            "Source File": filename,  # kept so each file can get its own CIOMS Number later
            "PT": texts[1] or "UNKNOWN_PT",
            "Causality Assessment": texts[2] or "UNKNOWN_CAUSALITY",
        })

    return records


In [8]:
# --- Step 4: Process all files ---
# Walk the extraction directory and collect one dict per PT row from every
# Word report found; all_data is the flat list consumed by the next step.
all_data = []
for root, dirs, files_in in os.walk(extract_path):
    # Sorting dirs in place makes os.walk descend in a reproducible order, and
    # sorting the file names does the same within a directory, so repeated runs
    # see the reports in the same sequence.
    dirs.sort()
    for f in sorted(files_in):
        # Word leaves "~$<name>.docx" owner/lock files next to open documents;
        # they have the right suffix but are not readable reports.
        if f.startswith("~$"):
            continue
        # Case-insensitive match so ".DOCX" / ".Docx" files are not silently skipped.
        if f.lower().endswith(".docx"):
            path = os.path.join(root, f)
            extracted_rows = extract_cioms_summary(path)
            all_data.extend(extracted_rows)

print(f"Extracted {len(all_data)} PT rows.")

Extracted 469 PT rows.


In [9]:
# --- Step 5: Create DataFrame and assign CIOMS Numbers (per file) ---
# Naming the columns explicitly keeps the frame well-formed even when
# all_data is empty (otherwise the sort below raises KeyError).
df = pd.DataFrame(
    all_data,
    columns=["Patient ID", "Report Date", "Source File", "PT", "Causality Assessment"],
)

# Sort by Report Date then Patient ID.
# NOTE(review): "Report Date" holds strings, so this ordering is lexicographic,
# not chronological — confirm whether dates should be parsed before sorting.
df = df.sort_values(by=["Report Date", "Patient ID"], ignore_index=True)

# Assign CIOMS Number per file: factorize labels each distinct source file in
# order of first appearance (0, 1, 2, ...); +1 makes the numbering start at 1.
df["CIOMS Number"] = pd.factorize(df["Source File"])[0] + 1

# Reorder columns for export (Source File is dropped here).
df = df[["Patient ID", "CIOMS Number", "Report Date", "PT", "Causality Assessment"]]


In [11]:
# --- Step 6: Save CSV ---
# Write the summary table to the working directory, then trigger a browser
# download through the google.colab `files` helper.
output_csv = "cioms_summary.csv"
df.to_csv(
    output_csv,
    encoding="utf-8-sig",  # UTF-8 with a leading byte-order mark
    index=False,           # the positional index is not part of the report
)
files.download(output_csv)

print("CSV generated and downloaded successfully.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CSV generated and downloaded successfully.
