In [None]:
import os
import pandas as pd
from datetime import datetime

In [None]:
# Date stamp (day_month_year) used to name today's output folder.
current_date = datetime.now().strftime("%d_%m_%Y")
print(current_date)

# Every export of this run is written underneath folder_path.
folder_name = f"dashboard_files_{current_date}"
folder_path = os.path.join(r"C:\Users\O304312\Documents\Dashboard Files", folder_name)
if os.path.exists(folder_path):
    print(f"Folder already exists: {folder_path}")
else:
    os.makedirs(folder_path)
    print(f"Folder created at: {folder_path}")

In [None]:
"""Reports run on a two month lag (1 month then we wait till the 11th for the DOR reports.) So we are setting our file naming and data extraction accordingly."""

# The cut-off dates must sit exactly at midnight. datetime.today() carries the
# current time of day; if it is left in place, rows dated on the first of last
# month (at 00:00) compare as "< first_of_last_month" and leak into the
# filtered exports.
today = datetime.today()
first_of_this_month = today.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
first_of_last_month = (first_of_this_month - pd.DateOffset(months=1)).replace(day=1)

# Name of the previous calendar month, lower-case and capitalised.
month = first_of_last_month.strftime("%B").lower()
print(month)
cap_month = month.capitalize()
print(cap_month)

In [None]:
# Path of the "Snapshot Datasources" folder on the J: drive.
snapshot_ds = r"J:\ONELINK\Snapshot Datasources"

In [None]:
# Raw visit export. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks.
visit_data_csv = r"C:\Users\O304312\Downloads\Data Table Visit Data.csv"
visit_data = pd.read_csv(visit_data_csv, low_memory=False)

print(visit_data.columns)

In [None]:
"""Aligning to current Tableau Dashboard Naming Conventions. Real fix is to change it to reflect Viewpoint report changes"""

# Columns kept from the raw export, in output order.
visit_columns = [
    "Site Name",
    "Site Study Code",
    "Subject ID",
    "Participant Status",
    "Participant Protocol Arm",
    "Visit Period",
    "Visit Name",
    "Visit Status",
    "Visit Completed Date",
]
visit_selected = visit_data[visit_columns]
print(visit_selected.dtypes)

# Export column name -> name written to the output file.
visit_renames = {
    "Site Name": "Site",
    "Site Study Code": "Site Study ID",
    "Participant Protocol Arm": "Arm",
}

# Rename, parse the completion date (unparseable values become NaT) and keep
# only visits completed strictly before the reporting cut-off. Rows whose date
# is NaT fail the comparison and are therefore dropped as well.
visit_data = (
    visit_selected.rename(columns=visit_renames)
    .assign(
        **{
            "Visit Completed Date": lambda frame: pd.to_datetime(
                frame["Visit Completed Date"], errors="coerce"
            )
        }
    )
    .loc[lambda frame: frame["Visit Completed Date"] < first_of_last_month]
)

visit_output_csv = f"{folder_path}/Visit Data_data.csv"
visit_data.to_csv(visit_output_csv, index=False)

In [None]:
# Raw study accrual export, parsed in a single pass (low_memory=False).
study_accruals_csv = r"C:\Users\O304312\Downloads\Data Table Study Accrual.csv"
study_accruals = pd.read_csv(study_accruals_csv, low_memory=False)

print(study_accruals.columns)

In [None]:
# Columns kept from the raw accrual export, in output order.
accrual_columns = [
    "Site Name",
    "Sponsor Site Number",
    "Network Study Code",
    "Network Study Status",
    "Site Study Code",
    "Site Study Status",
    "Study Title",
    "Study Phase",
    "Study Type",
    "Funding Source",
    "Site IRB Name",
    "Site IRB Status",
    "Site IRB Expiration Date",
    "IRB Submission Number",
    "Principal Investigators",
    "Lead Coordinators",
    "Site Study Start Date",
    "Site Study End Date",
    "Site Enrollment Start Date",
    "Site Enrollment End Date",
    "Study Therapeutic Areas",
    "Study Therapeutic Area Details",
    "Study Sponsors",
    "Site Enrollment Target",
    "CRO Name",
    "Total Patients Prescreened Sum",
    "Failed Prescreening Sum",
    "Total Participants Sum",
    "In Screening Sum",
    "Failed Screening Sum",
    "In Treatment Sum",
    "Completed Sum",
    "Follow Up Sum",
    "Off Study Sum",
    "Long Term Follow Up Sum",
]

# Export column name -> name written to the output file.
accrual_renames = {
    "Site Name": "Site",
    "Site Study Code": "Site Study ID",
    "Site Enrollment Target": "Site Enrollment Targets",
}

study_accruals = study_accruals.loc[:, accrual_columns].rename(columns=accrual_renames)

print(study_accruals.dtypes)

accrual_output_csv = f"{folder_path}/Study Accrual Data.csv"
study_accruals.to_csv(accrual_output_csv, index=False)

In [None]:
# Raw participant information export, parsed in a single pass (low_memory=False).
partis_info_csv = r"C:\Users\O304312\Downloads\Data Table Participant Information.csv"
partis_info = pd.read_csv(partis_info_csv, low_memory=False)

print(partis_info.columns)

In [None]:
# Columns kept from the raw participant export, in output order.
participant_columns = [
    "Subject ID",
    "Site Study Code",
    "Participant Status",
    "Participant Status Date",
    "Participant Latest Consent Date",
    "Participant Consent Status",
]
# Values that stand in for "no date" in the export.
blank_markers = ["", "0", 0]
# Optional column used to backfill a missing consent date.
fallback_column = "Latest Screen Failure Date"

# Build the consent date from the RAW frame, before the column selection:
# the fallback column is not part of participant_columns, so checking for it
# after the selection (as this cell used to) could never succeed.
consent_date = partis_info["Participant Latest Consent Date"].replace(
    blank_markers, pd.NA
)
if fallback_column in partis_info.columns:
    consent_date = consent_date.fillna(
        partis_info[fallback_column].replace(blank_markers, pd.NA)
    )

# .copy() gives an independent frame so the assignment below does not write
# into a slice of the raw export.
partis_info = partis_info[participant_columns].copy()
partis_info["Participant Latest Consent Date"] = consent_date

partis_info = partis_info.rename(
    columns={
        "Site Study Code": "Site Study ID",
        "Participant Status Date": "Current Status Date",
        "Participant Latest Consent Date": "Consent Date",
        "Participant Consent Status": "Consent Result",
    }
)

# Unparseable values become NaT.
partis_info["Consent Date"] = pd.to_datetime(
    partis_info["Consent Date"], errors="coerce"
)

# Keep consents strictly before the reporting cut-off; rows whose consent date
# is still NaT fail the comparison and are dropped.
partis_info = partis_info[partis_info["Consent Date"] < first_of_last_month]

partis_info.to_csv(
    f"{folder_path}/Participant Information Data Sheet.csv",
    index=False,
)

In [None]:
# Raw transaction export, parsed in a single pass (low_memory=False).
transactions = pd.read_csv(
    r"C:\Users\O304312\Downloads\Transaction.csv", low_memory=False
)

# "Unnamed: 0" is the name pandas assigns to a first column with a blank
# header (typically a saved index). errors="ignore" keeps the cell working
# when the export was written without that column.
transactions = transactions.drop(columns=["Unnamed: 0"], errors="ignore")
print(transactions.columns)

In [None]:
# Blank out the "no date" markers BEFORE parsing. Once the column is
# datetime64 the string/zero markers cannot match anything, and an integer 0
# would already have been parsed as 1970-01-01.
completed_raw = transactions["Accountable Completed Date"].replace(
    ["", "0", 0], pd.NA
)
transactions["Accountable Completed Date"] = pd.to_datetime(
    completed_raw, errors="coerce"
)

# Keep rows completed strictly before the cut-off plus rows with no completed
# date. .copy() makes the result an independent frame so the .loc assignment
# below does not write into a slice.
keep_rows = (transactions["Accountable Completed Date"] < first_of_last_month) | (
    transactions["Accountable Completed Date"].isna()
)
transactions = transactions.loc[keep_rows].copy()

# Backfill missing completed dates from the transaction creation date.
# NOTE(review): backfilled dates are not compared with first_of_last_month, so
# a row created after the cut-off is still exported - confirm this is intended.
mask = transactions["Accountable Completed Date"].isna()
transactions.loc[mask, "Accountable Completed Date"] = pd.to_datetime(
    transactions.loc[mask, "Transaction Created Date"], errors="coerce"
)

num_blank_rows = transactions["Accountable Completed Date"].isna().sum()
print(f"Number of blank rows in 'Accountable Completed Date': {num_blank_rows}")

transactions = transactions.drop(columns=["Transaction Created Date"])

# Treat blank/zero account codes as missing, then default them to 47205.
account_code = transactions["Account Code"].replace(["", "0", 0], pd.NA)
transactions["Account Code"] = account_code.fillna(47205)

# Every "StudyActivity" line item is forced to 47206, whatever code it had.
is_study_activity = transactions["Transaction Line Item"] == "StudyActivity"
transactions.loc[is_study_activity, "Account Code"] = 47206

In [None]:
import glob
import re

# Wildcard pattern for the reconciled CTP project list workbooks.
ctp_list_pattern = r"J:\ADMIN-eFILES\CHEN_W154867_VXC\zzz_CTP Projects List\DOR Finance CTP List\CTP Project List Reconciled *.xlsx"
excel_files = glob.glob(ctp_list_pattern)


def extract_date(filename):
    """Return the six digits directly before a trailing ".xlsx".

    Args:
        filename: File name or path to inspect.

    Returns:
        The six-digit group as a string, or "" when the name does not end
        in six digits followed by ".xlsx".
    """
    found = re.search(r"(\d{6})\.xlsx$", filename)
    if found is None:
        return ""
    return found.group(1)


# Newest workbook first. The sort key is the six-digit stamp compared as text;
# names without a stamp yield "" and therefore sort last.
# NOTE(review): text ordering only equals date ordering if the stamp is
# year-first (e.g. YYMMDD) - confirm the file naming convention.
excel_files_sorted = sorted(excel_files, key=extract_date, reverse=True)

# Fail here rather than printing and carrying on: without a workbook
# dor_ctp_projects is never assigned, and the following cells would either
# raise NameError or silently reuse a stale value left in the kernel.
if not excel_files_sorted:
    raise FileNotFoundError("No matching Excel files found.")

latest_excel = excel_files_sorted[0]
dor_ctp_projects = pd.read_excel(latest_excel)
print(f"Loaded file: {latest_excel}")

In [None]:
print(transactions.columns)
print(dor_ctp_projects.columns)

# Reduce the project list to the distinct (ProjectID, IDC Rate) pairs.
rate_columns = ["ProjectID", "IDC Rate"]
dor_ctp_projects = dor_ctp_projects[rate_columns].drop_duplicates()

In [None]:
"""Adding in OH rate based on what the overhead rate was at the time the study was created"""

# Rows without a ProjectID are removed from the rate table first: pandas
# matches a null key to a null key, so they would otherwise attach to every
# transaction that has no Service Line Code.
rate_lookup = dor_ctp_projects.dropna(subset=["ProjectID"])

# validate="many_to_one" raises MergeError if a ProjectID appears more than
# once in the rate table (e.g. listed with two different IDC rates). Without
# it such a project silently duplicates its transaction rows and inflates the
# exported amounts.
transactions = pd.merge(
    transactions,
    rate_lookup,
    left_on="Service Line Code",
    right_on="ProjectID",
    how="left",
    validate="many_to_one",
)

# Non-numeric amounts and rates become NaN.
transactions["Transaction Amount"] = pd.to_numeric(
    transactions["Transaction Amount"], errors="coerce"
)
transactions["IDC Rate"] = pd.to_numeric(transactions["IDC Rate"], errors="coerce")

# Gross up AdHoc lines booked to 47205 by (1 + IDC rate); a missing rate is
# treated as 0, leaving the amount unchanged.
mask = (transactions["Transaction Line Item"] == "AdHoc") & (
    transactions["Account Code"] == 47205
)
transactions.loc[mask, "Transaction Amount"] = transactions.loc[
    mask, "Transaction Amount"
] * (1 + transactions.loc[mask, "IDC Rate"].fillna(0))

In [None]:
# Same file the later lookup cells read; the path is written as a raw string
# with one separator style instead of the previous mix of "\\" and "//".
project_id_list = pd.read_csv(
    r"J:\ONELINK\Snapshot Datasources\SignalPath ProjectID Lookup.csv",
    low_memory=False,
    encoding="cp1252",
)
print(project_id_list.columns)

# NOTE(review): if the lookup holds several rows with the same "Project ID",
# this left merge repeats the matching transaction rows - confirm the lookup
# is unique per project for the codes that occur in "Service Line Code".
transactions = pd.merge(
    transactions,
    project_id_list,
    left_on="Service Line Code",
    right_on="Project ID",
    how="left",
)

print(transactions.columns)

# Only the header of the template file is needed (it defines the output
# column set and order), so nrows=0 avoids loading its data rows.
list_csv = pd.read_csv(
    r"J:\ONELINK\Snapshot Datasources\04 2025 April\_Transaction Data april 2025.csv",
    nrows=0,
)

columns_list = list_csv.columns.tolist()

print(columns_list)

# Recreate "Transaction Created Date" from the completed date, then rename to
# the column names used by the template.
transactions["Transaction Created Date"] = transactions["Accountable Completed Date"]
transactions = transactions.rename(
    columns={
        "Transaction Amount": "Amount",
        "Site Study ID": "Site Protocol Version Desc",
    }
)

# Raises KeyError if any template column is missing from transactions.
transactions = transactions.loc[:, columns_list]

In [None]:
# Write the aligned transaction table into this run's output folder.
transactions_output_csv = f"{folder_path}/_Transaction Data.csv"
transactions.to_csv(transactions_output_csv, index=False)

In [None]:
"""Checking available datasources for RNG numbers for studies. Not efficient but requires data management and creation processes to be updated to fix"""

# Distinct (Service Line Code, Site Study Code) pairs seen in the transactions.
study_key_columns = ["Service Line Code", "Site Study Code"]
studies = transactions.loc[:, study_key_columns].drop_duplicates()

In [None]:
# Raw string keeps the backslashes in the Windows path literal.
project_list_xlsx = r"J:\ONELINK\Snapshot Datasources\CTP Project List Reconciled.xlsx"
project_list = pd.read_excel(project_list_xlsx)

# Only the status and funding type per project are used downstream.
project_list = project_list.loc[:, ["ProjectID", "Project Status", "Funding Type"]]

In [None]:
# The TRAC export is UTF-16 JSON; its "TRAC_Data" entries are flattened into
# one row per record.
trac_raw = pd.read_json(r"J:\TRAC\TRAC_Data.json", encoding="utf-16")
trac_id = pd.json_normalize(trac_raw["TRAC_Data"])

# Distinct ProjectID / ReferenceNum pairs.
lookup_table = trac_id.loc[:, ["ProjectID", "ReferenceNum"]].drop_duplicates()

In [None]:
signalpath_lookup = pd.read_csv(
    r"J:\ONELINK\Snapshot Datasources\SignalPath ProjectID Lookup.csv",
    low_memory=False,
    encoding="cp1252",
)

print(f"Number of rows in signalpath_lookup: {len(signalpath_lookup)}")

# Export a backup of the file in case the update process goes off the rails.
# index=False keeps the backup column-for-column identical to the source; the
# default would prepend the row index as an extra unnamed column, so a
# restored backup would no longer match the original layout.
signalpath_lookup.to_csv(
    f"{folder_path}/SIGNALPATH LOOKUP BACKUP DO NOT COPY.csv",
    index=False,
)

In [None]:
# Studies present in the transactions but absent from the lookup file.
# .copy() makes this an independent frame so the .loc assignment below does
# not write into a slice of `studies`.
not_in_lookup = ~studies["Site Study Code"].isin(signalpath_lookup["Site Study ID"])
missing_site_study_codes = studies.loc[not_in_lookup].copy()

# Where the service line code is blank but the study code embeds an RNG number
# ("RNG" followed by six digits), use that number as the service line code.
mask_rng = (
    missing_site_study_codes["Site Study Code"].str.contains(r"RNG\d{6}", regex=True, na=False)
    & (missing_site_study_codes["Service Line Code"].isna() | (missing_site_study_codes["Service Line Code"] == ""))
)
missing_site_study_codes.loc[mask_rng, "Service Line Code"] = (
    missing_site_study_codes.loc[mask_rng, "Site Study Code"].str.extract(r"(RNG\d{6})", expand=False)
)

# Reshape the missing studies to the lookup's column names and append them.
to_append = missing_site_study_codes[["Site Study Code"]].rename(
    columns={"Site Study Code": "Site Study ID"}
)
to_append["Project ID"] = missing_site_study_codes["Service Line Code"].values

signalpath_lookup = pd.concat([signalpath_lookup, to_append], ignore_index=True)

# One row per Site Study ID: prefer a "Non-Industry" row when the ID has one,
# otherwise keep its first row. The previous filter removed EVERY row of a
# duplicated ID that had no "Non-Industry" entry, so e.g. a new study listed
# under two service line codes vanished from the lookup altogether.
is_non_industry = signalpath_lookup["Project ID"].eq("Non-Industry")
# Stable sort on the negated flag puts "Non-Industry" rows first while keeping
# the original relative order inside each group.
preferred_first = (~is_non_industry).sort_values(kind="stable").index
signalpath_lookup = (
    signalpath_lookup.loc[preferred_first]
    .drop_duplicates(subset=["Site Study ID"], keep="first")
    .sort_index()
)

In [None]:
# Step 1: exact match of Site Study ID against the TRAC reference number.
signalpath_lookup = signalpath_lookup.merge(
    lookup_table, left_on="Site Study ID", right_on="ReferenceNum", how="left"
)

# Step 2: for rows still unmatched, look for a reference number contained
# inside the Site Study ID. The candidate list is built once, outside the loop.
# Null and blank reference numbers are skipped: an empty string is "contained"
# in every ID and would match all rows.
ref_rows = lookup_table.loc[
    lookup_table["ReferenceNum"].notna(), ["ReferenceNum", "ProjectID"]
]
ref_candidates = [
    (str(ref), ref, project_id)
    for ref, project_id in ref_rows.itertuples(index=False, name=None)
    if str(ref).strip()
]

mask_no_match = signalpath_lookup["ReferenceNum"].isna()
for idx, site_study_id in signalpath_lookup.loc[mask_no_match, "Site Study ID"].items():
    # A missing / non-text Site Study ID cannot contain anything; the old
    # `x in row[...]` test raised TypeError for such rows.
    if not isinstance(site_study_id, str):
        continue
    for ref_text, ref_value, project_id in ref_candidates:
        if ref_text in site_study_id:
            # First candidate in lookup_table order wins.
            signalpath_lookup.at[idx, "ProjectID"] = project_id
            signalpath_lookup.at[idx, "ReferenceNum"] = ref_value
            break

# Step 3: where the lookup has no 'Project ID' but TRAC supplied a 'ProjectID',
# adopt the TRAC value.
mask_fill = (
    signalpath_lookup["Project ID"].isna() & signalpath_lookup["ProjectID"].notna()
)
signalpath_lookup.loc[mask_fill, "Project ID"] = signalpath_lookup.loc[
    mask_fill, "ProjectID"
]

In [None]:
# Attach project status and funding type to each Site Study ID / Project ID pair.
signalpath_lookup = signalpath_lookup[["Site Study ID", "Project ID"]].merge(
    project_list, left_on="Project ID", right_on="ProjectID", how="left"
)

# Drop rows whose project status is "C" or "E". .copy() makes the filtered
# result an independent frame so the .loc assignment below does not write
# into a slice (SettingWithCopyWarning).
open_projects = ~signalpath_lookup["Project Status"].isin(["C", "E"])
signalpath_lookup = signalpath_lookup.loc[open_projects].copy()

# Federal and foundation funded projects are labelled "Non-Industry".
signalpath_lookup.loc[
    signalpath_lookup["Funding Type"].isin(["FEDERAL", "FOUNDATION"]), "Project ID"
] = "Non-Industry"

# Back to the two lookup columns, without empty project IDs or repeated pairs.
signalpath_lookup = (
    signalpath_lookup[["Site Study ID", "Project ID"]]
    .dropna(subset=["Project ID"])
    .drop_duplicates()
)

In [None]:
# Lookup as it currently exists on disk.
sp_lookup = pd.read_csv(
    r"J:\ONELINK\Snapshot Datasources\SignalPath ProjectID Lookup.csv",
    low_memory=False,
    encoding="cp1252",
)

# New rows = studies in the freshly built signalpath_lookup that the file on
# disk does not contain yet. The previous version selected rows FROM sp_lookup
# and then removed those whose Project ID occurs in sp_lookup - which is all
# of them - so nothing was ever appended and the "updated" file was a plain
# copy of the input.
missing_ids = signalpath_lookup.loc[
    ~signalpath_lookup["Site Study ID"].isin(sp_lookup["Site Study ID"]),
    ["Site Study ID", "Project ID"],
]

# Only add studies whose Project ID is not already present in the file.
# NOTE(review): this also keeps out new studies labelled "Non-Industry" once
# that label exists in the file - confirm that is the intended rule.
missing_ids = missing_ids[~missing_ids["Project ID"].isin(sp_lookup["Project ID"])]
print(missing_ids)

sp_lookup = pd.concat([sp_lookup, missing_ids], ignore_index=True)

print(f"Number of rows in new signalpath_lookup: {len(sp_lookup)}")

sp_lookup.to_csv(
    f"{folder_path}/SignalPath ProjectID Lookup.csv",
    index=False,
)