In [None]:
from jupyter_client import find_connection_file
# Look up a kernel connection file via jupyter_client and print the path it returns.
# NOTE(review): called with no arguments, this presumably resolves to the most
# recently used kernel-*.json rather than necessarily this kernel's own file — confirm.
connection_file = find_connection_file()
print(connection_file)

In [None]:
import os
import subprocess
import sys

import pandas as pd
import papermill as pm
# Limit printed tracebacks to zero frames.
sys.tracebacklimit = 0


def exception_handler(exception_type, exception, traceback):
    """Report an uncaught exception as one ``<TypeName>, <message>`` line.

    The three parameters are the ones ``sys.excepthook`` hands to its hook.
    ``traceback`` is accepted for that reason but is never printed.
    """
    summary = f"{exception_type.__name__}, {exception}"
    print(summary)


# Install the handler as the hook for uncaught exceptions.
# NOTE(review): IPython kernels render cell errors themselves, so this hook may
# not be consulted for errors raised inside notebook cells — confirm.
sys.excepthook = exception_handler

In [None]:
# Column-name constants for the vaccine table, referenced by the cells below.

# Identifier columns.
dataset_col = "Dataset"
uid_col = "uid"
strain_col = "Strain"

# Per-row covariate / measurement columns.
age_col = "Age"
day_col = "Day"
response_col = "Response"
immage_col = "IMMAGE"

In [None]:
# Read in the data and drop rows missing any of the fields used below.
# dropna() returns a new frame, so re-running this cell always starts from the
# CSV rather than from an already-mutated frame.
df = pd.read_csv("../data/all_vaccines.csv").dropna(
    subset=[immage_col, dataset_col, day_col, response_col]
)
datasets = df

# Count the distinct subject ids (uid) in each dataset; the count column is "N".
N_vals = (
    df[[dataset_col, uid_col]]
    .groupby(dataset_col, as_index=False)[uid_col]
    .nunique()
    .rename(columns={uid_col: "N"})
)

# N_vals keeps only datasets with more than MIN_SUBJECTS distinct subjects.
# This filter is applied whether or not bNarrow is set.
MIN_SUBJECTS = 70
N_vals = N_vals.loc[N_vals["N"] > MIN_SUBJECTS]

# Optionally narrow the working data to those large datasets only.
bNarrow = False
if bNarrow:
    datasets = df.loc[df[dataset_col].isin(N_vals[dataset_col])]
dataset_names = datasets[dataset_col].unique()

In [None]:
# Day labels to analyse for each dataset, collected from the source papers.
# Keys are dataset names; each value is the list of day labels for that dataset.
dataset_day_dict = {
    "GSE125921.SDY1529": ["FC", "D84"],
    "GSE13485.SDY1264": ["D60"],
    "GSE13699.SDY1289": ["D28"],
    "GSE169159": ["FC.D42", "D42"],
    "GSE41080.SDY212": ["HAI.D28"],
    "GSE45735.SDY224": ["HAI.D21"],
    "GSE47353.SDY80": ["D70.nAb", "FC.D70.nAb"],
    "GSE48018.SDY1276": ["nAb.D28", "nAb.FC"],
    "GSE48023.SDY1276": ["nAb.FC", "nAb.D14"],
    "GSE59635.SDY63": ["HAI.D28"],
    "GSE59654.SDY180": ["FC.HAI", "HAI.D28"],
    "GSE59654.SDY404": ["FC.HAI", "HAI.D28"],
    "GSE59654.SDY520": ["FC.HAI", "HAI.D28"],
    "GSE59743.SDY400": ["FC.HAI", "HAI.D28"],
    "GSE65834.SDY1328": ["D7", "FC"],
    "GSE79396.SDY984": ["D28", "FC.D28"],
    "GSE82152.SDY1294": ["D28", "FC"],
    "SDY1325": ["FC.D28", "D28"],
    "SDY296": ["D28.nAb", "FC.nAb"],
    "SDY67": ["nAb.D28", "FC.D28.nAb"],
    "SDY89": ["D28"],
}

# Rebind dataset_names to the dictionary's keys view.
dataset_names = dataset_day_dict.keys()

In [None]:
# Keep only influenza datasets for now
# influenza_sets = ['GSE41080.SDY212', 'GSE48018.SDY1276', 'GSE48023.SDY1276', 'SDY67', 'GSE125921.SDY1529', 'GSE45735.SDY224', 'GSE47353.SDY80', 'GSE48023.SDY1276', 'GSE59635.SDY63', 'GSE59654.SDY404', 'GSE59743.SDY400', 'SDY296']

bInfluenza = True
if bInfluenza:
    # One record per influenza dataset: its name, the day labels of interest
    # and the label of its baseline day.
    # NOTE(review): "Days" and "Day0" are not read anywhere in the visible
    # cells; the export loop takes its days from dataset_day_dict — confirm
    # which of the two is meant to drive the export.
    # NOTE(review): GSE59743.SDY400 has the *string* "None" as Day0, not the
    # None object — confirm that is intended.
    influenza_dict = [
        {"Dataset": "GSE125921.SDY1529", "Days": ["FC", "D84"], "Day0": "D0"},
        {"Dataset": "GSE41080.SDY212", "Days": ["FC.HAI", "HAI.D28"], "Day0": "HAI.D0"},
        {"Dataset": "GSE45735.SDY224", "Days": ["FC.HAI", "HAI.D21"], "Day0": "HAI.D0"},
        # start with sets that have a HAI measurement
        # {"Dataset": "GSE47353.SDY80", "Days": ["D70.nAb", "FC.D70.nAb"], "Day0": "D0.nAb"},
        {"Dataset": "GSE48018.SDY1276", "Days": ["HAI.D28", "HAI.FC"], "Day0": "HAI.D0"},
        {"Dataset": "GSE48023.SDY1276", "Days": ["HAI.FC", "HAI.D28"], "Day0": "HAI.D0"},
        {"Dataset": "GSE59635.SDY63", "Days": ["FC", "HAI.D28"], "Day0": "HAI.D0"},
        {"Dataset": "GSE59654.SDY404", "Days": ["HAI.D28", "FC.HAI"], "Day0": "HAI.D0"},
        {"Dataset": "GSE59743.SDY400", "Days": ["FC.HAI", "HAI.D28"], "Day0": "None"},
        {"Dataset": "SDY296", "Days": ["D28.HAI", "FC.HAI"], "Day0": "D0.HAI"},
        {"Dataset": "SDY67", "Days": ["FC.D28.HAI", "HAI.D28"], "Day0": "HAI.D0"}
    ]

    tmp_df = pd.DataFrame(influenza_dict)
    # Narrow the loaded data (df) to rows belonging to the influenza datasets.
    # The previous filter compared tmp_df with itself, which left `datasets`
    # holding this metadata table (no day/strain/response columns) instead of
    # the measurements the export loop needs.
    datasets = df.loc[df[dataset_col].isin(tmp_df["Dataset"])]
    dataset_names = datasets[dataset_col].unique()
    # df['Days'] = df['Days'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)  # Convert list to string for display

In [None]:
def _export_report(dataset_name, strain_index, strain_name, day):
    """Prepare one parameterised analysis notebook and render it to HTML.

    papermill writes a parameter-injected copy of ``vaccines-4.ipynb`` without
    running it (``prepare_only=True``); nbconvert then executes that copy and
    converts it to HTML with the code inputs hidden.

    Args:
        dataset_name: Dataset identifier passed to the notebook.
        strain_index: Position of the strain in the unique-strain array.
        strain_name: File-name-safe strain label, used only in output names.
        day: Day label passed to the notebook.

    Returns:
        True if both steps succeeded, False otherwise. Failures are printed
        rather than raised so the calling loop can move on to the next export.
    """
    parameters = {"dataset_name": dataset_name, "strain_index": strain_index, "day": day}
    output_notebook = f"export/{dataset_name}_{strain_name}_{day}_analysis.ipynb"

    try:
        pm.execute_notebook(
            input_path="vaccines-4.ipynb",
            output_path=output_notebook,
            parameters=parameters,
            prepare_only=True,
        )
    except Exception as exc:  # keep going, but say what went wrong
        print(
            f"******\nCaught exception when running {output_notebook}\n"
            f"{type(exc).__name__}: {exc}\n******\n"
        )
        # No prepared notebook was produced, so there is nothing to convert.
        return False

    output_html = f"{dataset_name}_{strain_name}_{day}_analysis.html"
    # Pass arguments as a list so data-derived names are never parsed by a shell.
    command = [
        "jupyter", "nbconvert", "--execute", "--no-input",
        "--to", "html", output_notebook, "--output", output_html,
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as exc:
        print(f"******\nCould not launch nbconvert for {output_notebook}: {exc}\n******\n")
        return False
    if completed.returncode != 0:
        print(
            f"******\nnbconvert exited with status {completed.returncode} "
            f"for {output_notebook}\n******\n"
        )
        return False
    return True


# Make sure the output folder exists before notebooks are written into it.
os.makedirs("export", exist_ok=True)

# Loop through each combination of dataset, day and strain
for dataset_name in dataset_names:
    dataset = datasets.loc[datasets[dataset_col] == dataset_name]
    print(dataset_name)
    for day in dataset_day_dict[dataset_name]:
        print(day)
        # `dataset` is already restricted to dataset_name; only the day filter remains.
        data = dataset.loc[dataset[day_col] == day].reset_index()
        strains = data[strain_col].unique()
        print(strains)
        for strain_index, strain in enumerate(strains):
            # Strain labels can contain "/" and spaces, which are unsafe in file names.
            strain_name = strain.replace("/", "_").replace(" ", "_")
            print(f'exporting {dataset_name}, strain no. {strain_index}: {strain_name}, day: {day}')
            _export_report(dataset_name, strain_index, strain_name, day)