* Positive (final) candidates are taken from `ExtraFilterRerank-llmOutputs.ipynb`, where they were selected with:
  ```
   df = df.loc[(df['answer'])| (df[['novel','plausible']].min(axis=1)>0)]
    df = df.loc[(df['p_val']<=0.3)| ((df['MutualInfoTarget']>=0.003) & (df['feature_importance']>=0.003))]

    clean_feats = deduplicate_texts(df["raw_name"], use_difflib=True, string_cutoff=0.95, distance_threshold=0.8)
  ```

In [1]:
import pandas as pd
from configs import *

In [2]:
# !conda install openpyxl -y

In [3]:
def corr_direction(row):
    """Translate the sign of ``row['corr']`` into a direction label.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) exposing a
            ``'corr'`` entry that supports comparison with 0.

    Returns:
        str: ``'positive'`` when the value is above zero, ``'negative'`` when
        it is below zero, and ``'neutral'`` in every other case. Note that a
        NaN value fails both comparisons and therefore also yields
        ``'neutral'``.
    """
    value = row['corr']

    if value > 0:
        return 'positive'
    if value < 0:
        return 'negative'
    # Reached for exactly zero, and for NaN (both comparisons above are False).
    return 'neutral'

import pandas as pd
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation

def prepare_annotation_sheet(input_df, output_file, annotation_columns={
                                       "Novel": "numeric",
                                       "Plausible/Makes Sense": "numeric",
                                        "Utility": "numeric",
                                       "Interesting": "numeric",
                                   }):
    """
    Write ``input_df`` to an Excel file and append empty annotation columns
    whose cells are restricted to a dropdown of allowed values.

    Args:
        input_df (pd.DataFrame): The input dataframe with base data. It is
            written without its index, headers in row 1.
        output_file (str): Path to save the Excel file.
        annotation_columns (dict): Dictionary where keys are the headers of the
                                   columns to append (to the right of the data)
                                   and values are validation types:
                                   ``"boolean"`` -> dropdown of True/False,
                                   ``"numeric"`` -> dropdown of 1,2,3,4.
                                   Example:
                                   {
                                       "Novel": "boolean",
                                       "Plausible": "boolean",
                                       "Interesting": "numeric",
                                       "Utility": "boolean"
                                   }
                                   The default dict is only read, never mutated.

    Returns:
        None: Saves the Excel file with data validation applied and prints the
        output path.

    Raises:
        ValueError: If any validation type is not ``"boolean"`` or
            ``"numeric"``. This is checked before anything is written, so no
            partial file is left behind.
    """
    # Dropdown formula and matching error text for each supported validation type.
    # (Previously the "1-4" message was also shown for True/False columns.)
    validation_specs = {
        "boolean": ('"True,False"', 'Invalid input: Please choose True or False.'),
        "numeric": ('"1,2,3,4"', 'Invalid input: Please choose a value between 1-4.'),
    }

    # Fail fast: reject unknown validation types before touching the filesystem.
    for validation_type in annotation_columns.values():
        if validation_type not in ("boolean", "numeric"):
            raise ValueError(f"Unsupported validation type: {validation_type}")

    # Save the dataframe to an Excel file
    input_df.to_excel(output_file, index=False, engine="openpyxl")

    # Load the workbook to add Excel logic
    wb = load_workbook(output_file)
    ws = wb.active

    # Annotation columns start immediately to the right of the data columns.
    first_annotation_col = len(input_df.columns) + 1
    for col_index, (col_name, validation_type) in enumerate(
        annotation_columns.items(), start=first_annotation_col
    ):
        # Add column header
        ws.cell(row=1, column=col_index, value=col_name)

        # Define data validation as a fixed list of choices.
        formula, error_text = validation_specs[validation_type]
        validation = DataValidation(
            type="list",
            formula1=formula,
            # In openpyxl this flag is an alias of `hide_drop_down`, so False
            # keeps the in-cell dropdown arrow visible.
            showDropDown=False,
            allow_blank=True,
        )
        validation.showInputMessage = True
        validation.showErrorMessage = True
        validation.error = error_text
        validation.errorTitle = 'Invalid Entry'
        validation.errorStyle = 'stop'         # or 'warning' / 'information'

        # Attach the rule to every data row of this column (row 1 holds the headers).
        for row in range(2, len(input_df) + 2):
            validation.add(ws.cell(row=row, column=col_index))

        # Add the validation rule to the worksheet
        ws.add_data_validation(validation)

    # Save the updated workbook
    wb.save(output_file)
    print(f"Excel sheet with annotation logic saved to {output_file}.")


In [4]:
## import from configs
# Configurations to export, processed in this order; each one is indexed by
# 'OUTPUT_RES_PREFIX' in the export loop.
all_configs = [
    config_gall,
    config_celiac,
    config_gout,
    config_spine,
    config_oesophagus,
    config_eye_occ,
    config_depression,
    config_heart,
]

# Maximum number of positive candidates kept per config (applied via .head()).
MAX_FINAL_CANDIDATES = 50

In [5]:
# Fixed seed for the row shuffle so that re-running the notebook reproduces the
# same row order in both the uncensored CSV and the annotator sheet.
SHUFFLE_SEED = 42

for config in all_configs:
    prefix = config['OUTPUT_RES_PREFIX']
    print(prefix)

    # --- Positive candidates (Real = 1) ---
    df_pos = pd.read_csv(f"./Outputs/llm_reranked/subset/{prefix}_ranked.csv")
    num_pos = df_pos.shape[0]
    print(num_pos,"# Positive candidates (pre limit)")
    # Build a fresh frame instead of mutating the .head() slice in place
    # (avoids SettingWithCopy problems).
    df_pos = (
        df_pos.head(MAX_FINAL_CANDIDATES)
        .rename(columns={'step_by_step_explanation': "explanation", "target": 'Target'})
        .assign(Real=1)
    )

    # --- Negative candidates (Real = 0) ---
    df_neg = pd.read_csv(f"./Outputs/Negatives+Explain/neg_explanations-{prefix}.csv")
    ## get 20% of real cases as negatives:
    # NOTE(review): num_pos is the count BEFORE the MAX_FINAL_CANDIDATES limit, so
    # when more than MAX_FINAL_CANDIDATES positives exist the negatives exceed 20%
    # of the positives actually kept - confirm this is intended.
    df_neg = (
        df_neg.head(num_pos // 5)
        .drop(columns=["feature_name", "prompt"], errors="ignore")
        .assign(Feature_Name=lambda d: d["raw_name"], Real=0)
    )

    # Mix positives and negatives, then shuffle so row order does not reveal the label.
    # NOTE(review): in the displayed eye_occ output the negatives carry a different
    # Target string than the positives, which could still reveal the label to
    # annotators - verify the Target text is consistent across both inputs.
    df = pd.concat([df_pos, df_neg], ignore_index=True).sample(frac=1, random_state=SHUFFLE_SEED)
    # Literal (non-regex) removal; explicit because the default differs across pandas versions.
    df["Target"] = df["Target"].str.replace("AND", "", regex=False)
    df["correlation"] = df.apply(corr_direction, axis=1)

    # Full table (including the Real label and the shuffled index) for later analysis.
    df.to_csv(f"./Outputs/for_annotators/uncensored/{prefix}candidates_uncensored.csv")

    # Annotator view: only the columns annotators should see, plus a free-text column.
    # .copy() so adding "Comments" does not write into a slice of the full table.
    df = df[['Feature_Name', 'explanation', "correlation", 'Target']].copy()
    df["Comments"] = ""

    prepare_annotation_sheet(
        input_df=df,
        output_file=f"./Outputs/for_annotators/{prefix}candidates.xlsx",
        annotation_columns={
            "Novel?": "numeric",
            "Plausible-Makes sense?": "numeric",
            "Utility?": "numeric",
            "Interesting?": "numeric",
        },
    )
    display(df)

gallstone_
43 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/gallstone_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
28,missing_Essential hypertension,The feature 'Essential hypertension' has a neg...,negative,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
17,Home area population density - urban or rural_...,1. **Novelty:** The feature 'Home area populat...,negative,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
35,Skin colour_Very fair,The feature 'Skin colour Very fair' is conside...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
45,Actinic keratosis,Actinic keratosis is interesting because it ha...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
3,Frequency of tiredness / lethargy in last 2 we...,1. **Novelty:** The feature 'Frequency of tire...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
1,Apolipoprotein A Blood biochemistry,The feature 'Apolipoprotein A Blood biochemist...,negative,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
24,(BC) breast cancer genetic risk,The feature 'BC breast cancer genetic risk' be...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
26,Sodium in urine,The feature 'Sodium in urine' is evaluated for...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
40,Cholesterol in Medium HDL,The feature 'Cholesterol in Medium HDL' is eva...,negative,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,
30,(AST) asthma genetic risk,1. **Novelty:** The association between asthma...,positive,(Cholelithiasis) OR (Gallstone) OR (Gallbladde...,


celiac_
49 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/celiac_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
46,Concentration of HDL Particles,1. **Novelty:** The relationship between HDL p...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
28,missing_Z90.4 - Acquired absence of other part...,The feature 'Z90.4 - Acquired absence of other...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
30,Falls in the last year_Only one fall,The feature 'Falls in the last year Only one f...,positive,(celiac disease) OR (Coeliac) OR (gluten all...,
44,Cholesterol in Very Small VLDL,The feature 'Cholesterol in Very Small VLDL' i...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
24,missing_Direct bilirubin,1. **Novelty:** The association between direct...,positive,(celiac disease) OR (Coeliac) OR (gluten all...,
50,Hallux valgus (Bunion),The feature 'Hallux valgus (Bunion)' is intere...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
23,Acetone,The feature 'Acetone' is evaluated for its pot...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
48,Variation in diet_Never/rarely,The feature 'Variation in diet Never/rarely' i...,positive,(celiac disease) OR (Coeliac) OR (gluten all...,
2,Home area population density - urban or rural_...,The feature 'Home area population density - ur...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,
56,Z98.0 - Intestinal bypass and anastomosis status,The feature 'Z98.0 - Intestinal bypass and ana...,negative,(celiac disease) OR (Coeliac) OR (gluten all...,


gout_
53 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/gout_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
25,Haemoglobin concentration,1. **Novelty:** The relationship between haemo...,negative,Gout,
18,Meat substitutes - vegetarian,1. **Novelty:** The association between 'Meat ...,positive,Gout,
19,Health satisfaction_Moderately unhappy,The feature 'Health satisfaction Moderately un...,positive,Gout,
3,Average Diameter for VLDL Particles,The feature 'Average Diameter for VLDL Particl...,positive,Gout,
38,(ISS) ischaemic stroke genetic risk,The feature '(ISS) ischaemic stroke genetic ri...,positive,Gout,
37,(POAG) primary open angle glaucoma genetic risk,The feature 'POAG primary open angle glaucoma ...,negative,Gout,
7,Falls in the last year_More than one fall,The feature 'Falls in the last year More than ...,positive,Gout,
17,Townsend deprivation index at recruitment,1. **Novelty:** The Townsend deprivation index...,positive,Gout,
50,Hemorrhage of gastrointestinal tract,The feature 'Hemorrhage of gastrointestinal tr...,negative,Gout,
32,Concentration of IDL Particles,The feature 'Concentration of IDL Particles' i...,negative,Gout,


spine_
67 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/spine_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
33,Variation in diet_Sometimes,1. **Novelty:** The feature 'Variation in diet...,positive,(disc displacement) OR (disc degeneration) OR ...,
51,Acetone,Interestingness: Acetone levels have been inve...,negative,(disc displacement) OR (disc degeneration) OR ...,
61,Alcohol intake frequency._None,## Step 1: Assess Novelty\nThe feature 'Alcoho...,positive,(disc displacement) OR (disc degeneration) OR ...,
0,"Long-standing illness, disability or infirmity_No","The feature 'Long-standing illness, disability...",negative,(disc displacement) OR (disc degeneration) OR ...,
52,Actinic keratosis,The feature 'Actinic keratosis' is interesting...,positive,(disc displacement) OR (disc degeneration) OR ...,
...,...,...,...,...,...
9,"Major dietary changes in the last 5 years_Yes,...",The feature 'dietary changes in the last 5 yea...,positive,(disc displacement) OR (disc degeneration) OR ...,
18,Concentration of Large VLDL Particles,The feature 'Concentration of Large VLDL Parti...,positive,(disc displacement) OR (disc degeneration) OR ...,
39,missing_Atrial fibrillation and flutter,1. **Novelty:** The association between atrial...,negative,(disc displacement) OR (disc degeneration) OR ...,
29,Cholesterol in Medium LDL,The feature 'Cholesterol in Medium LDL' being ...,negative,(disc displacement) OR (disc degeneration) OR ...,


oesophagus_
44 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/oesophagus_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
14,(MEL) melanoma genetic risk,1. **Novelty:** The association between melano...,positive,oesophagus cancer,
32,Apolipoprotein A1,The feature 'Apolipoprotein A1' (ApoA1) in rel...,positive,oesophagus cancer,
51,Age asthma diagnosed_10,The feature 'Age asthma diagnosed_10' is inter...,positive,oesophagus cancer,
3,Weight (p21002),The feature 'Weight (p21002)' in relation to o...,positive,oesophagus cancer,
41,Treatment/medication code | Array 0__atenolol,1. **Novelty:** Atenolol is a beta-blocker use...,positive,oesophagus cancer,
24,(ISS) ischaemic stroke genetic risk,1. **Novelty:** The association between ischae...,positive,oesophagus cancer,
48,Ever addicted to illicit or recreational drugs...,The feature 'Ever addicted to illicit or recre...,negative,oesophagus cancer,
44,Abnormal results of function study of liver,**Interestingness:** The feature 'Abnormal res...,negative,oesophagus cancer,
35,(RA) rheumatoid arthritis genetic risk,The feature 'RA rheumatoid arthritis genetic r...,positive,oesophagus cancer,
37,Birth weight known_No,The feature 'Birth weight known' in relation t...,positive,oesophagus cancer,


eye_occ_
49 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/eye_occ_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
22,Alcohol intake frequency._Once or twice a week,1. **Novelty:** The relationship between moder...,negative,(Retinal Vein Occlusion) OR (Central retinal a...,
1,(ISS) ischaemic stroke genetic risk,The feature '(ISS) ischaemic stroke genetic ri...,positive,(Retinal Vein Occlusion) OR (Central retinal a...,
36,Oily fish intake_5-6 times a week,The feature 'Oily fish intake 5-6 times a week...,positive,(Retinal Vein Occlusion) OR (Central retinal a...,
52,Acetone,The feature 'Acetone' is interesting in the co...,neutral,(Retinal Vein Occlusion) OR (retinal artery oc...,
53,Duodenitis,The feature 'Duodenitis' is interesting becaus...,negative,(Retinal Vein Occlusion) OR (retinal artery oc...,
37,"Non-cancer illness code, self-reported | Array...",1. **Novelty:** The relationship between chron...,negative,(Retinal Vein Occlusion) OR (Central retinal a...,
47,Number of treatments/medications taken,The feature 'Number of treatments/medications ...,positive,(Retinal Vein Occlusion) OR (Central retinal a...,
45,Concentration of IDL Particles,The feature 'Concentration of IDL Particles' p...,negative,(Retinal Vein Occlusion) OR (Central retinal a...,
55,Ever addicted to a behaviour or miscellanous_None,The feature 'Ever addicted to a behaviour or m...,negative,(Retinal Vein Occlusion) OR (retinal artery oc...,
50,Hemorrhage of gastrointestinal tract,The feature 'Hemorrhage of gastrointestinal tr...,negative,(Retinal Vein Occlusion) OR (retinal artery oc...,


depression_
20 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/depression_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
14,Skin colour_Fair,The feature 'Skin colour Fair' as a predictor ...,positive,(Depression) OR (Depressive disorder),
20,Nonspecific abnormal findings in stool contents,The feature 'Nonspecific abnormal findings in ...,positive,(Depression) OR (Depressive disorder),
1,missing_Peripheral enthesopathies and allied s...,1. **Novelty:** The relationship between perip...,neutral,(Depression) OR (Depressive disorder),
6,missing_Hallux valgus (Bunion),1. **Novelty:** The association between hallux...,neutral,(Depression) OR (Depressive disorder),
8,"Other diseases of respiratory system, NEC",1. **Novelty:** The feature 'Other diseases of...,positive,(Depression) OR (Depressive disorder),
15,missing_Polyp of corpus uteri,The feature 'Polyp of corpus uteri' in relatio...,negative,(Depression) OR (Depressive disorder),
13,missing_Diverticulosis,1. **Novelty:** The association between divert...,positive,(Depression) OR (Depressive disorder),
22,Z51.0 - Radiotherapy session,The feature 'Z51.0 - Radiotherapy session' is ...,negative,(Depression) OR (Depressive disorder),
23,Age other serious eye condition diagnosed_66,The feature 'Age other serious eye condition d...,neutral,(Depression) OR (Depressive disorder),
19,(MEL) melanoma genetic risk,The feature '(MEL) melanoma genetic risk' in r...,neutral,(Depression) OR (Depressive disorder),


heart_
32 # Positive candidates (pre limit)
Excel sheet with annotation logic saved to ./Outputs/for_annotators/heart_candidates.xlsx.


Unnamed: 0,Feature_Name,explanation,correlation,Target,Comments
27,Basophill count,1. **Novelty:** The relationship between basop...,negative,Heart attack,
15,(BD) bipolar disorder genetic risk,The feature 'BD bipolar disorder genetic risk'...,positive,Heart attack,
17,missing_Fluid intelligence score,1. **Novelty:** The association between fluid ...,positive,Heart attack,
21,Fed-up feelings_No,1. **Novelty:** The feature 'Fed up feelings' ...,negative,Heart attack,
35,Gastritis and duodenitis,## Interestingness Explanation:\nThe feature '...,positive,Heart attack,
5,Home area population density - urban or rural_...,The feature 'Home area population density - ur...,negative,Heart attack,
22,Most recent bowel cancer screening_10,The feature 'Most recent bowel cancer screenin...,negative,Heart attack,
19,Standing height,1. **Novelty:** The association between standi...,negative,Heart attack,
34,Duodenitis,Duodenitis is interesting because it shows a n...,negative,Heart attack,
28,Hot drink temperature_Very hot,The feature 'Hot drink temperature Very hot' i...,negative,Heart attack,


In [6]:
# df.iloc[0]["explanation"]

In [7]:
# Peek at the object-dtype columns of df_pos as left over from the last loop iteration.
display(df_pos.select_dtypes(include="O").head(2))

Unnamed: 0,Feature_Name,novel_cot,plausible_cot,explanation,F.Split-Feature Split,boring_cot,Target
0,(AST) asthma genetic risk,The relationship between asthma and heart dise...,To determine if there is a plausible explanati...,1. **Novelty:** The association between asthma...,-0.48 <= (AST) asthma genetic risk < -0.48,The association between (AST) asthma genetic r...,Heart attack
1,missing_Other non-epithelial cancer of skin,The association between 'Other non-epithelial ...,To determine if there is a plausible explanati...,1. **Novelty:** The association between 'Other...,missing_Other non-epithelial cancer of skin >=...,The feature 'Other non-epithelial cancer of sk...,Heart attack


In [8]:
# Columns present in both the positive and the negative frame (from the last loop iteration).
set(df_pos.columns) & set(df_neg.columns)

{'Feature_Name',
 'MutualInfoTarget',
 'Real',
 'Target',
 'corr',
 'explanation',
 'feature_importance',
 'p_val'}