In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display, Markdown
from collections import Counter
import random
from lightgbm import LGBMClassifier

In [2]:
import psutil

# Get the number of CPUs
num_cpus = psutil.cpu_count(logical=True)
print(f"Number of CPUs: {num_cpus}")

# Get the amount of RAM
ram_info = psutil.virtual_memory()
total_ram = ram_info.total / (1024 ** 3)  # Convert bytes to GB
print(f"Total RAM: {total_ram:.2f} GB")

Number of CPUs: 16
Total RAM: 31.73 GB


In [3]:
EXP_NAME = 'gemma_att'

In [4]:
os.getcwd()

'c:\\Users\\Piotr.Matys\\Python_Projects\\hallu_project'

In [5]:
context_df = None

for f_ in os.listdir(os.path.join(EXP_NAME)):
    
    # if f_.startswith('attension'):

    if context_df is None:
        context_df = pd.read_parquet(os.path.join(EXP_NAME, f_))
    else:
        context_df = pd.concat((context_df, pd.read_parquet(os.path.join(EXP_NAME, f_))))

In [6]:
context_df['dataset'].value_counts()

dataset
cnndm          19400
nq              8030
xsum            7776
poquad_v2       6247
hotpotqa_en     3121
bioask          3056
hotpotqa_pl     2208
polqa           1869
Name: count, dtype: int64

In [7]:
context_df = context_df[context_df['dataset'].isin(['cnndm', 'nq'])]

In [8]:
context_df['label'].value_counts()

label
0    25940
1     1490
Name: count, dtype: int64

In [9]:
context_df.groupby(['dataset', 'label']).size()

dataset  label
cnndm    0        18882
         1          518
nq       0         7058
         1          972
dtype: int64

In [10]:
grouped_df = context_df.groupby(['dataset', 'label']).agg(['median']).T

In [11]:
dataset_counts = context_df.groupby(['dataset', 'label']).size().to_frame().reset_index().rename(columns={0: 'count'})

In [12]:
TOP_N = 10

In [13]:
previous_features = []
highest_non_hallu, highest_hallu = [], []

for dataset in grouped_df.columns.levels[0]:

    stats_grouped = grouped_df[dataset].reset_index()

    stats_grouped = stats_grouped.rename(columns={'level_0': 'feature', 'level_1': 'statistic'})

    stats_grouped['median_diff'] = stats_grouped[0] - stats_grouped[1]
    stats_grouped['median_perc_diff'] = stats_grouped['median_diff'] / stats_grouped[0] * 100
    
    stats_grouped.drop(columns=['median_diff', 'statistic'], inplace=True)

    stats_grouped['median_proportion'] = stats_grouped[0] / stats_grouped[1]

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values

    # display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))

    highest_non_hallu_context = stats_grouped.sort_values('median_proportion', ascending=False).head(TOP_N)
    highest_hallu_context = stats_grouped.sort_values('median_proportion', ascending=True).head(TOP_N)

    previous_features.extend(highest_non_hallu_context['feature'].values)
    previous_features.extend(highest_hallu_context['feature'].values)

    highest_hallu.append(highest_hallu_context)
    highest_non_hallu.append(highest_non_hallu_context)

In [14]:
feature_counts = Counter(previous_features)
common_features = {feature for feature, count in feature_counts.items() if count > 1}

# Generate a random color for each common feature
def generate_random_color():
    return f'#{random.randint(0, 0xFFFFFF):06x}'

# Assign colors from the palette to common features
color_map = {feature: generate_random_color() for feature in common_features}

def highlight_common_features(s):
    return [f'color: {color_map[v]}' if v in color_map else '' for v in s]

In [15]:

display(Markdown(f"## **Highest non-hallu**"))
for df, dataset in zip(highest_non_hallu, grouped_df.columns.levels[0]):

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values
    display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))
    
    styled_df = df.style.apply(highlight_common_features, subset=['feature'])
    display(styled_df)

display(Markdown(f"## **Highest hallu**"))
for df, dataset in zip(highest_hallu, grouped_df.columns.levels[0]):

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values
    display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))

    styled_df = df.style.apply(highlight_common_features, subset=['feature'])
    display(styled_df)

del highest_non_hallu, highest_hallu

## **Highest non-hallu**

### **cnndm**, hallu: 518, non-hallu: 18882

label,feature,0,1,median_perc_diff,median_proportion
259,layer_16_head_3,4.6e-05,3.2e-05,30.46875,1.438202
454,layer_28_head_6,0.000319,0.000229,28.068862,1.390219
270,layer_16_head_14,0.000243,0.000175,27.968597,1.388283
107,layer_6_head_11,0.000107,7.7e-05,27.839644,1.385802
198,layer_12_head_6,0.000207,0.000151,26.832083,1.366719
123,layer_7_head_11,0.000688,0.000512,25.554785,1.34327
134,layer_8_head_6,0.000278,0.000211,24.16309,1.318619
124,layer_7_head_12,6.1e-05,4.6e-05,24.07045,1.31701
97,layer_6_head_1,0.00024,0.000188,21.875,1.28
104,layer_6_head_8,0.000344,0.00027,21.50277,1.27393


### **nq**, hallu: 972, non-hallu: 7058

label,feature,0,1,median_perc_diff,median_proportion
413,layer_25_head_13,0.000443,0.000221,50.0,2.0
276,layer_17_head_4,6.3e-05,3.2e-05,49.670123,1.986891
454,layer_28_head_6,0.000375,0.000196,47.804645,1.915879
237,layer_14_head_13,0.0003,0.000158,47.278506,1.89676
412,layer_25_head_12,0.000448,0.000244,45.53429,1.836018
277,layer_17_head_5,0.000341,0.000189,44.671558,1.807389
20,layer_1_head_4,1.4e-05,8e-06,44.052863,1.787402
204,layer_12_head_12,1.9e-05,1.1e-05,43.76947,1.778393
303,layer_18_head_15,0.000129,7.3e-05,43.565498,1.771966
489,layer_30_head_9,0.00041,0.000239,41.635147,1.71336


## **Highest hallu**

### **cnndm**, hallu: 518, non-hallu: 18882

label,feature,0,1,median_perc_diff,median_proportion
355,layer_22_head_3,4.1e-05,5.2e-05,-26.801153,0.788636
499,layer_31_head_3,0.000164,0.000207,-26.361656,0.791379
486,layer_30_head_6,2.9e-05,3.6e-05,-25.469729,0.797005
584,layer_36_head_8,6e-06,7e-06,-23.5,0.809717
199,layer_12_head_7,4.3e-05,5.3e-05,-21.349862,0.824064
475,layer_29_head_11,0.000186,0.000223,-19.910371,0.833956
93,layer_5_head_13,3.4e-05,4e-05,-19.542254,0.836524
204,layer_12_head_12,2.1e-05,2.5e-05,-18.965517,0.84058
444,layer_27_head_12,2.1e-05,2.4e-05,-18.115942,0.846626
254,layer_15_head_14,4.2e-05,5e-05,-17.882188,0.848305


### **nq**, hallu: 972, non-hallu: 7058

label,feature,0,1,median_perc_diff,median_proportion
86,layer_5_head_6,3.6e-05,5e-05,-40.984975,0.709295
2,layer_0_head_2,1.3e-05,1.8e-05,-40.952381,0.709459
186,layer_11_head_10,2.1e-05,2.8e-05,-31.766382,0.758919
448,layer_28_head_0,8e-06,1.1e-05,-30.797101,0.764543
373,layer_23_head_5,2.9e-05,3.8e-05,-30.641822,0.765452
176,layer_11_head_0,3.8e-05,4.7e-05,-24.606918,0.802524
208,layer_13_head_0,5.2e-05,6.4e-05,-24.025229,0.806288
166,layer_10_head_6,0.000109,0.000133,-22.112753,0.818915
539,layer_33_head_11,1.1e-05,1.3e-05,-20.670391,0.828704
379,layer_23_head_11,7.2e-05,8.7e-05,-20.642769,0.828893


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score


In [17]:
TOP_N_FEATURES = 7

In [18]:
att_cols = [col for col in context_df.columns if col not in ['dataset', 'label']]

In [19]:
validation_results = []

# Loop over each unique dataset
for dataset in context_df['dataset'].unique():

    in_dist_sample = context_df.loc[context_df['dataset'] != dataset]
    out_dist_sample = context_df.loc[context_df['dataset'] == dataset]

    # Undersample to have equal number of class 1 and 0
    class_0 = in_dist_sample[in_dist_sample['label'] == 0]
    class_1 = in_dist_sample[in_dist_sample['label'] == 1]
    class_0_under = class_0.sample(n=len(class_1), random_state=42)
    in_dist_sample = pd.concat([class_0_under, class_1])

    X_train, X_val = in_dist_sample[att_cols], out_dist_sample[att_cols]
    y_train, y_val = in_dist_sample['label'], out_dist_sample['label']

    print(f"{X_train.shape = }, {X_val.shape = }")

    models = {
        # 'LogisticRegression': LogisticRegression(max_iter=10000, class_weight='balanced', random_state=42),
        'LGBMClassifier': LGBMClassifier(
            n_estimators=50,
            learning_rate=0.001,
            max_depth=10,
            num_leaves=15,
            class_weight='balanced',
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            n_jobs=-1,
            silent=True,
            verbose=-1
        )
    }

    for model_name, model in models.items():
        model.fit(X_train, y_train)

        train_pred = model.predict(X_train)
        train_auc = roc_auc_score(y_train, train_pred)
        print(f"Train AUC for {dataset} with {model_name}: {train_auc}")

        # Validate the model
        y_pred = model.predict_proba(X_val)

        auc = roc_auc_score(y_val, y_pred[:, 1])

        # Store the result
        validation_results.append({
            'dataset': dataset,
            'model': model_name,
            'train_auc': train_auc,
            'test_auc': auc
        })

    print('\n\n')

X_train.shape = (1036, 672), X_val.shape = (8030, 672)
Train AUC for nq with LGBMClassifier: 0.8223938223938224



X_train.shape = (1944, 672), X_val.shape = (19400, 672)
Train AUC for cnndm with LGBMClassifier: 0.7407407407407408





In [20]:
pd.DataFrame(validation_results)

Unnamed: 0,dataset,model,train_auc,test_auc
0,nq,LGBMClassifier,0.822394,0.480107
1,cnndm,LGBMClassifier,0.740741,0.556534


In [21]:
feat = [f for i, f in enumerate(model.feature_name_) if i in model.feature_importances_.nonzero()[0]]
feat

['layer_3_head_2',
 'layer_4_head_7',
 'layer_7_head_3',
 'layer_7_head_4',
 'layer_12_head_7',
 'layer_12_head_14',
 'layer_12_head_15',
 'layer_13_head_12',
 'layer_15_head_9',
 'layer_15_head_15',
 'layer_16_head_8',
 'layer_17_head_8',
 'layer_17_head_15',
 'layer_18_head_14',
 'layer_20_head_3',
 'layer_20_head_4',
 'layer_22_head_0',
 'layer_24_head_3',
 'layer_24_head_13',
 'layer_25_head_5',
 'layer_25_head_13',
 'layer_26_head_1',
 'layer_27_head_6',
 'layer_31_head_9',
 'layer_36_head_6',
 'layer_36_head_11',
 'layer_37_head_2',
 'layer_38_head_13',
 'layer_39_head_12',
 'layer_40_head_9',
 'layer_41_head_6',
 'layer_41_head_11',
 'layer_41_head_12',
 'layer_41_head_14']

In [22]:
context_df

Unnamed: 0,layer_0_head_0,layer_0_head_1,layer_0_head_2,layer_0_head_3,layer_0_head_4,layer_0_head_5,layer_0_head_6,layer_0_head_7,layer_0_head_8,layer_0_head_9,...,layer_41_head_8,layer_41_head_9,layer_41_head_10,layer_41_head_11,layer_41_head_12,layer_41_head_13,layer_41_head_14,layer_41_head_15,label,dataset
0,0.000607,0.000536,0.000012,0.000291,0.000548,0.000584,0.000073,0.000084,0.000440,0.000608,...,0.000029,0.000018,0.000163,0.000117,0.000128,0.000013,0.000022,0.000019,0,nq
1,0.000541,0.000566,0.000016,0.000298,0.000528,0.000548,0.000060,0.000075,0.000422,0.000582,...,0.000034,0.000018,0.000118,0.000101,0.000178,0.000011,0.000028,0.000022,0,nq
2,0.000530,0.000559,0.000017,0.000290,0.000547,0.000539,0.000053,0.000070,0.000430,0.000581,...,0.000034,0.000019,0.000126,0.000085,0.000195,0.000010,0.000025,0.000021,0,nq
3,0.000479,0.000566,0.000007,0.000267,0.000534,0.000526,0.000042,0.000053,0.000416,0.000568,...,0.000041,0.000018,0.000115,0.000090,0.000193,0.000010,0.000026,0.000023,0,nq
4,0.000511,0.000543,0.000011,0.000175,0.000505,0.000526,0.000049,0.000068,0.000402,0.000559,...,0.000050,0.000027,0.000111,0.000110,0.000197,0.000018,0.000031,0.000030,0,nq
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7788,0.000699,0.000630,0.000014,0.000279,0.000696,0.000724,0.000082,0.000070,0.000530,0.000700,...,0.000023,0.000009,0.000127,0.000160,0.000046,0.000005,0.000016,0.000018,0,cnndm
7789,0.000692,0.000699,0.000017,0.000268,0.000678,0.000722,0.000072,0.000077,0.000508,0.000683,...,0.000020,0.000009,0.000129,0.000149,0.000051,0.000004,0.000016,0.000018,0,cnndm
7790,0.000643,0.000703,0.000017,0.000247,0.000626,0.000677,0.000075,0.000100,0.000454,0.000664,...,0.000020,0.000009,0.000107,0.000110,0.000047,0.000006,0.000015,0.000017,0,cnndm
7791,0.000746,0.000650,0.000011,0.000355,0.000727,0.000667,0.000120,0.000226,0.000492,0.000625,...,0.000027,0.000012,0.000120,0.000112,0.000048,0.000009,0.000017,0.000016,0,cnndm


In [23]:
n_bins = int(2 * (len(context_df) ** (1/3)))
n_bins

60

In [24]:
numerical_df = context_df.drop(columns=['label', 'dataset']).apply(pd.qcut, q=n_bins, labels=False, duplicates='drop')

In [25]:
numerical_df['label'] = context_df['label'].values
numerical_df['dataset'] = context_df['dataset'].values

In [26]:
def get_prob_vec(df, col, target_col, min_val=0, max_val=57):
    
    all_values = np.arange(min_val, max_val + 1)
    pos = df.loc[df[target_col] == 1, col].value_counts(normalize=True).reindex(all_values, fill_value=0).sort_index()
    neg = df.loc[df[target_col] == 0, col].value_counts(normalize=True).reindex(all_values, fill_value=0).sort_index()

    return pos, neg

In [27]:
numerical_df.describe()

Unnamed: 0,layer_0_head_0,layer_0_head_1,layer_0_head_2,layer_0_head_3,layer_0_head_4,layer_0_head_5,layer_0_head_6,layer_0_head_7,layer_0_head_8,layer_0_head_9,...,layer_41_head_7,layer_41_head_8,layer_41_head_9,layer_41_head_10,layer_41_head_11,layer_41_head_12,layer_41_head_13,layer_41_head_14,layer_41_head_15,label
count,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,...,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0,27430.0
mean,29.482902,29.48261,29.447612,29.485162,29.48097,29.481735,29.487204,29.48724,29.484032,29.485855,...,29.462012,29.446336,29.419213,29.486803,29.483704,29.484652,29.380314,29.438133,29.445206,0.05432
std,17.325346,17.324763,17.33425,17.320366,17.326024,17.321418,17.320979,17.321357,17.320309,17.322992,...,17.330554,17.327645,17.332324,17.320023,17.320073,17.325133,17.358672,17.339192,17.329651,0.226652
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.25,14.25,14.0,...,14.0,14.25,14.0,14.25,14.25,14.0,14.0,14.0,14.0,0.0
50%,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,...,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,0.0
75%,44.0,44.0,44.0,44.75,44.0,44.75,44.0,44.0,44.0,44.0,...,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,0.0
max,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,...,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,1.0


In [29]:
def js_div(p, q):
    m = (p + q) / 2
    return 0.5 * (p * np.log2(p / m)).sum() + 0.5 * (q * np.log2(q / m)).sum()

In [28]:
from scipy.special import kl_div
from scipy.spatial.distance import jensenshannon
from scipy.special import rel_entr

In [1]:
import numpy as np
import pandas as pd
from typing import Tuple

In [None]:
def calculate_probabilities(df: pd.DataFrame, label_col: str, n_bins: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate positive and negative probability vectors for each column in the dataframe.

    Args:
        df (pd.DataFrame): The input dataframe containing numerical data.
        label_col (str): The name of the column containing the labels.
        n_bins (int): The number of bins to use for probability calculation.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Two arrays containing the positive and negative probabilities.
    """
    
    pos_probs = np.zeros((df.shape[1] - 1, n_bins))
    neg_probs = np.zeros((df.shape[1] - 1, n_bins))

    for i, col in enumerate(df.columns[:-2]):
        pos, neg = get_prob_vec(df, col, label_col, max_val=n_bins - 1)
        pos_probs[i] = pos.values
        neg_probs[i] = neg.values

    return pos_probs, neg_probs

In [32]:
jensen_divs_df = pd.DataFrame(jensenshannon(pos_probs, neg_probs, axis=1), index=numerical_df.columns[:-2], columns=['js_div'])
jensen_divs_df

Unnamed: 0,js_div
layer_0_head_0,0.248799
layer_0_head_1,0.251337
layer_0_head_2,0.203342
layer_0_head_3,0.183826
layer_0_head_4,0.268269
...,...
layer_41_head_11,0.162815
layer_41_head_12,0.116295
layer_41_head_13,0.142988
layer_41_head_14,0.107490


In [33]:
normalized_importances = model.feature_importances_ / model.feature_importances_.sum()
lgbm_top_features = [model.feature_name_[i] for i in np.argsort(model.feature_importances_)[-20:]]
lgbm_top_importances = normalized_importances[np.argsort(model.feature_importances_)[-20:]]

In [34]:
non_zero_features = [model.feature_name_[i] for i in np.where(model.feature_importances_ > 0)[0]]

### Jensen Shanon features

In [35]:
jensen_divs_df.sort_values('js_div', ascending=False).head(10)

Unnamed: 0,js_div
layer_15_head_9,0.317225
layer_18_head_15,0.316034
layer_28_head_4,0.315424
layer_18_head_6,0.315169
layer_19_head_1,0.311446
layer_25_head_10,0.311332
layer_24_head_0,0.310957
layer_21_head_5,0.30991
layer_15_head_10,0.309281
layer_31_head_7,0.3088


In [36]:
top_features_dict = {
    'LightGBM_Importance': non_zero_features,
}

In [37]:
models = {
    'lgbm': LGBMClassifier(
        n_estimators=250,
        learning_rate=0.001,
        max_depth=10,
        num_leaves=15,
        # class_weight='balanced',
        # reg_alpha=0.1,
        # reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
        silent=True,
        verbose=-1,
    ),
    'logistic_reg': LogisticRegression(max_iter=10000, random_state=42, class_weight='balanced', n_jobs=-1),
}

In [73]:
TOP_N_FEATURES = 50

In [74]:
train_cols_dict = {
    'jensen': jensen_divs_df.nlargest(2 * TOP_N_FEATURES, 'js_div').index,
    'lgbm': non_zero_features,
    'all': att_cols
}

In [75]:
from imblearn.under_sampling import RandomUnderSampler

In [76]:
SPLIT_VAL = False

In [77]:
validation_results = []

# Loop over each unique dataset
for dataset in context_df['dataset'].unique():

    in_dist_sample = context_df.loc[context_df['dataset'] != dataset]
    out_dist_sample = context_df.loc[context_df['dataset'] == dataset]

    stats_grouped = in_dist_sample.drop(columns=['dataset']).groupby('label').agg(['median']).T
    stats_grouped['median_proportion'] = stats_grouped[0] / stats_grouped[1]
    stats_grouped = stats_grouped.reset_index().rename(columns={'level_0': 'feature', 'level_1': 'statistic'})

    train_cols_dict['median'] = stats_grouped.sort_values('median_proportion', ascending=False).head(TOP_N_FEATURES)['feature'].values.tolist() \
        + stats_grouped.sort_values('median_proportion', ascending=True).head(TOP_N_FEATURES)['feature'].values.tolist()
    
    # class_0 = in_dist_sample[in_dist_sample['label'] == 0]
    # class_1 = in_dist_sample[in_dist_sample['label'] == 1]
    # class_0_under = class_0.sample(n=len(class_1), random_state=42)
    # in_dist_sample = pd.concat([class_0_under, class_1])

    for method, train_cols in train_cols_dict.items():

        X_train, X_test = in_dist_sample[train_cols], out_dist_sample[train_cols]
        y_train, y_test = in_dist_sample['label'], out_dist_sample['label']

        if SPLIT_VAL:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

        rus = RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)

        for model_name, model in models.items():

            if model_name == 'logistic_reg':

                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)

                if SPLIT_VAL:
                    X_val = scaler.transform(X_val)

                X_test = scaler.transform(X_test)

            model.fit(X_train, y_train)

            train_pred = model.predict_proba(X_train)
            train_auc = roc_auc_score(y_train, train_pred[:, 1])

            print(f"Train AUC for {dataset} with {method} and {model_name = }: {train_auc}")

            auc_val = None

            if SPLIT_VAL:
                y_pred = model.predict_proba(X_val)
                auc_val = roc_auc_score(y_val, y_pred[:, 1])

            y_pred = model.predict_proba(X_test)
            auc_test = roc_auc_score(y_test, y_pred[:, 1])

            # Store the result
            validation_results.append({
                'dataset': dataset,
                'columns_selection': method,
                'model': model_name,
                'train_auc': train_auc,
                'val_auc': auc_val,
                'test_auc': auc_test
            })

    print('\n\n')

Train AUC for nq with jensen and model_name = 'lgbm': 0.9098869277440707
Train AUC for nq with jensen and model_name = 'logistic_reg': 0.8887389871945857
Train AUC for nq with lgbm and model_name = 'lgbm': 0.8974187922064372
Train AUC for nq with lgbm and model_name = 'logistic_reg': 0.7617432656042694
Train AUC for nq with all and model_name = 'lgbm': 0.9621632802134732
Train AUC for nq with all and model_name = 'logistic_reg': 1.0
Train AUC for nq with median and model_name = 'lgbm': 0.9481969559189637
Train AUC for nq with median and model_name = 'logistic_reg': 0.8782143975194168



Train AUC for cnndm with jensen and model_name = 'lgbm': 0.8106313189046386
Train AUC for cnndm with jensen and model_name = 'logistic_reg': 0.7995383071686227
Train AUC for cnndm with lgbm and model_name = 'lgbm': 0.8286105607207573
Train AUC for cnndm with lgbm and model_name = 'logistic_reg': 0.7859362563294892
Train AUC for cnndm with all and model_name = 'lgbm': 0.85957213500652
Train AUC for cnndm

In [78]:
df = pd.DataFrame(validation_results)

# Find the best columns_selection for each dataset based on val_auc
best_columns_selection = df.loc[df.groupby('dataset')['test_auc'].idxmax()]

# Color the best columns_selection for each dataset
def highlight_best_columns_selection(row):
    if row['columns_selection'] == best_columns_selection.loc[best_columns_selection['dataset'] == row['dataset'], 'columns_selection'].values[0]:
        return ['color: yellow'] * len(row)
    else:
        return [''] * len(row)

styled_df = df.style.apply(highlight_best_columns_selection, axis=1)
display(styled_df)

Unnamed: 0,dataset,columns_selection,model,train_auc,val_auc,test_auc
0,nq,jensen,lgbm,0.909887,,0.640077
1,nq,jensen,logistic_reg,0.888739,,0.498669
2,nq,lgbm,lgbm,0.897419,,0.596718
3,nq,lgbm,logistic_reg,0.761743,,0.484045
4,nq,all,lgbm,0.962163,,0.555873
5,nq,all,logistic_reg,1.0,,0.459129
6,nq,median,lgbm,0.948197,,0.453757
7,nq,median,logistic_reg,0.878214,,0.453256
8,cnndm,jensen,lgbm,0.810631,,0.545009
9,cnndm,jensen,logistic_reg,0.799538,,0.564595
