In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display, Markdown
from collections import Counter
import random
from lightgbm import LGBMClassifier

In [5]:
EXP_NAME = 'gemma_att'

In [7]:
context_df = None

for f_ in os.listdir(os.path.join("..", "..", EXP_NAME)):
    
    # if f_.startswith('attension'):

    if context_df is None:
        context_df = pd.read_parquet(os.path.join("..", "..", EXP_NAME, f_))
    else:
        context_df = pd.concat((context_df, pd.read_parquet(os.path.join("..", "..", EXP_NAME, f_))))

In [8]:
context_df['dataset'].value_counts()

dataset
cnndm          19400
nq              8030
xsum            7776
poquad_v2       6247
hotpotqa_en     3121
bioask          3056
hotpotqa_pl     2208
polqa           1869
Name: count, dtype: int64

In [16]:
context_df.groupby(['dataset'])['label'].value_counts(normalize=True).to_frame().round(3).reset_index()

Unnamed: 0,dataset,label,proportion
0,bioask,0,0.94
1,bioask,1,0.06
2,cnndm,0,0.973
3,cnndm,1,0.027
4,hotpotqa_en,0,0.868
5,hotpotqa_en,1,0.132
6,hotpotqa_pl,0,0.873
7,hotpotqa_pl,1,0.127
8,nq,0,0.879
9,nq,1,0.121


### Here is the reason why training on the `cnndm` gives pure performance: only **2.7%** of hallucinated examples and then generalization is very weak. 

In [72]:
context_df['summed_att'] = context_df.drop(columns=['dataset', 'label']).sum(axis=1)

In [76]:
context_df.groupby(['dataset', 'label'])['summed_att'].median().to_frame().round(3).reset_index()

Unnamed: 0,dataset,label,summed_att
0,bioask,0,0.087
1,bioask,1,0.083
2,cnndm,0,0.202
3,cnndm,1,0.211
4,hotpotqa_en,0,0.084
5,hotpotqa_en,1,0.059
6,hotpotqa_pl,0,0.162
7,hotpotqa_pl,1,0.118
8,nq,0,0.103
9,nq,1,0.089


In [77]:
context_df[['summed_att', 'label', 'dataset']].groupby(['dataset', 'label']).describe().round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,summed_att,summed_att,summed_att,summed_att,summed_att,summed_att,summed_att,summed_att
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
dataset,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
bioask,0,2874.0,0.151,0.237,0.043,0.079,0.087,0.124,2.488
bioask,1,182.0,0.101,0.066,0.041,0.074,0.083,0.117,0.651
cnndm,0,18882.0,0.238,0.128,0.053,0.147,0.202,0.297,0.979
cnndm,1,518.0,0.232,0.115,0.076,0.154,0.211,0.282,0.857
hotpotqa_en,0,2710.0,0.089,0.033,0.03,0.071,0.084,0.101,0.49
hotpotqa_en,1,411.0,0.069,0.028,0.032,0.047,0.059,0.087,0.251
hotpotqa_pl,0,1927.0,0.17,0.066,0.042,0.122,0.162,0.207,0.646
hotpotqa_pl,1,281.0,0.138,0.068,0.041,0.089,0.118,0.176,0.388
nq,0,7058.0,0.098,0.025,0.027,0.088,0.103,0.115,0.154
nq,1,972.0,0.082,0.03,0.027,0.05,0.089,0.107,0.137


In [78]:
summ_att_df = context_df[['summed_att', 'label', 'dataset']].loc[context_df['dataset'].isin(['nq', 'cnndm'])]

In [79]:
summ_att_df

Unnamed: 0,summed_att,label,dataset
0,0.113586,0,nq
1,0.112671,0,nq
2,0.107483,0,nq
3,0.102539,0,nq
4,0.097351,0,nq
...,...,...,...
7788,0.161377,0,cnndm
7789,0.161499,0,cnndm
7790,0.143188,0,cnndm
7791,0.163574,0,cnndm


In [80]:
TRAIN_COLS = ['summed_att']

In [81]:
from sklearn.svm import SVC

In [105]:
models = {
    # 'lgbm': LGBMClassifier(
    #     n_estimators=250,
    #     learning_rate=0.001,
    #     max_depth=10,
    #     num_leaves=15,
    #     # class_weight='balanced',
    #     # reg_alpha=0.1,
    #     # reg_lambda=0.1,
    #     random_state=42,
    #     n_jobs=-1,
    #     silent=True,
    #     verbose=-1,
    # ),
    # 'logistic_reg': LogisticRegression(max_iter=10000, random_state=42, class_weight='balanced', n_jobs=-1),
    'svm': SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42, C=0.1e-3)
}

In [106]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [107]:
validation_results = []

for dataset in summ_att_df['dataset'].unique():

    in_dist_sample = summ_att_df.loc[summ_att_df['dataset'] != dataset]
    out_dist_sample = summ_att_df.loc[summ_att_df['dataset'] == dataset]

    X_train, X_test = in_dist_sample[TRAIN_COLS], out_dist_sample[TRAIN_COLS]
    y_train, y_test = in_dist_sample['label'], out_dist_sample['label']

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

    # rus = RandomUnderSampler(random_state=42)
    # X_train, y_train = rus.fit_resample(X_train, y_train)

    for model_name, model in models.items():

        model.fit(X_train, y_train)

        train_pred = model.predict_proba(X_train)
        train_auc = roc_auc_score(y_train, train_pred[:, 1])

        print(f"Train AUC for model tested_on {dataset} and {model_name = }: {train_auc}")

        y_pred = model.predict_proba(X_val)
        auc_val = roc_auc_score(y_val, y_pred[:, 1])

        y_pred = model.predict_proba(X_test)
        auc_test = roc_auc_score(y_test, y_pred[:, 1])

        # Store the result
        validation_results.append({
            'dataset': dataset,
            'columns_selection': TRAIN_COLS,
            'model': model_name,
            'train_auc': train_auc,
            'val_auc': auc_val,
            'test_auc': auc_test
        })

    print('\n\n')

Train AUC for model tested_on nq and model_name = 'svm': 0.4573245202501357



Train AUC for model tested_on cnndm and model_name = 'svm': 0.3571617688706521





In [108]:
pd.DataFrame(validation_results)

Unnamed: 0,dataset,columns_selection,model,train_auc,val_auc,test_auc
0,nq,[summed_att],svm,0.457325,0.439745,0.607564
1,cnndm,[summed_att],svm,0.357162,0.371716,0.500987


### That data is not linearly separable

In [109]:
for n in range(42):  # Assuming there are 42 layers, adjust if necessary
    layer_columns = [col for col in context_df.columns if col.startswith(f'layer_{n}_')]
    context_df[f'sum_layer_{n}'] = context_df[layer_columns].sum(axis=1)

In [149]:
summ_layers_df = context_df[[col for col in context_df.columns if col.startswith('sum_layer_')] + ['label', 'dataset']]

In [150]:
summ_layers_df

Unnamed: 0,sum_layer_0,sum_layer_1,sum_layer_2,sum_layer_3,sum_layer_4,sum_layer_5,sum_layer_6,sum_layer_7,sum_layer_8,sum_layer_9,...,sum_layer_34,sum_layer_35,sum_layer_36,sum_layer_37,sum_layer_38,sum_layer_39,sum_layer_40,sum_layer_41,label,dataset
0,0.005493,0.001273,0.001323,0.001439,0.001986,0.002054,0.001840,0.002005,0.002207,0.002832,...,0.002640,0.002686,0.002653,0.002666,0.001726,0.002979,0.002056,0.000912,0,nq
1,0.005333,0.001189,0.001333,0.001093,0.001382,0.001426,0.001387,0.001568,0.001972,0.002802,...,0.002743,0.002104,0.002651,0.002447,0.001687,0.003056,0.001678,0.000837,0,nq
2,0.005280,0.001167,0.001335,0.001018,0.001374,0.001262,0.001232,0.001456,0.001976,0.002438,...,0.002876,0.002235,0.002542,0.002403,0.001582,0.002983,0.001688,0.000833,0,nq
3,0.005150,0.001280,0.001534,0.001017,0.001442,0.001438,0.001093,0.001318,0.001839,0.002008,...,0.002985,0.002243,0.002544,0.002432,0.001637,0.003019,0.001784,0.000841,0,nq
4,0.004993,0.001418,0.001431,0.001142,0.001468,0.001797,0.001159,0.001532,0.001710,0.001840,...,0.002611,0.002096,0.002222,0.002012,0.001452,0.002607,0.001637,0.000931,0,nq
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7771,0.016083,0.004208,0.003077,0.003664,0.003597,0.004871,0.004662,0.003485,0.006992,0.011383,...,0.007088,0.005001,0.006855,0.005783,0.002962,0.006950,0.004395,0.001422,0,xsum
7772,0.015564,0.003929,0.002893,0.003515,0.003250,0.004780,0.004391,0.005165,0.008423,0.013184,...,0.005386,0.003500,0.004566,0.004311,0.002970,0.005196,0.003462,0.001101,0,xsum
7773,0.014694,0.004070,0.003607,0.003910,0.003502,0.004436,0.004776,0.004742,0.008873,0.013374,...,0.004131,0.002596,0.003960,0.003521,0.002464,0.004475,0.002886,0.001216,0,xsum
7774,0.014511,0.004944,0.004082,0.006706,0.007874,0.009529,0.011330,0.006660,0.013336,0.018280,...,0.009018,0.007248,0.009583,0.008926,0.003922,0.009407,0.005936,0.001915,0,xsum


In [151]:
grouped_df = summ_layers_df.groupby(['dataset', 'label']).agg(['median']).T

In [152]:
dataset_counts = summ_layers_df.groupby(['dataset', 'label']).size().to_frame().reset_index().rename(columns={0: 'count'})

In [153]:
TOP_N = 10

In [154]:
previous_features = []
highest_non_hallu, highest_hallu = [], []

for dataset in grouped_df.columns.levels[0]:

    stats_grouped = grouped_df[dataset].reset_index()

    stats_grouped = stats_grouped.rename(columns={'level_0': 'feature', 'level_1': 'statistic'})

    stats_grouped['median_diff'] = stats_grouped[0] - stats_grouped[1]
    stats_grouped['median_perc_diff'] = stats_grouped['median_diff'] / stats_grouped[0] * 100
    
    stats_grouped.drop(columns=['median_diff', 'statistic'], inplace=True)

    stats_grouped['median_proportion'] = stats_grouped[0] / stats_grouped[1]

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values

    # display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))

    highest_non_hallu_context = stats_grouped.sort_values('median_proportion', ascending=False).head(TOP_N)
    highest_hallu_context = stats_grouped.sort_values('median_proportion', ascending=True).head(TOP_N)

    previous_features.extend(highest_non_hallu_context['feature'].values)
    previous_features.extend(highest_hallu_context['feature'].values)

    highest_hallu.append(highest_hallu_context)
    highest_non_hallu.append(highest_non_hallu_context)

In [155]:
feature_counts = Counter(previous_features)
common_features = {feature for feature, count in feature_counts.items() if count > 1}

# Generate a random color for each common feature
def generate_random_color():
    return f'#{random.randint(0, 0xFFFFFF):06x}'

# Assign colors from the palette to common features
color_map = {feature: generate_random_color() for feature in common_features}

def highlight_common_features(s):
    return [f'color: {color_map[v]}' if v in color_map else '' for v in s]

In [156]:

display(Markdown(f"## **Highest non-hallu**"))
for df, dataset in zip(highest_non_hallu, grouped_df.columns.levels[0]):

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values
    display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))
    
    styled_df = df.style.apply(highlight_common_features, subset=['feature'])
    display(styled_df)

display(Markdown(f"## **Highest hallu**"))
for df, dataset in zip(highest_hallu, grouped_df.columns.levels[0]):

    n_ok, n_hallu = dataset_counts.loc[dataset_counts['dataset'] == dataset, 'count'].values
    display(Markdown(f"### **{dataset}**, hallu: {n_hallu}, non-hallu: {n_ok}"))

    styled_df = df.style.apply(highlight_common_features, subset=['feature'])
    display(styled_df)

del highest_non_hallu, highest_hallu

## **Highest non-hallu**

### **bioask**, hallu: 182, non-hallu: 2874

label,feature,0,1,median_perc_diff,median_proportion
38,sum_layer_38,0.002211,0.001509,31.75151,1.465234
36,sum_layer_36,0.00293,0.002424,17.252604,1.208497
29,sum_layer_29,0.00242,0.002023,16.43026,1.196605
34,sum_layer_34,0.003025,0.002597,14.155107,1.164892
40,sum_layer_40,0.002035,0.001756,13.706654,1.158838
31,sum_layer_31,0.00243,0.002137,12.048666,1.136992
33,sum_layer_33,0.002453,0.002174,11.353033,1.12807
37,sum_layer_37,0.002172,0.001944,10.535558,1.117763
30,sum_layer_30,0.003329,0.002979,10.512747,1.117478
35,sum_layer_35,0.00229,0.002082,9.07955,1.099863


### **cnndm**, hallu: 518, non-hallu: 18882

label,feature,0,1,median_perc_diff,median_proportion
7,sum_layer_7,0.00338,0.003029,10.383747,1.115869
10,sum_layer_10,0.003254,0.002975,8.55803,1.09359
16,sum_layer_16,0.004185,0.003872,7.474932,1.080788
6,sum_layer_6,0.003307,0.003073,7.093426,1.07635
14,sum_layer_14,0.006451,0.006031,6.505027,1.069576
4,sum_layer_4,0.002377,0.002232,6.099518,1.064957
8,sum_layer_8,0.004784,0.00462,3.429027,1.035508
9,sum_layer_9,0.006981,0.00675,3.306011,1.03419
5,sum_layer_5,0.003151,0.003048,3.268765,1.033792
11,sum_layer_11,0.003789,0.003731,1.535364,1.015593


### **hotpotqa_en**, hallu: 411, non-hallu: 2710

label,feature,0,1,median_perc_diff,median_proportion
30,sum_layer_30,0.002677,0.000819,69.415746,3.269656
36,sum_layer_36,0.001811,0.00067,62.997103,2.702491
28,sum_layer_28,0.003818,0.001514,60.329753,2.520781
32,sum_layer_32,0.002481,0.001114,55.111453,2.22774
34,sum_layer_34,0.002043,0.000923,54.80859,2.21281
31,sum_layer_31,0.001858,0.000845,54.517454,2.198646
29,sum_layer_29,0.001586,0.000729,54.05893,2.176702
9,sum_layer_9,0.002701,0.001255,53.531073,2.151976
7,sum_layer_7,0.001759,0.000838,52.331887,2.097838
13,sum_layer_13,0.002149,0.001104,48.601864,1.945596


### **hotpotqa_pl**, hallu: 281, non-hallu: 1927

label,feature,0,1,median_perc_diff,median_proportion
30,sum_layer_30,0.004581,0.001376,69.962531,3.329175
28,sum_layer_28,0.006859,0.002304,66.407119,2.976821
36,sum_layer_36,0.002983,0.001122,62.372123,2.657604
32,sum_layer_32,0.003798,0.001468,61.35108,2.587394
7,sum_layer_7,0.003426,0.001478,56.848552,2.317419
29,sum_layer_29,0.00301,0.001305,56.653992,2.307018
31,sum_layer_31,0.003155,0.001406,55.441354,2.244233
34,sum_layer_34,0.003382,0.001566,53.694303,2.159562
9,sum_layer_9,0.005688,0.002739,51.8444,2.076602
37,sum_layer_37,0.003147,0.001681,46.575758,1.871809


### **nq**, hallu: 972, non-hallu: 7058

label,feature,0,1,median_perc_diff,median_proportion
25,sum_layer_25,0.003357,0.002488,25.880682,1.349176
31,sum_layer_31,0.002302,0.001746,24.150787,1.318405
21,sum_layer_21,0.00312,0.002367,24.144254,1.318292
30,sum_layer_30,0.003578,0.002726,23.827292,1.312806
20,sum_layer_20,0.002628,0.002053,21.879536,1.280074
17,sum_layer_17,0.003471,0.002713,21.840659,1.279438
29,sum_layer_29,0.0019,0.001487,21.736948,1.277742
33,sum_layer_33,0.00223,0.001749,21.556886,1.274809
24,sum_layer_24,0.002529,0.002001,20.889894,1.264061
26,sum_layer_26,0.002134,0.001689,20.866845,1.263693


### **polqa**, hallu: 377, non-hallu: 1492

label,feature,0,1,median_perc_diff,median_proportion
30,sum_layer_30,0.004314,0.001637,62.068966,2.636364
28,sum_layer_28,0.006149,0.002739,55.459057,2.245125
31,sum_layer_31,0.002823,0.0014,50.405405,2.016349
32,sum_layer_32,0.003414,0.001758,48.519553,1.942485
7,sum_layer_7,0.003219,0.001693,47.407407,1.901408
36,sum_layer_36,0.002604,0.001421,45.421245,1.832215
29,sum_layer_29,0.002665,0.001457,45.311382,1.828534
34,sum_layer_34,0.002906,0.001675,42.369544,1.735194
9,sum_layer_9,0.005095,0.003208,37.027331,1.58799
14,sum_layer_14,0.004364,0.002758,36.800699,1.582296


### **poquad_v2**, hallu: 425, non-hallu: 5822

label,feature,0,1,median_perc_diff,median_proportion
30,sum_layer_30,0.01408,0.00655,53.481441,2.14968
32,sum_layer_32,0.010078,0.004841,51.968206,2.081954
31,sum_layer_31,0.008835,0.004433,49.827288,1.993115
28,sum_layer_28,0.018173,0.009567,47.355164,1.899522
34,sum_layer_34,0.008015,0.004559,43.122323,1.758159
36,sum_layer_36,0.006802,0.003998,41.222658,1.701336
33,sum_layer_33,0.006433,0.003803,40.883487,1.691575
25,sum_layer_25,0.013653,0.008377,38.642079,1.629781
7,sum_layer_7,0.008278,0.005085,38.571429,1.627907
35,sum_layer_35,0.006348,0.003906,38.461538,1.625


### **xsum**, hallu: 822, non-hallu: 6954

label,feature,0,1,median_perc_diff,median_proportion
9,sum_layer_9,0.013733,0.012917,5.944444,1.063201
31,sum_layer_31,0.008842,0.008324,5.867127,1.062328
8,sum_layer_8,0.008621,0.008148,5.486726,1.058052
6,sum_layer_6,0.006104,0.005772,5.4375,1.057502
30,sum_layer_30,0.011749,0.011116,5.38961,1.056966
25,sum_layer_25,0.016159,0.015301,5.311615,1.056096
10,sum_layer_10,0.006054,0.005747,5.072464,1.053435
7,sum_layer_7,0.005764,0.005495,4.665784,1.048941
13,sum_layer_13,0.010452,0.009987,4.452555,1.0466
19,sum_layer_19,0.012268,0.011723,4.446517,1.046534


## **Highest hallu**

### **bioask**, hallu: 182, non-hallu: 2874

label,feature,0,1,median_perc_diff,median_proportion
8,sum_layer_8,0.001738,0.001934,-11.306257,0.898422
10,sum_layer_10,0.001231,0.001352,-9.879892,0.910085
3,sum_layer_3,0.000863,0.000927,-7.460624,0.930573
11,sum_layer_11,0.001117,0.001194,-6.874466,0.935677
1,sum_layer_1,0.000931,0.000987,-6.072252,0.942754
5,sum_layer_5,0.001441,0.001522,-5.557393,0.947352
7,sum_layer_7,0.001407,0.001483,-5.457627,0.948248
21,sum_layer_21,0.002765,0.002903,-5.001725,0.952365
6,sum_layer_6,0.001518,0.00157,-3.423367,0.966899
13,sum_layer_13,0.001741,0.001799,-3.34155,0.967665


### **cnndm**, hallu: 518, non-hallu: 18882

label,feature,0,1,median_perc_diff,median_proportion
36,sum_layer_36,0.003801,0.004135,-8.780733,0.91928
33,sum_layer_33,0.004089,0.004379,-7.089552,0.933798
40,sum_layer_40,0.002491,0.002666,-7.04441,0.934192
20,sum_layer_20,0.006767,0.007223,-6.736189,0.936889
29,sum_layer_29,0.004013,0.004274,-6.511407,0.938867
2,sum_layer_2,0.001851,0.001967,-6.28542,0.940863
38,sum_layer_38,0.001851,0.001966,-6.2339,0.941319
28,sum_layer_28,0.008629,0.009163,-6.189213,0.941715
34,sum_layer_34,0.00412,0.004372,-6.111111,0.942408
35,sum_layer_35,0.003466,0.003654,-5.448542,0.94833


### **hotpotqa_en**, hallu: 411, non-hallu: 2710

label,feature,0,1,median_perc_diff,median_proportion
10,sum_layer_10,0.001488,0.001523,-2.371795,0.976832
0,sum_layer_0,0.005135,0.005211,-1.485884,0.985359
23,sum_layer_23,0.002122,0.002026,4.539326,1.047552
6,sum_layer_6,0.001838,0.001731,5.812143,1.061708
8,sum_layer_8,0.002176,0.002047,5.959684,1.063374
2,sum_layer_2,0.001403,0.001312,6.458192,1.069041
20,sum_layer_20,0.001792,0.001606,10.377861,1.115796
3,sum_layer_3,0.001533,0.001364,11.014312,1.123776
16,sum_layer_16,0.001488,0.001307,12.207626,1.139051
11,sum_layer_11,0.001353,0.001179,12.896406,1.148058


### **hotpotqa_pl**, hallu: 281, non-hallu: 1927

label,feature,0,1,median_perc_diff,median_proportion
0,sum_layer_0,0.011009,0.01178,-6.999307,0.934585
4,sum_layer_4,0.003922,0.003859,1.605058,1.016312
10,sum_layer_10,0.003132,0.003078,1.705238,1.017348
8,sum_layer_8,0.004574,0.004425,3.252711,1.033621
2,sum_layer_2,0.002329,0.002243,3.685504,1.038265
3,sum_layer_3,0.003044,0.002895,4.887218,1.051383
6,sum_layer_6,0.003857,0.003609,6.429278,1.06871
1,sum_layer_1,0.002588,0.002344,9.432572,1.10415
23,sum_layer_23,0.004436,0.00396,10.748065,1.120424
41,sum_layer_41,0.001356,0.001197,11.744023,1.133068


### **nq**, hallu: 972, non-hallu: 7058

label,feature,0,1,median_perc_diff,median_proportion
10,sum_layer_10,0.001625,0.00161,0.938967,1.009479
0,sum_layer_0,0.005188,0.005106,1.580882,1.016063
2,sum_layer_2,0.00137,0.001323,3.479471,1.036049
6,sum_layer_6,0.001912,0.001801,5.811923,1.061706
8,sum_layer_8,0.002378,0.002233,6.134723,1.065357
1,sum_layer_1,0.001444,0.001347,6.73712,1.072238
3,sum_layer_3,0.00141,0.001304,7.572684,1.081931
23,sum_layer_23,0.002779,0.002551,8.201784,1.089346
4,sum_layer_4,0.001698,0.001526,10.134756,1.112777
27,sum_layer_27,0.000979,0.000854,12.85297,1.147486


### **polqa**, hallu: 377, non-hallu: 1492

label,feature,0,1,median_perc_diff,median_proportion
0,sum_layer_0,0.010426,0.010803,-3.622393,0.965042
3,sum_layer_3,0.002654,0.002583,2.694934,1.027696
2,sum_layer_2,0.002167,0.002098,3.169014,1.032727
4,sum_layer_4,0.003618,0.00346,4.375329,1.045755
8,sum_layer_8,0.004383,0.004181,4.612707,1.048358
10,sum_layer_10,0.002935,0.0028,4.613385,1.048365
6,sum_layer_6,0.003641,0.003445,5.395495,1.057032
1,sum_layer_1,0.002472,0.002235,9.567901,1.105802
41,sum_layer_41,0.001112,0.001002,9.901414,1.109895
23,sum_layer_23,0.004353,0.003922,9.903593,1.109922


### **poquad_v2**, hallu: 425, non-hallu: 5822

label,feature,0,1,median_perc_diff,median_proportion
3,sum_layer_3,0.005615,0.005646,-0.543478,0.994595
2,sum_layer_2,0.00396,0.00396,0.0,1.0
4,sum_layer_4,0.00803,0.008003,0.332542,1.003337
1,sum_layer_1,0.004721,0.004585,2.868687,1.029534
0,sum_layer_0,0.023071,0.022339,3.174603,1.032787
10,sum_layer_10,0.007874,0.007519,4.505814,1.047184
41,sum_layer_41,0.002674,0.002504,6.348074,1.067784
6,sum_layer_6,0.008926,0.008163,8.547009,1.093458
8,sum_layer_8,0.01123,0.010048,10.529891,1.117692
38,sum_layer_38,0.002953,0.002464,16.537468,1.198142


### **xsum**, hallu: 822, non-hallu: 6954

label,feature,0,1,median_perc_diff,median_proportion
41,sum_layer_41,0.001572,0.001694,-7.79733,0.927667
4,sum_layer_4,0.004322,0.004515,-4.457193,0.95733
27,sum_layer_27,0.002797,0.002871,-2.625298,0.974419
39,sum_layer_39,0.007059,0.007196,-1.94542,0.980917
37,sum_layer_37,0.006538,0.006634,-1.458576,0.985624
40,sum_layer_40,0.004522,0.004587,-1.433994,0.985863
38,sum_layer_38,0.002979,0.003005,-0.864277,0.991431
1,sum_layer_1,0.003937,0.003956,-0.484496,0.995178
3,sum_layer_3,0.004154,0.004173,-0.459137,0.99543
36,sum_layer_36,0.00692,0.006935,-0.220507,0.9978


In [157]:
n_bins = int(2 * (len(summ_layers_df) ** (1/3)))
n_bins

74

In [158]:
numerical_df = summ_layers_df.drop(columns=['label', 'dataset']).apply(pd.qcut, q=n_bins, labels=False, duplicates='drop')

numerical_df['label'] = summ_layers_df['label'].values
numerical_df['dataset'] = summ_layers_df['dataset'].values

In [159]:
def get_prob_vec(df, col, target_col, min_val=0, max_val=57):
    
    all_values = np.arange(min_val, max_val + 1)
    pos = df.loc[df[target_col] == 1, col].value_counts(normalize=True).reindex(all_values, fill_value=0).sort_index()
    neg = df.loc[df[target_col] == 0, col].value_counts(normalize=True).reindex(all_values, fill_value=0).sort_index()

    return pos, neg

In [160]:
numerical_df.describe()

Unnamed: 0,sum_layer_0,sum_layer_1,sum_layer_2,sum_layer_3,sum_layer_4,sum_layer_5,sum_layer_6,sum_layer_7,sum_layer_8,sum_layer_9,...,sum_layer_33,sum_layer_34,sum_layer_35,sum_layer_36,sum_layer_37,sum_layer_38,sum_layer_39,sum_layer_40,sum_layer_41,label
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,...,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,36.485428,36.487555,36.483068,36.489934,36.486511,36.487149,36.489411,36.488483,36.490011,36.49152,...,36.488812,36.487903,36.48858,36.488677,36.487845,36.485544,36.48976,36.489431,36.49005,0.077127
std,21.360857,21.361324,21.366027,21.361413,21.364483,21.361404,21.363177,21.362789,21.361609,21.360207,...,21.361471,21.362246,21.36257,21.362004,21.363058,21.35965,21.361958,21.360784,21.362365,0.266795
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,...,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,0.0
50%,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,0.0
75%,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,...,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,0.0
max,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,...,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,1.0


In [161]:
from scipy.special import kl_div
from scipy.spatial.distance import jensenshannon
from scipy.special import rel_entr
import numpy as np
import pandas as pd
from typing import Tuple

In [162]:
def calculate_probabilities(df: pd.DataFrame, label_col: str, n_bins: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate positive and negative probability vectors for each column in the dataframe.

    Args:
        df (pd.DataFrame): The input dataframe containing numerical data.
        label_col (str): The name of the column containing the labels.
        n_bins (int): The number of bins to use for probability calculation.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Two arrays containing the positive and negative probabilities.
    """
    
    pos_probs = np.zeros((df.shape[1] - 1, n_bins))
    neg_probs = np.zeros((df.shape[1] - 1, n_bins))

    for i, col in enumerate(df.columns[:-2]):
        pos, neg = get_prob_vec(df, col, label_col, max_val=n_bins - 1)
        pos_probs[i] = pos.values
        neg_probs[i] = neg.values

    return pos_probs, neg_probs

In [163]:
pos_probs, neg_probs = calculate_probabilities(numerical_df.drop(columns=['dataset']), 'label', n_bins)
jensen_divs_df = pd.DataFrame(jensenshannon(pos_probs, neg_probs, axis=1), index=numerical_df.columns[:-2], columns=['js_div'])

  p = p / np.sum(p, axis=axis, keepdims=True)
  q = q / np.sum(q, axis=axis, keepdims=True)


### Jensen Shanon features

In [164]:
jensen_divs_df.sort_values('js_div', ascending=False).head(10)

Unnamed: 0,js_div
sum_layer_30,0.2301
sum_layer_28,0.228094
sum_layer_29,0.226325
sum_layer_31,0.225094
sum_layer_36,0.224105
sum_layer_32,0.219144
sum_layer_34,0.218743
sum_layer_33,0.218434
sum_layer_26,0.20838
sum_layer_37,0.207045


In [165]:
models = {
    'lgbm': LGBMClassifier(
        n_estimators=250,
        learning_rate=0.001,
        max_depth=10,
        num_leaves=15,
        class_weight='balanced',
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
        silent=True,
        verbose=-1,
    ),
    'logistic_reg': LogisticRegression(max_iter=10000, random_state=42, class_weight='balanced', n_jobs=-1),
}

In [166]:
TOP_N_FEATURES = 7

In [167]:
train_cols_dict = {
    'jensen': jensen_divs_df.nlargest(2 * TOP_N_FEATURES, 'js_div').index,
    'all': numerical_df.columns[:-2]
}

In [168]:
from imblearn.under_sampling import RandomUnderSampler

In [169]:
SPLIT_VAL = True

In [171]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [172]:
summ_layers_df = summ_layers_df.loc[summ_layers_df['dataset'].isin(['nq', 'xsum'])]

In [175]:
validation_results = []

# Loop over each unique dataset
for dataset in summ_layers_df['dataset'].unique():

    in_dist_sample = summ_layers_df.loc[summ_layers_df['dataset'] != dataset]
    out_dist_sample = summ_layers_df.loc[summ_layers_df['dataset'] == dataset]

    stats_grouped = in_dist_sample.drop(columns=['dataset']).groupby('label').agg(['median']).T
    stats_grouped['median_proportion'] = stats_grouped[0] / stats_grouped[1]
    stats_grouped = stats_grouped.reset_index().rename(columns={'level_0': 'feature', 'level_1': 'statistic'})

    train_cols_dict['median'] = stats_grouped.sort_values('median_proportion', ascending=False).head(TOP_N_FEATURES)['feature'].values.tolist() \
        + stats_grouped.sort_values('median_proportion', ascending=True).head(TOP_N_FEATURES)['feature'].values.tolist()
    
    # class_0 = in_dist_sample[in_dist_sample['label'] == 0]
    # class_1 = in_dist_sample[in_dist_sample['label'] == 1]
    # class_0_under = class_0.sample(n=len(class_1), random_state=42)
    # in_dist_sample = pd.concat([class_0_under, class_1])

    for method, train_cols in train_cols_dict.items():

        X_train, X_test = in_dist_sample[train_cols], out_dist_sample[train_cols]
        y_train, y_test = in_dist_sample['label'], out_dist_sample['label']

        if SPLIT_VAL:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

        # rus = RandomUnderSampler(random_state=42)
        # X_train, y_train = rus.fit_resample(X_train, y_train)

        for model_name, model in models.items():

            if model_name == 'logistic_reg':

                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)

                if SPLIT_VAL:
                    X_val = scaler.transform(X_val)

                X_test = scaler.transform(X_test)

            model.fit(X_train, y_train)

            train_pred = model.predict_proba(X_train)
            train_auc = roc_auc_score(y_train, train_pred[:, 1])

            print(f"Train AUC for {dataset} with {method} and {model_name = }: {train_auc}")

            auc_val = None

            if SPLIT_VAL:
                y_pred = model.predict_proba(X_val)
                auc_val = roc_auc_score(y_val, y_pred[:, 1])

            y_pred = model.predict_proba(X_test)
            auc_test = roc_auc_score(y_test, y_pred[:, 1])

            # Store the result
            validation_results.append({
                'dataset': dataset,
                'columns_selection': method,
                'model': model_name,
                'train_auc': train_auc,
                'val_auc': auc_val,
                'test_auc': auc_test
            })

    print('\n\n')

Train AUC for nq with jensen and model_name = 'lgbm': 0.7197659377735808
Train AUC for nq with jensen and model_name = 'logistic_reg': 0.654639493567401
Train AUC for nq with all and model_name = 'lgbm': 0.7439185134909159
Train AUC for nq with all and model_name = 'logistic_reg': 0.7179763571521471
Train AUC for nq with median and model_name = 'lgbm': 0.7303099680965823
Train AUC for nq with median and model_name = 'logistic_reg': 0.6670970185223439



Train AUC for xsum with jensen and model_name = 'lgbm': 0.7650725494856335
Train AUC for xsum with jensen and model_name = 'logistic_reg': 0.6956261320205765
Train AUC for xsum with all and model_name = 'lgbm': 0.8102623100550291
Train AUC for xsum with all and model_name = 'logistic_reg': 0.7599053223293422
Train AUC for xsum with median and model_name = 'lgbm': 0.7822031112410269
Train AUC for xsum with median and model_name = 'logistic_reg': 0.7007825455062027





In [176]:
df = pd.DataFrame(validation_results)

# Find the best columns_selection for each dataset based on val_auc
best_columns_selection = df.loc[df.groupby('dataset')['test_auc'].idxmax()]

# Color the best columns_selection for each dataset
def highlight_best_columns_selection(row):
    if row['columns_selection'] == best_columns_selection.loc[best_columns_selection['dataset'] == row['dataset'], 'columns_selection'].values[0]:
        return ['color: yellow'] * len(row)
    else:
        return [''] * len(row)

styled_df = df.style.apply(highlight_best_columns_selection, axis=1)
display(styled_df)

Unnamed: 0,dataset,columns_selection,model,train_auc,val_auc,test_auc
0,nq,jensen,lgbm,0.719766,0.585263,0.480849
1,nq,jensen,logistic_reg,0.654639,0.628049,0.554767
2,nq,all,lgbm,0.743919,0.61531,0.455478
3,nq,all,logistic_reg,0.717976,0.651848,0.573467
4,nq,median,lgbm,0.73031,0.623084,0.424008
5,nq,median,logistic_reg,0.667097,0.64077,0.559254
6,xsum,jensen,lgbm,0.765073,0.655322,0.502485
7,xsum,jensen,logistic_reg,0.695626,0.673706,0.508981
8,xsum,all,lgbm,0.810262,0.704382,0.496844
9,xsum,all,logistic_reg,0.759905,0.761762,0.516444


### Unfortunatelly, the results are not very promising. Still there is a problem with transfering the knowledge from `nq` to `xsum` dataset.