# Importing Libraries and Defining Variables

In [2]:
import os
from pathlib import Path
import statistics

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, RepeatedStratifiedKFold

from tqdm import tqdm
import catboost as cb
import optuna

In [49]:
# --- Feature / behaviour switches -------------------------------------------
# When True, submitted labels come from the classifier meta-model; otherwise
# they are derived by thresholding the predicted probability.
USE_CATBOOST_CLASSIFIER = True
# One-hot encode the `task` column (DM / MT / PG) as extra features.
USE_TASK_FEATURES = True
# Add character lengths of `hyp`, `tgt` and `src` as extra features.
USE_LENGTH_FEATURES = True
# Derive the missing third probability column from each base model's output.
USE_NEUTRAL_PROBA_FEATURE = True

# Name of this run; used as the sub-directory of the submission output.
RUN_NAME = 'new_test_article'

# --- Locations (all keep their trailing slash: they are joined by plain
# string formatting further down) ---------------------------------------------
INPUT_DATA_PATH = '/kaggle/input/data/'
REFERENCE_PATH = INPUT_DATA_PATH + 'test/'
OUTPUT_DATA_PATH = '/kaggle/input/output/'
SUBMISSION_PATH = '/kaggle/working/meta_model/' + RUN_NAME + '/'
RESULT_PATH = SUBMISSION_PATH + 'test.score.txt'

# Base models whose per-sample outputs are read from
# `{OUTPUT_DATA_PATH}{model}/{sample}/{sample}.model-agnostic.csv`.
# The order determines the column order of the feature matrix.
MODELS = [
    'FacebookAI-roberta-large-mnli',
    'FacebookAI-roberta-large-mnli_inverse',
    'deberta-selfchecknli',
    'deberta-selfchecknli_inverse',
    'google_t5_xxl_true_nli_mixture',
    'google_t5_xxl_true_nli_mixture_inverse',
    'sentence-transformers-nli-roberta-large',
    'openchat',
    'microsoft-deberta-base-mnli',
    'microsoft-deberta-large-mnli',
    'microsoft-deberta-xlarge-mnli',
    'microsoft-deberta-v2-xlarge-mnli',
    'microsoft-deberta-v2-xxlarge-mnli',
]

RANDOM_SEED = 12345

# Make sure the run's output directory (and any missing parents) exists.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

# Preparing data

In [63]:
# Build one feature matrix (X) and one target frame (y) per data split.
# Keys are '<split>-agnostic'; y holds the columns 'label' and 'p'.
X = {}
y = {}
for sample in ['val', 'test']:
    split_key = f'{sample}-agnostic'
    is_test_split = sample == 'test'
    feature_frames = []

    # --- Model-agnostic JSON: source of task/length features and targets ----
    json_path = f'{INPUT_DATA_PATH}{sample}/{sample}.model-agnostic.json'
    agnostic_df = pd.read_json(json_path, orient='records')

    # Only the test split is indexed by sample id; val keeps the default index.
    if is_test_split:
        agnostic_df = agnostic_df.set_index('id', drop=True)

    if USE_TASK_FEATURES:
        # One-hot encode `task`; values outside DM/MT/PG map to NaN.
        task_names = ('DM', 'MT', 'PG')
        for flag in task_names:
            agnostic_df[flag] = agnostic_df['task'].map(
                {name: int(name == flag) for name in task_names}
            )
        feature_frames.append(agnostic_df[['DM', 'MT', 'PG']])

    if USE_LENGTH_FEATURES:
        # Character lengths of hypothesis, target and source.
        agnostic_df = agnostic_df.assign(
            hyp_len=agnostic_df['hyp'].str.len(),
            tgt_len=agnostic_df['tgt'].str.len(),
            src_len=agnostic_df['src'].str.len(),
        )
        feature_frames.append(agnostic_df[['hyp_len', 'tgt_len', 'src_len']])

    # --- Targets: gold label and hallucination probability (renamed to 'p') --
    targets = agnostic_df.rename(columns={"p(Hallucination)": "p"})
    y[split_key] = targets[['label', 'p']]

    # --- Per-model CSV outputs ------------------------------------------------
    for model_name in MODELS:
        csv_path = f'{OUTPUT_DATA_PATH}{model_name}/{sample}/{sample}.model-agnostic.csv'
        model_df = pd.read_csv(csv_path)

        if is_test_split:
            model_df = model_df.set_index('id', drop=True)

        if USE_NEUTRAL_PROBA_FEATURE:
            if "p(Entl)" in model_df:
                # Contradiction and entailment present: the remainder is neutral.
                model_df['p(Neutr)'] = 1 - model_df['p(Contr)'] - model_df['p(Entl)']
            else:
                # Only contradiction present: treat the complement as entailment.
                model_df['p(Entl)'] = 1 - model_df['p(Contr)']

        # Suffix probability columns with the model name so they stay unique
        # after the column-wise concatenation below.
        suffixed = {
            "p(Contr)": "p(Contr)_" + model_name,
            "p(Entl)": "p(Entl)_" + model_name,
            "p(Neutr)": "p(Neutr)_" + model_name,
        }
        feature_frames.append(model_df.rename(columns=suffixed))

        print(csv_path)

    # Align all feature blocks on the index and place them side by side.
    X[split_key] = pd.concat(feature_frames, axis=1)

/kaggle/input/output/FacebookAI-roberta-large-mnli/val/val.model-agnostic.csv
/kaggle/input/output/FacebookAI-roberta-large-mnli_inverse/val/val.model-agnostic.csv
/kaggle/input/output/deberta-selfchecknli/val/val.model-agnostic.csv
/kaggle/input/output/deberta-selfchecknli_inverse/val/val.model-agnostic.csv
/kaggle/input/output/google_t5_xxl_true_nli_mixture/val/val.model-agnostic.csv
/kaggle/input/output/google_t5_xxl_true_nli_mixture_inverse/val/val.model-agnostic.csv
/kaggle/input/output/sentence-transformers-nli-roberta-large/val/val.model-agnostic.csv
/kaggle/input/output/openchat/val/val.model-agnostic.csv
/kaggle/input/output/microsoft-deberta-base-mnli/val/val.model-agnostic.csv
/kaggle/input/output/microsoft-deberta-large-mnli/val/val.model-agnostic.csv
/kaggle/input/output/microsoft-deberta-xlarge-mnli/val/val.model-agnostic.csv
/kaggle/input/output/microsoft-deberta-v2-xlarge-mnli/val/val.model-agnostic.csv
/kaggle/input/output/microsoft-deberta-v2-xxlarge-mnli/val/val.mode

In [51]:
# Sanity check: show what is mounted under the Kaggle input directory.
# (`os` is already imported in the imports cell at the top of the notebook.)
print(os.listdir("/kaggle/input/"))

['metamodel_labels.cbm', 'score.py', 'output', 'metamodel_probs.cbm', 'data']


In [52]:
# Inspect the validation features and targets side by side.
display(X['val-agnostic'], y['val-agnostic'])

Unnamed: 0,DM,MT,PG,hyp_len,tgt_len,src_len,p(Contr)_FacebookAI-roberta-large-mnli,p(Entl)_FacebookAI-roberta-large-mnli,p(Contr)_FacebookAI-roberta-large-mnli_inverse,p(Entl)_FacebookAI-roberta-large-mnli_inverse,...,p(Neutr)_microsoft-deberta-large-mnli,p(Contr)_microsoft-deberta-xlarge-mnli,p(Entl)_microsoft-deberta-xlarge-mnli,p(Neutr)_microsoft-deberta-xlarge-mnli,p(Contr)_microsoft-deberta-v2-xlarge-mnli,p(Entl)_microsoft-deberta-v2-xlarge-mnli,p(Neutr)_microsoft-deberta-v2-xlarge-mnli,p(Contr)_microsoft-deberta-v2-xxlarge-mnli,p(Entl)_microsoft-deberta-v2-xxlarge-mnli,p(Neutr)_microsoft-deberta-v2-xxlarge-mnli
0,1,0,0,41,36,316,0.001844,0.998156,0.003966,0.996034,...,0.073263,0.001269,0.937311,0.061420,0.001208,0.956369,0.042423,0.002282,0.913186,0.084532
1,1,0,0,32,1,451,0.289209,0.710791,0.150303,0.849697,...,0.743700,0.056093,0.335795,0.608112,0.026490,0.200199,0.773311,0.034473,0.020775,0.944752
2,1,0,0,34,36,181,0.993463,0.006537,0.932218,0.067782,...,0.126522,0.409416,0.335741,0.254843,0.669810,0.130411,0.199779,0.604200,0.209205,0.186595
3,1,0,0,17,100,149,0.094564,0.905436,0.229191,0.770809,...,0.019645,0.004535,0.980909,0.014556,0.052765,0.911118,0.036117,0.007620,0.975416,0.016964
4,1,0,0,39,63,441,0.013538,0.986462,0.723117,0.276883,...,0.041360,0.008921,0.891682,0.099397,0.003898,0.962758,0.033344,0.013182,0.932905,0.053913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,1,0,32,39,30,0.001673,0.998327,0.001751,0.998249,...,0.001543,0.000507,0.994063,0.005430,0.000151,0.998987,0.000862,0.000825,0.995205,0.003970
495,0,1,0,42,46,48,0.004702,0.995298,0.015190,0.984810,...,0.077747,0.024779,0.931660,0.043561,0.005710,0.983528,0.010762,0.001241,0.991755,0.007004
496,0,1,0,43,42,43,0.001838,0.998162,0.002023,0.997977,...,0.003287,0.000437,0.992136,0.007427,0.000263,0.998185,0.001552,0.000795,0.994654,0.004551
497,0,1,0,28,31,22,0.000960,0.999040,0.000589,0.999411,...,0.002048,0.000396,0.993571,0.006033,0.000123,0.998568,0.001309,0.000770,0.994802,0.004428


Unnamed: 0,label,p
0,Not Hallucination,0.2
1,Hallucination,0.8
2,Not Hallucination,0.0
3,Not Hallucination,0.2
4,Not Hallucination,0.0
...,...,...
494,Not Hallucination,0.0
495,Not Hallucination,0.2
496,Not Hallucination,0.0
497,Not Hallucination,0.0


# Testing our meta model

## Labels

In [53]:
# Load the pre-trained label (classification) meta-model from its .cbm file.
MODEL_PATH = '/kaggle/input/metamodel_labels.cbm'
new_model = cb.CatBoostClassifier()
new_model.load_model(fname=MODEL_PATH, format='cbm')

<catboost.core.CatBoostClassifier at 0x7fdd988b4bb0>

In [54]:
# Score the label meta-model on the test split.
# The report lives inside the `if`: with the classifier disabled there is
# nothing to report, and printing outside the branch would either raise a
# NameError or show stale values left in the kernel by an earlier run.
if USE_CATBOOST_CLASSIFIER:
    sample = 'test'
    y_test = y[f'{sample}-agnostic']['label']

    preds_label = new_model.predict(X[f'{sample}-agnostic'])

    accuracy = accuracy_score(y_test, preds_label)

    print(f'{sample}-agnostic - done!')
    print('Accuracy obtained is:', accuracy)
else:
    print('USE_CATBOOST_CLASSIFIER is False - label meta-model not evaluated.')

test-agnostic - done!
Accuracy obtained is: 0.822


In [55]:
# Class balance check: how many entries contain 'Not' (i.e. 'Not Hallucination')
# versus the rest, for the predictions (c1, d1) and the reference labels (c2, d2).
c1 = sum('Not' in label for label in preds_label)
d1 = len(preds_label) - c1
c2 = sum('Not' in label for label in y_test)
d2 = len(y_test) - c2
(c1, d1), (c2, d2)

((932, 568), (889, 611))

## Probabilities

In [56]:
# Load the pre-trained probability (regression) meta-model from its .cbm file.
# Note: this rebinds `new_model`, replacing the classifier loaded earlier.
MODEL_PATH = '/kaggle/input/metamodel_probs.cbm'
new_model = cb.CatBoostRegressor()
new_model.load_model(fname=MODEL_PATH, format='cbm')

<catboost.core.CatBoostRegressor at 0x7fdd988b5330>

In [57]:
# Score the probability meta-model on the test split.
# `mean_absolute_error` is imported in the imports cell at the top of the notebook.
sample = 'test'
y_test = y[f'{sample}-agnostic']['p']

# Clamp the regressor's raw predictions into the valid probability range [0, 1].
preds_proba = np.clip(new_model.predict(X[f'{sample}-agnostic']), 0.0, 1.0)

# MAE is kept for interactive inspection; the reported metric is Spearman's rho.
mae = mean_absolute_error(y_test, preds_proba)
spearman = spearmanr(preds_proba, y_test)[0]

print(f'{sample}-agnostic - done!')
print('Spearman (rho) obtained is:', spearman)

test-agnostic - done!
Spearman (rho) obtained is: 0.7400489499199349


# Formatting the scores and labels

In [58]:
# Assemble the submission table: one row per test sample with its id,
# predicted label and predicted hallucination probability.
sample = 'test'

if USE_CATBOOST_CLASSIFIER:
    # Labels come straight from the classifier meta-model. `ravel` is a no-op
    # for a 1-D prediction array and guards against a column-vector shaped
    # one, which the DataFrame constructor would reject.
    labels = np.asarray(preds_label).ravel()
else:
    # Fallback: threshold the predicted probability at 0.5.
    labels = np.where(
        np.asarray(preds_proba) > 0.5, 'Hallucination', 'Not Hallucination'
    ).tolist()

submission = pd.DataFrame({
    'id': X[f'{sample}-agnostic'].index.to_list(),
    'label': labels,
    'p(Hallucination)': preds_proba,
})
display(submission)

Unnamed: 0,id,label,p(Hallucination)
0,1,Not Hallucination,0.443453
1,2,Not Hallucination,0.460856
2,3,Not Hallucination,0.395440
3,6,Hallucination,0.543602
4,7,Hallucination,0.484745
...,...,...,...
1495,2992,Not Hallucination,0.399355
1496,2993,Hallucination,0.509339
1497,2994,Not Hallucination,0.422509
1498,2996,Not Hallucination,0.410471


## Save to JSON

In [59]:
# Remove a submission file left over from a previous run so that the
# "File already exists." guard in the save step does not trip on re-execution.
# `missing_ok=True` makes this a silent no-op on a fresh run, and it needs no shell.
json_file_path = f'{SUBMISSION_PATH}{sample}.model-agnostic.json'
Path(json_file_path).unlink(missing_ok=True)

In [60]:
# Write the submission as a JSON array of records.
sample = 'test'
json_file_path = f'{SUBMISSION_PATH}{sample}.model-agnostic.json'

# Never silently overwrite an existing submission.
already_present = os.path.exists(json_file_path)
assert not already_present, 'File already exists.'

submission.to_json(json_file_path, orient='records')

print(json_file_path)

/kaggle/working/meta_model/new_test_article/test.model-agnostic.json


## Evaluate

In [61]:
sample = 'test'
if sample == 'val':
    !python /kaggle/input/score.py {SUBMISSION_PATH} {REFERENCE_PATH} {RESULT_PATH} --is_val
#     score(SUBMISSION_PATH, REFERENCE_PATH, RESULT_PATH, is_val=True)
elif sample == 'test':
    !python /kaggle/input/score.py {SUBMISSION_PATH} {REFERENCE_PATH} {RESULT_PATH}
#     score(SUBMISSION_PATH, REFERENCE_PATH, RESULT_PATH, is_val=False)

with open(RESULT_PATH) as fp:
    print(fp.read())

agnostic_acc:0.822
agnostic_rho:0.7400489499199349

