# Setup

In [1]:
import os
from pathlib import Path
import statistics

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, RepeatedStratifiedKFold

from tqdm import tqdm
import catboost as cb
import optuna

In [2]:
import os
print(os.listdir("/kaggle/input/"))

['nlp-project-dataset']


In [3]:
AUTO_FIND_HYPERPARAMETERS = True
MANUAL_FIND_HYPERPARAMETERS = False
EVALUATE_ON_VAL = True
USE_CATBOOST_CLASSIFIER = True
USE_TASK_FEATURES = True
USE_LENGTH_FEATURES = True
USE_NEUTRAL_PROBA_FEATURE = False
RUN_NAME = 'new_test_article'

INPUT_DATA_PATH = '/kaggle/input/nlp-project-dataset/data/'
REFERENCE_PATH = '/kaggle/input/nlp-project-dataset/data/test/'
OUTPUT_DATA_PATH = '/kaggle/input/nlp-project-dataset/output/'
SUBMISSION_PATH = f'/kaggle/working/meta_model/{RUN_NAME}/'
RESULT_PATH = f'/kaggle/working/meta_model/{RUN_NAME}/test.score.txt'
MODELS = [
    'FacebookAI-roberta-large-mnli',
    'FacebookAI-roberta-large-mnli_inverse',
    'deberta-selfchecknli',
    'deberta-selfchecknli_inverse',
    'google_t5_xxl_true_nli_mixture',
    'google_t5_xxl_true_nli_mixture_inverse',
    'sentence-transformers-nli-roberta-large',
    'openchat',
    'microsoft-deberta-base-mnli', 
    'microsoft-deberta-large-mnli', 
    'microsoft-deberta-xlarge-mnli',
    'microsoft-deberta-v2-xlarge-mnli',
    'microsoft-deberta-v2-xxlarge-mnli',
    ]

RANDOM_SEED = 12345

Path(SUBMISSION_PATH).mkdir(parents=True, exist_ok=True)

# Load data

In [4]:
X = {}
y = {}
for sample in ['val', 'test']:
    X[f'{sample}-agnostic'] = []

    # Load 'task' and 'length' features
    file = f'{INPUT_DATA_PATH}{sample}/{sample}.model-agnostic.json'
    df = pd.read_json(file, orient='records')
    
    if sample == 'test':
        df.set_index('id', drop=True, inplace=True)
    
    if USE_TASK_FEATURES:
        if AUTO_FIND_HYPERPARAMETERS:
            df['DM'] = df['task'].map({'DM': 1, 'MT': 0, 'PG': 0})
            df['MT'] = df['task'].map({'DM': 0, 'MT': 1, 'PG': 0})
            X[f'{sample}-agnostic'].append(df[['DM', 'MT']])
    
    if USE_LENGTH_FEATURES:
        df['hyp_len'] = df['hyp'].str.len()
        df['tgt_len'] = df['tgt'].str.len()
        df['src_len'] = df['src'].str.len()
        X[f'{sample}-agnostic'].append(df[['hyp_len', 'tgt_len', 'src_len']])
    
    print(file)

    # Load y
    df.rename(columns={"p(Hallucination)": "p"}, inplace=True)
    y[f'{sample}-agnostic'] = df[['label', 'p']]

    for model in MODELS:
        # Load features from models
        file = f'{OUTPUT_DATA_PATH}{model}/{sample}/{sample}.model-agnostic.csv'
        df = pd.read_csv(file)
        
        if sample == 'test':
            df.set_index('id', drop=True, inplace=True)
        
        if USE_NEUTRAL_PROBA_FEATURE:
            if "p(Entl)" in df:
                df['p(Neutr)'] = 1 - df['p(Contr)'] - df['p(Entl)']
            else:
                df['p(Entl)'] = 1 - df['p(Contr)']
        
        df.rename(columns={
            "p(Contr)": "p(Contr)_" + model,
            "p(Entl)": "p(Entl)_" + model,
            "p(Neutr)": "p(Neutr)_" + model,
            }, inplace=True)
        
        X[f'{sample}-agnostic'].append(df)
        
        print(file)

    # Concatenate X dataframes
    X[f'{sample}-agnostic'] = pd.concat(X[f'{sample}-agnostic'], axis=1)

/kaggle/input/nlp-project-dataset/data/val/val.model-agnostic.json
/kaggle/input/nlp-project-dataset/output/FacebookAI-roberta-large-mnli/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/FacebookAI-roberta-large-mnli_inverse/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/deberta-selfchecknli/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/deberta-selfchecknli_inverse/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/google_t5_xxl_true_nli_mixture/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/google_t5_xxl_true_nli_mixture_inverse/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/sentence-transformers-nli-roberta-large/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/openchat/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/microsoft-deberta-base-mnli/val/val.model-agnostic.csv
/kaggle/input/nlp-project-dataset/output/microsoft-deber

In [5]:
# Print dataframe names
print('For X data:')
for df_name in X:
    print('\t', df_name)
print()
print('For y data:')
for df_name in y:
    print('\t', df_name)

For X data:
	 val-agnostic
	 test-agnostic

For y data:
	 val-agnostic
	 test-agnostic


In [6]:
X['val-agnostic']

Unnamed: 0,DM,MT,hyp_len,tgt_len,src_len,p(Contr)_FacebookAI-roberta-large-mnli,p(Contr)_FacebookAI-roberta-large-mnli_inverse,p(Contr)_deberta-selfchecknli,p(Contr)_deberta-selfchecknli_inverse,p(Contr)_google_t5_xxl_true_nli_mixture,...,p(Contr)_microsoft-deberta-base-mnli,p(Entl)_microsoft-deberta-base-mnli,p(Contr)_microsoft-deberta-large-mnli,p(Entl)_microsoft-deberta-large-mnli,p(Contr)_microsoft-deberta-xlarge-mnli,p(Entl)_microsoft-deberta-xlarge-mnli,p(Contr)_microsoft-deberta-v2-xlarge-mnli,p(Entl)_microsoft-deberta-v2-xlarge-mnli,p(Contr)_microsoft-deberta-v2-xxlarge-mnli,p(Entl)_microsoft-deberta-v2-xxlarge-mnli
0,1,0,41,36,316,0.001844,0.003966,0.002097,0.027509,0,...,0.001249,0.918769,0.000930,0.925807,0.001269,0.937311,0.001208,0.956369,0.002282,0.913186
1,1,0,32,1,451,0.289209,0.150303,0.643224,0.616778,1,...,0.043280,0.451597,0.053752,0.202548,0.056093,0.335795,0.026490,0.200199,0.034473,0.020775
2,1,0,34,36,181,0.993463,0.932218,0.707190,0.608000,1,...,0.367452,0.423152,0.843042,0.030436,0.409416,0.335741,0.669810,0.130411,0.604200,0.209205
3,1,0,17,100,149,0.094564,0.229191,0.008859,0.861013,0,...,0.045262,0.933873,0.010749,0.969606,0.004535,0.980909,0.052765,0.911118,0.007620,0.975416
4,1,0,39,63,441,0.013538,0.723117,0.004337,0.934161,0,...,0.209675,0.284467,0.030628,0.928012,0.008921,0.891682,0.003898,0.962758,0.013182,0.932905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,1,32,39,30,0.001673,0.001751,0.000138,0.000098,0,...,0.000238,0.997373,0.000179,0.998278,0.000507,0.994063,0.000151,0.998987,0.000825,0.995205
495,0,1,42,46,48,0.004702,0.015190,0.000263,0.000158,0,...,0.002412,0.963516,0.022022,0.900231,0.024779,0.931660,0.005710,0.983528,0.001241,0.991755
496,0,1,43,42,43,0.001838,0.002023,0.000291,0.000118,0,...,0.000273,0.996779,0.000256,0.996457,0.000437,0.992136,0.000263,0.998185,0.000795,0.994654
497,0,1,28,31,22,0.000960,0.000589,0.000053,0.000040,0,...,0.000354,0.995894,0.000149,0.997803,0.000396,0.993571,0.000123,0.998568,0.000770,0.994802


In [7]:
y['val-agnostic']

Unnamed: 0,label,p
0,Not Hallucination,0.2
1,Hallucination,0.8
2,Not Hallucination,0.0
3,Not Hallucination,0.2
4,Not Hallucination,0.0
...,...,...
494,Not Hallucination,0.0
495,Not Hallucination,0.2
496,Not Hallucination,0.0
497,Not Hallucination,0.0


# CatBoost Classifier
## Find parameters

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
        'depth': trial.suggest_int('depth', 6, 12),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 10),
        'border_count': trial.suggest_int('border_count', 5, 200),
        'verbose': False,
        'task_type': 'GPU',
        'random_seed': RANDOM_SEED
    }

    model = cb.CatBoostClassifier(**params)

    # Cross validation
    sample = 'val'
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
    accuracy = cross_val_score(model, X[f'{sample}-agnostic'], y[f'{sample}-agnostic']['label'], cv=cv, scoring='accuracy').mean()

    return accuracy


best_params_dict = {}
if USE_CATBOOST_CLASSIFIER and AUTO_FIND_HYPERPARAMETERS:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)
    best_params_dict = study.best_params
#     best_params_dict = {'iterations': 216, 'learning_rate': 0.0102019005543733, 'depth': 12, 'l2_leaf_reg': 0.09976943036124933, 'border_count': 39}


elif USE_CATBOOST_CLASSIFIER:
    best_params_dict = {'iterations': 216, 'learning_rate': 0.0102019005543733, 'depth': 12, 'l2_leaf_reg': 0.09976943036124933, 'border_count': 39}

[I 2024-04-25 08:23:37,112] A new study created in memory with name: no-name-e94b49b5-9035-4475-8111-3f698bf9cfa2
[I 2024-04-25 08:23:46,757] Trial 0 finished with value: 0.8336326530612244 and parameters: {'iterations': 9, 'learning_rate': 0.09478147302164794, 'depth': 12, 'l2_leaf_reg': 0.03180637028435203, 'border_count': 65}. Best is trial 0 with value: 0.8336326530612244.
[I 2024-04-25 08:23:52,607] Trial 1 finished with value: 0.8197551020408163 and parameters: {'iterations': 10, 'learning_rate': 0.021593991452118738, 'depth': 7, 'l2_leaf_reg': 0.04004889790660488, 'border_count': 76}. Best is trial 0 with value: 0.8336326530612244.
[I 2024-04-25 08:24:00,422] Trial 2 finished with value: 0.8457551020408163 and parameters: {'iterations': 7, 'learning_rate': 0.03183845011312829, 'depth': 12, 'l2_leaf_reg': 0.8685136884272956, 'border_count': 12}. Best is trial 2 with value: 0.8457551020408163.
[I 2024-04-25 08:24:05,956] Trial 3 finished with value: 0.8337551020408164 and paramete

In [10]:
if USE_CATBOOST_CLASSIFIER:
    print("Best Params:", best_params_dict)

Best Params: {'iterations': 7, 'learning_rate': 0.03183845011312829, 'depth': 12, 'l2_leaf_reg': 0.8685136884272956, 'border_count': 12}


# Training our meta model

In [11]:
if USE_CATBOOST_CLASSIFIER:
    sample = 'val'
    model = cb.CatBoostClassifier(**best_params_dict)
    model.fit(X[f'{sample}-agnostic'], y[f'{sample}-agnostic']['label'], verbose=1)

0:	learn: 0.6606565	total: 15.6ms	remaining: 93.5ms
1:	learn: 0.6292072	total: 25.6ms	remaining: 64.1ms
2:	learn: 0.6017202	total: 35.5ms	remaining: 47.4ms
3:	learn: 0.5842324	total: 36.7ms	remaining: 27.5ms
4:	learn: 0.5591999	total: 46ms	remaining: 18.4ms
5:	learn: 0.5338759	total: 55.3ms	remaining: 9.22ms
6:	learn: 0.5111723	total: 64.8ms	remaining: 0us


In [12]:
MODEL_PATH = '/'.join(SUBMISSION_PATH.split('/')[:-1]) + '/metamodel_labels.cbm'
print(f'{MODEL_PATH}')
# model.save_model(MODEL_PATH, format="cbm", export_parameters=None)
model.save_model(MODEL_PATH)

/kaggle/working/meta_model/new_test_article/metamodel_labels.cbm


## Predict

In [13]:
new_model = cb.CatBoostClassifier()
new_model.load_model(MODEL_PATH)

<catboost.core.CatBoostClassifier at 0x7b273c09f7f0>

In [14]:
if USE_CATBOOST_CLASSIFIER:  
    sample = 'test'
    y_test = y[f'{sample}-agnostic']['label']

    preds_label = new_model.predict(X[f'{sample}-agnostic'])
#     preds_label_ = pd.Series(preds_label).apply(lambda x: 'Hallucination' if x > 0.5 else 'Not Hallucination').to_list()

    accuracy = accuracy_score(y_test, preds_label)
    spearman = spearmanr(preds_label, y_test)[0]
print(f'{sample}-agnostic - done!')
print('Accuracy obtained is:', accuracy)
print('Spearman (rho) obtained is:', spearman)

test-agnostic - done!
Accuracy obtained is: 0.822
Spearman (rho) obtained is: 0.6283685333499764


In [15]:
# print(np.unique(preds_label), len(preds_label), len(y_test), set(y_test))
c1, c2 = 0, 0
d1, d2 = 0, 0
for i in preds_label:
    if 'Not' in i:
        c1 += 1
    else:
        d1 += 1
for i in y_test:
    if 'Not' in i:
        c2 += 1
    else:
        d2 += 1
(c1, d1), (c2, d2)

((932, 568), (889, 611))

# CatBoost Regressor

## Train on 100 % of data

In [16]:
sample = 'val'
model = cb.CatBoostRegressor(**best_params_dict)
model.fit(X[f'{sample}-agnostic'], y[f'{sample}-agnostic']['p'], verbose=1)

0:	learn: 0.3453029	total: 7.24ms	remaining: 43.4ms
1:	learn: 0.3387809	total: 13.9ms	remaining: 34.7ms
2:	learn: 0.3327641	total: 20.2ms	remaining: 27ms
3:	learn: 0.3265059	total: 27.4ms	remaining: 20.6ms
4:	learn: 0.3206510	total: 29.7ms	remaining: 11.9ms
5:	learn: 0.3152983	total: 35.6ms	remaining: 5.94ms
6:	learn: 0.3097425	total: 41.9ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7b273c09f610>

In [17]:
MODEL_PATH = '/'.join(SUBMISSION_PATH.split('/')[:-1]) + '/metamodel_probs.cbm'
# print(f'{MODEL_PATH}')
model.save_model(MODEL_PATH, format="cbm", export_parameters=None)

## Predict

In [18]:
new_model = cb.CatBoostRegressor()
new_model.load_model(MODEL_PATH, format="cbm")

<catboost.core.CatBoostRegressor at 0x7b2784d69930>

In [19]:
from sklearn.metrics import mean_absolute_error
sample = 'test'
y_test = y[f'{sample}-agnostic']['p']
# y_test_label = pd.Series(y_test).apply(lambda x: 'Hallucination' if x > 0.5 else 'Not Hallucination').to_list()


preds_proba = model.predict(X[f'{sample}-agnostic'])
np.clip(preds_proba, 0.0, 1.0, out=preds_proba)
# preds_proba_label = pd.Series(preds_proba).apply(lambda x: 'Hallucination' if x > 0.5 else 'Not Hallucination').to_list()

mae = mean_absolute_error(y_test, preds_proba)
spearman = spearmanr(preds_proba, y_test)[0]

print(f'{sample}-agnostic - done!')
print('MAE obtained is:', mae)
print('Spearman (rho) obtained is:', spearman)

test-agnostic - done!
MAE obtained is: 0.279055067978355
Spearman (rho) obtained is: 0.7400489499199349


# Submit
## Form dataframe

In [20]:
sample = 'test'
submission = {}
if USE_CATBOOST_CLASSIFIER:
    labels = preds_label
else:
    labels = pd.Series(preds_proba).apply(lambda x: 'Hallucination' if x > 0.5 else 'Not Hallucination').to_list()

submission = pd.DataFrame({
    'id': X[f'{sample}-agnostic'].index.to_list(),
    'label': labels,
    'p(Hallucination)': preds_proba,
    })
display(submission)

Unnamed: 0,id,label,p(Hallucination)
0,1,Not Hallucination,0.443453
1,2,Not Hallucination,0.460856
2,3,Not Hallucination,0.395440
3,6,Hallucination,0.543602
4,7,Hallucination,0.484745
...,...,...,...
1495,2992,Not Hallucination,0.399355
1496,2993,Hallucination,0.509339
1497,2994,Not Hallucination,0.422509
1498,2996,Not Hallucination,0.410471


## Save to json

In [21]:
json_file_path = f'{SUBMISSION_PATH}{sample}.model-agnostic.json'
!rm $json_file_path

In [22]:
sample = 'test'
json_file_path = f'{SUBMISSION_PATH}{sample}.model-agnostic.json'

# Check if file exists
assert not os.path.exists(json_file_path), 'File already exists.'

# Save to .json file
submission.to_json(path_or_buf=json_file_path, orient='records')

print(json_file_path)

/kaggle/working/meta_model/new_test_article/test.model-agnostic.json


## Evaluate

In [23]:
sample = 'test'
if sample == 'val':
    !python /kaggle/input/nlp-project-dataset/score.py {SUBMISSION_PATH} {REFERENCE_PATH} {RESULT_PATH} --is_val
#     score(SUBMISSION_PATH, REFERENCE_PATH, RESULT_PATH, is_val=True)
elif sample == 'test':
    !python /kaggle/input/nlp-project-dataset/score.py {SUBMISSION_PATH} {REFERENCE_PATH} {RESULT_PATH}
#     score(SUBMISSION_PATH, REFERENCE_PATH, RESULT_PATH, is_val=False)

with open(RESULT_PATH) as fp:
    print(fp.read())

agnostic_acc:0.822
agnostic_rho:0.7400489499199349



# Feature importance

In [24]:
display(pd.DataFrame({
    'feature_importance': model.get_feature_importance(), 
    'feature_names': X[f'val-agnostic'].columns,
}).sort_values(by=['feature_importance'], ascending=False))

Unnamed: 0,feature_importance,feature_names
9,16.530455,p(Contr)_google_t5_xxl_true_nli_mixture
2,8.775044,hyp_len
4,8.348604,src_len
8,7.320123,p(Contr)_deberta-selfchecknli_inverse
17,6.691777,p(Contr)_microsoft-deberta-xlarge-mnli
20,6.534162,p(Entl)_microsoft-deberta-v2-xlarge-mnli
5,5.600031,p(Contr)_FacebookAI-roberta-large-mnli
3,5.340725,tgt_len
12,4.847838,p(Contr)_openchat
7,3.920535,p(Contr)_deberta-selfchecknli
