In [1]:
# pip install scikit-learn
# pip install fasttext-langdetect
# pip install matplotlib

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from sklearn.metrics import classification_report, roc_curve, auc
from ftlangdetect import detect

In [3]:
def get_language(text):
    text = text.lower()
    res = detect(text=text.replace('\n', ' '), low_memory=False)
    if res['score'] > 0.5: return res['lang']
    return 'unknown'

# 1. Baseline

## Roberta

## 1.1 dev set

In [4]:
roberta_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
roberta_dev['language'] = [get_language(text) for text in roberta_dev['text']]

file = "prediction/xlm_roberta_base_multi_dev.jsonl"
temp = pd.read_json(file, lines=True)
roberta_dev[['roberta_label', 'roberta_prob']] = temp[['label', 'probs']]



### 1.1.1 Accuracy for baseline

In [5]:
print(classification_report(roberta_dev['label'], roberta_dev['roberta_label'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.82976   0.32900   0.47118      2000
           1    0.58154   0.93250   0.71634      2000

    accuracy                        0.63075      4000
   macro avg    0.70565   0.63075   0.59376      4000
weighted avg    0.70565   0.63075   0.59376      4000



### 1.1.2 Optimal classification threshold

In [6]:
roberta_dev_auc_dict = {}
languages = ['ru', 'ar', 'de']

# Calculate AUC and optimal threshold for each language
for lang in languages:
    lang_df = roberta_dev[roberta_dev['language'] == lang]
    if not lang_df.empty:
        fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df['roberta_prob'])
        lang_auc = auc(fpr, tpr)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        roberta_dev_auc_dict[lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

fpr, tpr, thresholds = roc_curve(roberta_dev['label'], roberta_dev['roberta_prob'])
overall_auc = auc(fpr, tpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
roberta_dev_auc_dict['overall'] = {'auc': overall_auc, 'th_optim': optimal_threshold}

roberta_dev_auc_dict

{'ru': {'auc': 0.6693805894308943, 'th_optim': 0.9999693632000001},
 'ar': {'auc': 0.9978815261044177, 'th_optim': 0.9999364614},
 'de': {'auc': 0.966314, 'th_optim': 0.9999710321},
 'overall': {'auc': 0.855895125, 'th_optim': 0.9999693632000001}}

## 1.2 test set

In [7]:
roberta_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
roberta_test['language'] = [get_language(text) for text in roberta_test['text']]

file = "prediction/xlm_roberta_base_multi_test.jsonl"
temp = pd.read_json(file, lines=True)
roberta_test[['roberta_label', 'roberta_prob']] = temp[['label', 'probs']]

### 1.2.1 Accuracy for baseline

In [8]:
print(classification_report(roberta_test['label'], roberta_test['roberta_label'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.99756   0.48547   0.65311     20238
           1    0.67989   0.99892   0.80909     22140

    accuracy                        0.75372     42378
   macro avg    0.83872   0.74219   0.73110     42378
weighted avg    0.83160   0.75372   0.73460     42378



### 1.2.2 Optimal classification threshold

In [9]:
roberta_test_auc_dict = {}
languages = ['en', 'de', 'ar', 'it']

# Calculate AUC and optimal threshold for each language
for lang in languages:
    lang_df = roberta_test[roberta_test['language'] == lang]
    if not lang_df.empty:
        fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df['roberta_prob'])
        lang_auc = auc(fpr, tpr)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        roberta_test_auc_dict[lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

fpr, tpr, thresholds = roc_curve(roberta_test['label'], roberta_test['roberta_prob'])
overall_auc = auc(fpr, tpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
roberta_test_auc_dict['overall'] = {'auc': overall_auc, 'th_optim': optimal_threshold}

roberta_test_auc_dict

{'en': {'auc': 0.9935276806792759, 'th_optim': 0.9999765158},
 'de': {'auc': 0.9910564256358992, 'th_optim': 0.9999238253},
 'ar': {'auc': 0.9968044659514713, 'th_optim': 0.9999694824},
 'it': {'auc': 0.9993302917734165, 'th_optim': 0.9999756813},
 'overall': {'auc': 0.9916376075469752, 'th_optim': 0.9999754429000001}}

## Llama

## 1.1 dev set

In [10]:
llama_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
llama_dev['language'] = [get_language(text) for text in llama_dev['text']]

file = "prediction/llama_multi_dev.jsonl"
temp = pd.read_json(file, lines=True)
llama_dev[['llama_label', 'llama_prob']] = temp[['label', 'probs']]

### 1.1.1 Accuracy for baseline

In [11]:
print(classification_report(llama_dev['label'], llama_dev['llama_label'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.93404   0.48150   0.63543      2000
           1    0.65072   0.96600   0.77762      2000

    accuracy                        0.72375      4000
   macro avg    0.79238   0.72375   0.70653      4000
weighted avg    0.79238   0.72375   0.70653      4000



### 1.1.2 Optimal classification threshold

In [12]:
llama_dev_auc_dict = {}
languages = ['ru', 'ar', 'de']

# Calculate AUC and optimal threshold for each language
for lang in languages:
    lang_df = llama_dev[llama_dev['language'] == lang]
    if not lang_df.empty:
        fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df['llama_prob'])
        lang_auc = auc(fpr, tpr)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        llama_dev_auc_dict[lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

fpr, tpr, thresholds = roc_curve(llama_dev['label'], llama_dev['llama_prob'])
overall_auc = auc(fpr, tpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
llama_dev_auc_dict['overall'] = {'auc': overall_auc, 'th_optim': optimal_threshold}

llama_dev_auc_dict

{'ru': {'auc': 0.8439441056910568, 'th_optim': 1.0},
 'ar': {'auc': 0.9579477911646587, 'th_optim': 0.9990234375},
 'de': {'auc': 0.9373279999999999, 'th_optim': 1.0},
 'overall': {'auc': 0.9026313749999999, 'th_optim': 1.0}}

## 1.2 test set

In [13]:
llama_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
llama_test['language'] = [get_language(text) for text in llama_test['text']]

file = "prediction/llama_multi_test.jsonl"
temp = pd.read_json(file, lines=True)
llama_test[['llama_label', 'llama_prob']] = temp[['label', 'probs']]

### 1.2.1 Accuracy for baseline

In [14]:
print(classification_report(llama_test['label'], llama_test['llama_label'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.97570   0.70234   0.81676     20238
           1    0.78339   0.98401   0.87231     22140

    accuracy                        0.84950     42378
   macro avg    0.87954   0.84318   0.84453     42378
weighted avg    0.87523   0.84950   0.84578     42378



### 1.2.2 Optimal classification threshold

In [15]:
llama_test_auc_dict = {}
languages = ['en', 'de', 'ar', 'it']

# Calculate AUC and optimal threshold for each language
for lang in languages:
    lang_df = llama_test[llama_test['language'] == lang]
    if not lang_df.empty:
        fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df['llama_prob'])
        lang_auc = auc(fpr, tpr)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        llama_test_auc_dict[lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

fpr, tpr, thresholds = roc_curve(llama_test['label'], llama_test['llama_prob'])
overall_auc = auc(fpr, tpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
llama_test_auc_dict['overall'] = {'auc': overall_auc, 'th_optim': optimal_threshold}

llama_test_auc_dict

{'en': {'auc': 0.9297217894764578, 'th_optim': 1.0},
 'de': {'auc': 0.9666838276130177, 'th_optim': 0.4055175781},
 'ar': {'auc': 0.9905688685180518, 'th_optim': 0.99609375},
 'it': {'auc': 0.9954626969591409, 'th_optim': 1.0},
 'overall': {'auc': 0.9431453061771782, 'th_optim': 1.0}}

# 2. Metric-based methods

## 2.1 dev set

In [16]:
s5_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
s5_dev['language'] = [get_language(text) for text in s5_dev['text']]

file = "prediction/rank_entropy_ll_logrank_dev_statistic_metric.jsonl"
temp = pd.read_json(file, lines=True)
s5_dev[['rank', 'entropy', 'likelihood', 'log_rank']] = temp[['rank', 'entropy', 'likelihood', 'log_rank']]

file = "prediction/binocular_metric_dev.csv"
temp = pd.read_csv(file)
s5_dev[['binocular']] = temp[['binocular']]

### 2.1.1 Optimal classification threshold

In [17]:
#optimal classification threshold calculations
languages = ['ru', 'de', 'ar']
s5_dev_auc_dict = {}

for model in [x for x in s5_dev.columns.to_list()[6:]]:
    labels = s5_dev['label']
    fpr, tpr, thresholds = roc_curve(labels, s5_dev[model])
    s5_dev_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = s5_dev[s5_dev.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        s5_dev_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

s5_dev_auc_dict

{'rank': {'auc': 0.5227096250000001,
  'th_optim': -194.61175537109375,
  'ru': {'auc': 0.3832794715447154, 'th_optim': -64.93841552734375},
  'de': {'auc': 0.7299720000000001, 'th_optim': -194.61175537109375},
  'ar': {'auc': 0.7720020080321286, 'th_optim': -5.068426132202148}},
 'entropy': {'auc': 0.54484175,
  'th_optim': 1.805957436561584,
  'ru': {'auc': 0.7248231707317074, 'th_optim': 1.805957436561584},
  'de': {'auc': 0.3933, 'th_optim': 2.955816268920898},
  'ar': {'auc': 0.3155582329317269, 'th_optim': 1.7562482357025142}},
 'likelihood': {'auc': 0.5027961249999999,
  'th_optim': -2.023075819015503,
  'ru': {'auc': 0.34253607723577234, 'th_optim': -2.04640245437622},
  'de': {'auc': 0.74556, 'th_optim': -3.606225967407226},
  'ar': {'auc': 0.8104979919678715, 'th_optim': -2.023049831390381}},
 'log_rank': {'auc': 0.50150075,
  'th_optim': -0.9323371648788451,
  'ru': {'auc': 0.34744207317073167, 'th_optim': -0.9259398579597471},
  'de': {'auc': 0.746356, 'th_optim': -2.008877

### 2.1.2 Accuracy for S5

In [18]:
s5 = pd.DataFrame() #'rank', 'entropy', 'likelihood', 'log-rank', 'binocular'
selected = 'rank'
use_th = 'th_optim'
s5[selected] = [1 if (lang in languages) and (prob>=s5_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_dev['language'], s5_dev[selected])]
selected = 'entropy'
s5[selected] = [1 if (lang in languages) and (prob>=s5_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_dev['language'], s5_dev[selected])]
selected = 'likelihood'
s5[selected] = [1 if (lang in languages) and (prob>=s5_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_dev['language'], s5_dev[selected])]
selected = 'log_rank'
s5[selected] = [1 if (lang in languages) and (prob>=s5_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_dev['language'], s5_dev[selected])]
selected = 'binocular'
s5[selected] = [1 if (lang in languages) and (prob>=s5_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_dev['language'], s5_dev[selected])]

#### rank

In [19]:
print(classification_report(s5_dev['label'], s5['rank'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.67788   0.38300   0.48946      2000
           1    0.57003   0.81800   0.67187      2000

    accuracy                        0.60050      4000
   macro avg    0.62396   0.60050   0.58066      4000
weighted avg    0.62396   0.60050   0.58066      4000



#### entropy

In [20]:
print(classification_report(s5_dev['label'], s5['entropy'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.77078   0.30600   0.43808      2000
           1    0.56706   0.90900   0.69842      2000

    accuracy                        0.60750      4000
   macro avg    0.66892   0.60750   0.56825      4000
weighted avg    0.66892   0.60750   0.56825      4000



#### likelihood

In [21]:
print(classification_report(s5_dev['label'], s5['likelihood'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.77197   0.34700   0.47879      2000
           1    0.57885   0.89750   0.70378      2000

    accuracy                        0.62225      4000
   macro avg    0.67541   0.62225   0.59128      4000
weighted avg    0.67541   0.62225   0.59128      4000



#### log_rank

In [22]:
print(classification_report(s5_dev['label'], s5['log_rank'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.73663   0.36500   0.48813      2000
           1    0.57793   0.86950   0.69435      2000

    accuracy                        0.61725      4000
   macro avg    0.65728   0.61725   0.59124      4000
weighted avg    0.65728   0.61725   0.59124      4000



#### binocular

In [23]:
print(classification_report(s5_dev['label'], s5['binocular'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.57080   0.65300   0.60914      2000
           1    0.59463   0.50900   0.54849      2000

    accuracy                        0.58100      4000
   macro avg    0.58272   0.58100   0.57882      4000
weighted avg    0.58272   0.58100   0.57882      4000



## 2.2 test set

In [24]:
s5_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
s5_test['language'] = [get_language(text) for text in s5_test['text']]

file = "prediction/rank_entropy_ll_logrank_test_statistic_metric.jsonl"
temp = pd.read_json(file, lines=True)
s5_test[['rank', 'entropy', 'likelihood', 'log_rank']] = temp[['rank', 'entropy', 'likelihood', 'log_rank']]

file = "prediction/binocular_metric_test.csv"
temp = pd.read_csv(file)
s5_test[['binocular']] = temp[['binocular']]

### 2.2.1 Optimal classification threshold

In [25]:
#optimal classification threshold calculations
languages = ['en', 'it', 'de', 'ar']
s5_test_auc_dict = {}

for model in [x for x in s5_test.columns.to_list()[4:]]:
    labels = s5_test['label']
    fpr, tpr, thresholds = roc_curve(labels, s5_test[model])
    s5_test_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = s5_test[s5_test.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        s5_test_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

s5_test_auc_dict

{'rank': {'auc': 0.734150589690006,
  'th_optim': -48.46562576293945,
  'en': {'auc': 0.8843203738822354, 'th_optim': -48.46562576293945},
  'it': {'auc': 0.7366540665997139, 'th_optim': -319.14190673828125},
  'de': {'auc': 0.8188589359102523, 'th_optim': -192.0410614013672},
  'ar': {'auc': 0.792478392750625, 'th_optim': -4.099065780639648}},
 'entropy': {'auc': 0.23592029019081243,
  'th_optim': inf,
  'en': {'auc': 0.05007472374873752, 'th_optim': inf},
  'it': {'auc': 0.12014288120423056, 'th_optim': 3.541979074478149},
  'de': {'auc': 0.28796329001443965, 'th_optim': inf},
  'ar': {'auc': 0.14586139813000068, 'th_optim': inf}},
 'likelihood': {'auc': 0.7904263864796635,
  'th_optim': -2.735857248306274,
  'en': {'auc': 0.9652668069951141, 'th_optim': -2.718686580657959},
  'it': {'auc': 0.8302371475957161, 'th_optim': -4.318907737731934},
  'de': {'auc': 0.9068092857936243, 'th_optim': -3.423327445983886},
  'ar': {'auc': 0.936290569684399, 'th_optim': -2.009902715682983}},
 'log

### 2.2.2 Accuracy for S5

In [26]:
s5 = pd.DataFrame() #'rank', 'entropy', 'likelihood', 'log-rank', 'binocular'
selected = 'rank'
use_th = 'th_optim'
s5[selected] = [1 if (lang in languages) and (prob>=s5_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_test['language'], s5_test[selected])]
selected = 'entropy'
s5[selected] = [1 if (lang in languages) and (prob>=s5_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_test['language'], s5_test[selected])]
selected = 'likelihood'
s5[selected] = [1 if (lang in languages) and (prob>=s5_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_test['language'], s5_test[selected])]
selected = 'log_rank'
s5[selected] = [1 if (lang in languages) and (prob>=s5_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_test['language'], s5_test[selected])]
selected = 'binocular'
s5[selected] = [1 if (lang in languages) and (prob>=s5_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=s5_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(s5_test['language'], s5_test[selected])]

#### rank

In [27]:
print(classification_report(s5_test['label'], s5['rank'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.74858   0.79430   0.77076     20238
           1    0.80085   0.75614   0.77786     22140

    accuracy                        0.77436     42378
   macro avg    0.77472   0.77522   0.77431     42378
weighted avg    0.77589   0.77436   0.77447     42378



#### entropy

In [28]:
print(classification_report(s5_test['label'], s5['entropy'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.47385   0.85003   0.60849     20238
           1    0.50025   0.13722   0.21536     22140

    accuracy                        0.47763     42378
   macro avg    0.48705   0.49363   0.41193     42378
weighted avg    0.48764   0.47763   0.40310     42378



#### likelihood

In [29]:
print(classification_report(s5_test['label'], s5['likelihood'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.86879   0.85167   0.86014     20238
           1    0.86681   0.88243   0.87455     22140

    accuracy                        0.86774     42378
   macro avg    0.86780   0.86705   0.86735     42378
weighted avg    0.86776   0.86774   0.86767     42378



#### log_rank

In [30]:
print(classification_report(s5_test['label'], s5['log_rank'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.87622   0.84297   0.85927     20238
           1    0.86127   0.89115   0.87595     22140

    accuracy                        0.86814     42378
   macro avg    0.86875   0.86706   0.86761     42378
weighted avg    0.86841   0.86814   0.86799     42378



#### binocular

In [31]:
print(classification_report(s5_test['label'], s5['binocular'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.47238   0.67867   0.55704     20238
           1    0.51113   0.30709   0.38367     22140

    accuracy                        0.48454     42378
   macro avg    0.49175   0.49288   0.47036     42378
weighted avg    0.49262   0.48454   0.46647     42378



# 3. LLM 

## 3.1 dev set

In [32]:
llm_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
llm_dev['language'] = [get_language(text) for text in llm_dev['text']]

file = "prediction/falcon_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm_dev['falcon'] = temp['probs']

file = "prediction/mistral_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm_dev['mistral'] = temp['probs']

### 3.1.1 Optimal classification threshold

In [33]:
#optimal classification threshold calculations
languages = ['ru', 'de', 'ar']
llm_dev_auc_dict = {}

for model in [x for x in llm_dev.columns.to_list()[6:]]:
    labels = llm_dev['label']
    fpr, tpr, thresholds = roc_curve(labels, llm_dev[model])
    llm_dev_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = llm_dev[llm_dev.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        llm_dev_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

llm_dev_auc_dict

{'falcon': {'auc': 0.877502625,
  'th_optim': 0.1291503906,
  'ru': {'auc': 0.9594547764227643, 'th_optim': 0.1722412109},
  'de': {'auc': 0.8454059999999999, 'th_optim': 1.0},
  'ar': {'auc': 0.7347630522088353, 'th_optim': 0.0041198730000000005}},
 'mistral': {'auc': 0.9039201250000001,
  'th_optim': 1.0,
  'ru': {'auc': 0.9077169715447154, 'th_optim': 0.5888671875},
  'de': {'auc': 0.966332, 'th_optim': 1.0},
  'ar': {'auc': 0.844726907630522, 'th_optim': 1.0}}}

### 3.1.2 Accuracy for LLM

#### Mistral

In [34]:
mistral = pd.DataFrame()
selected = 'mistral'
use_th = 'th_optim'
mistral[selected] = [1 if (lang in languages) and (prob>=llm_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm_dev['language'], llm_dev[selected])]

In [35]:
print(classification_report(llm_dev['label'], mistral['mistral'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.90067   0.87950   0.88996      2000
           1    0.88227   0.90300   0.89251      2000

    accuracy                        0.89125      4000
   macro avg    0.89147   0.89125   0.89123      4000
weighted avg    0.89147   0.89125   0.89123      4000



#### falcon

In [36]:
falcon = pd.DataFrame()
selected = 'falcon'
use_th = 'th_optim'
falcon[selected] = [1 if (lang in languages) and (prob>=llm_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm_dev['language'], llm_dev[selected])]

In [37]:
print(classification_report(llm_dev['label'], falcon['falcon'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.87827   0.82250   0.84947      2000
           1    0.83310   0.88600   0.85874      2000

    accuracy                        0.85425      4000
   macro avg    0.85568   0.85425   0.85410      4000
weighted avg    0.85568   0.85425   0.85410      4000



## 3.2 test set

In [38]:
llm_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
llm_test['language'] = [get_language(text) for text in llm_test['text']]

file = "prediction/falcon_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm_test['falcon'] = temp['probs']

file = "prediction/mistral_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm_test['mistral'] = temp['probs']

### 3.2.1 Optimal classification threshold

In [39]:
#optimal classification threshold calculations
languages = ['en', 'it', 'de', 'ar']
llm_test_auc_dict = {}

for model in [x for x in llm_test.columns.to_list()[4:]]:
    labels = llm_test['label']
    fpr, tpr, thresholds = roc_curve(labels, llm_test[model])
    llm_test_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = llm_test[llm_test.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        llm_test_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

llm_test_auc_dict

{'falcon': {'auc': 0.9491783302637191,
  'th_optim': 1.0,
  'en': {'auc': 0.9370427968754189, 'th_optim': 1.0},
  'it': {'auc': 0.9976826838099255, 'th_optim': 1.0},
  'de': {'auc': 0.9662147617460847, 'th_optim': 0.0046272278000000005},
  'ar': {'auc': 0.9901645541391457, 'th_optim': 0.9340820312}},
 'mistral': {'auc': 0.9730138876725591,
  'th_optim': 1.0,
  'en': {'auc': 0.9816146685862734, 'th_optim': 1.0},
  'it': {'auc': 0.9962090330731164, 'th_optim': 1.0},
  'de': {'auc': 0.9685180495390425, 'th_optim': 1.0},
  'ar': {'auc': 0.8002750425617939, 'th_optim': 1.0}}}

### 3.2.2 Accuracy for LLM

#### Mistral

In [40]:
mistral = pd.DataFrame()
selected = 'mistral'
use_th = 'th_optim'
mistral[selected] = [1 if (lang in languages) and (prob>=llm_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm_test['language'], llm_test[selected])]

In [41]:
print(classification_report(llm_test['label'], mistral['mistral'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.99647   0.94723   0.97122     20238
           1    0.95385   0.99693   0.97491     22140

    accuracy                        0.97319     42378
   macro avg    0.97516   0.97208   0.97307     42378
weighted avg    0.97420   0.97319   0.97315     42378



#### falcon

In [42]:
falcon = pd.DataFrame()
selected = 'falcon'
use_th = 'th_optim'
falcon[selected] = [1 if (lang in languages) and (prob>=llm_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm_test['language'], llm_test[selected])]

In [43]:
print(classification_report(llm_test['label'], falcon['falcon'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.96545   0.91551   0.93982     20238
           1    0.92625   0.97005   0.94765     22140

    accuracy                        0.94400     42378
   macro avg    0.94585   0.94278   0.94373     42378
weighted avg    0.94497   0.94400   0.94391     42378



# 4. LLM2 (MF, ML)

## 4.1 dev set

In [44]:
llm2_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
llm2_dev['language'] = [get_language(text) for text in llm2_dev['text']]

file = "prediction/falcon_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2_dev['falcon'] = temp['probs']

file = "prediction/mistral_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2_dev['mistral'] = temp['probs']

file = "prediction/llama_multi_dev.jsonl"
temp = pd.read_json(file, lines=True)
llm2_dev['llama'] = temp['probs']

### 4.1.1 Optimal classification threshold

In [45]:
languages = ['ru', 'de', 'ar']
llm2_dev_auc_dict = {'falcon': {}, 'mistral': {}, 'llama': {}}

for model in [x for x in llm2_dev.columns.to_list()[6:]]:
    fpr, tpr, thresholds = roc_curve(llm2_dev['label'], llm2_dev[model])
    overall_auc = auc(fpr, tpr)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    llm2_dev_auc_dict[model]['auc'] = overall_auc
    llm2_dev_auc_dict[model]['th_optim'] = optimal_threshold

    for lang in languages:
        lang_df = llm2_dev[llm2_dev['language'] == lang]
        if not lang_df.empty:
            fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df[model])
            lang_auc = auc(fpr, tpr)
            optimal_idx = np.argmax(tpr - fpr)
            optimal_threshold = thresholds[optimal_idx]
            llm2_dev_auc_dict[model][lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

llm2_dev_auc_dict


{'falcon': {'auc': 0.877502625,
  'th_optim': 0.1291503906,
  'ru': {'auc': 0.9594547764227643, 'th_optim': 0.1722412109},
  'de': {'auc': 0.8454059999999999, 'th_optim': 1.0},
  'ar': {'auc': 0.7347630522088353, 'th_optim': 0.0041198730000000005}},
 'mistral': {'auc': 0.9039201250000001,
  'th_optim': 1.0,
  'ru': {'auc': 0.9077169715447154, 'th_optim': 0.5888671875},
  'de': {'auc': 0.966332, 'th_optim': 1.0},
  'ar': {'auc': 0.844726907630522, 'th_optim': 1.0}},
 'llama': {'auc': 0.9026313749999999,
  'th_optim': 1.0,
  'ru': {'auc': 0.8439441056910568, 'th_optim': 1.0},
  'de': {'auc': 0.9373279999999999, 'th_optim': 1.0},
  'ar': {'auc': 0.9579477911646587, 'th_optim': 0.9990234375}}}

### 4.1.2 dev set prediction

In [46]:
llm2 = llm2_dev[['llama', 'mistral']]   # ML
llm2['soft_vote'] = llm2.mean(axis=1)
llm2['language'] = llm2_dev['language']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['soft_vote'] = llm2.mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['language'] = llm2_dev['language']


In [47]:
total_th_optims = [llm2_dev_auc_dict['llama']['th_optim'], llm2_dev_auc_dict['mistral']['th_optim']]
total_average_th_optim = sum(total_th_optims) / len(total_th_optims)

average_th_optim = {'th_optim': total_average_th_optim}

for lang in languages:
    th_optim_llama = llm2_dev_auc_dict['llama'].get(lang, {}).get('th_optim', 0)
    th_optim_mistral = llm2_dev_auc_dict['mistral'].get(lang, {}).get('th_optim', 0)
    average_th_optim[lang] = (th_optim_llama + th_optim_mistral) / 2

In [48]:
# calculate the threshold mean
llm2['prediction'] = [1 if (lang in languages) and (prob>=average_th_optim[lang]) else 1 if (lang not in languages) and (prob>=average_th_optim["th_optim"]) else 0 for lang, prob in zip(llm2['language'], llm2["soft_vote"])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['prediction'] = [1 if (lang in languages) and (prob>=average_th_optim[lang]) else 1 if (lang not in languages) and (prob>=average_th_optim["th_optim"]) else 0 for lang, prob in zip(llm2['language'], llm2["soft_vote"])]


### 4.1.3 Accuracy for LLM2

In [49]:
print(classification_report(llm2_dev['label'], llm2['prediction'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.88355   0.95600   0.91835      2000
           1    0.95207   0.87400   0.91137      2000

    accuracy                        0.91500      4000
   macro avg    0.91781   0.91500   0.91486      4000
weighted avg    0.91781   0.91500   0.91486      4000



## 4.2 test set

In [50]:
llm2_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
llm2_test['language'] = [get_language(text) for text in llm2_test['text']]

file = "prediction/falcon_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2_test['falcon'] = temp['probs']

file = "prediction/mistral_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2_test['mistral'] = temp['probs']

file = "prediction/llama_multi_test.jsonl"
temp = pd.read_json(file, lines=True)
llm2_test['llama'] = temp['probs']

### 4.2.1 Optimal classification threshold

In [51]:
languages = ['en', 'it', 'de', 'ar']
llm2_test_auc_dict = {'falcon': {}, 'mistral': {}, 'llama':{}}

for model in [x for x in llm2_dev.columns.to_list()[6:]]:
    fpr, tpr, thresholds = roc_curve(llm2_test['label'], llm2_test[model])
    overall_auc = auc(fpr, tpr)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    llm2_test_auc_dict[model]['auc'] = overall_auc
    llm2_test_auc_dict[model]['th_optim'] = optimal_threshold

    for lang in languages:
        lang_df = llm2_test[llm2_test['language'] == lang]
        if not lang_df.empty:
            fpr, tpr, thresholds = roc_curve(lang_df['label'], lang_df[model])
            lang_auc = auc(fpr, tpr)
            optimal_idx = np.argmax(tpr - fpr)
            optimal_threshold = thresholds[optimal_idx]
            llm2_test_auc_dict[model][lang] = {'auc': lang_auc, 'th_optim': optimal_threshold}

llm2_test_auc_dict

{'falcon': {'auc': 0.9491783302637191,
  'th_optim': 1.0,
  'en': {'auc': 0.9370427968754189, 'th_optim': 1.0},
  'it': {'auc': 0.9976826838099255, 'th_optim': 1.0},
  'de': {'auc': 0.9662147617460847, 'th_optim': 0.0046272278000000005},
  'ar': {'auc': 0.9901645541391457, 'th_optim': 0.9340820312}},
 'mistral': {'auc': 0.9730138876725591,
  'th_optim': 1.0,
  'en': {'auc': 0.9816146685862734, 'th_optim': 1.0},
  'it': {'auc': 0.9962090330731164, 'th_optim': 1.0},
  'de': {'auc': 0.9685180495390425, 'th_optim': 1.0},
  'ar': {'auc': 0.8002750425617939, 'th_optim': 1.0}},
 'llama': {'auc': 0.9431453061771782,
  'th_optim': 1.0,
  'en': {'auc': 0.9297217894764578, 'th_optim': 1.0},
  'it': {'auc': 0.9954626969591409, 'th_optim': 1.0},
  'de': {'auc': 0.9666838276130177, 'th_optim': 0.4055175781},
  'ar': {'auc': 0.9905688685180518, 'th_optim': 0.99609375}}}

### 4.2.2 test set prediction

In [52]:
llm2 = llm2_test[['llama', 'mistral']]  # ML
llm2['soft_vote'] = llm2.mean(axis=1)
llm2['language'] = llm2_test['language']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['soft_vote'] = llm2.mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['language'] = llm2_test['language']


In [53]:
total_th_optims = [llm2_test_auc_dict['llama']['th_optim'], llm2_test_auc_dict['mistral']['th_optim']]
total_average_th_optim = sum(total_th_optims) / len(total_th_optims)

average_th_optim = {'th_optim': total_average_th_optim}

for lang in languages:
    th_optim_llama = llm2_test_auc_dict['llama'].get(lang, {}).get('th_optim', 0)
    th_optim_mistral = llm2_test_auc_dict['mistral'].get(lang, {}).get('th_optim', 0)
    average_th_optim[lang] = (th_optim_llama + th_optim_mistral) / 2

In [54]:
# calculate the threshold mean
llm2['prediction'] = [1 if (lang in languages) and (prob>=average_th_optim[lang]) else 1 if (lang not in languages) and (prob>=average_th_optim["th_optim"]) else 0 for lang, prob in zip(llm2['language'], llm2["soft_vote"])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llm2['prediction'] = [1 if (lang in languages) and (prob>=average_th_optim[lang]) else 1 if (lang not in languages) and (prob>=average_th_optim["th_optim"]) else 0 for lang, prob in zip(llm2['language'], llm2["soft_vote"])]


### 4.2.3 Accuracy for LLM2

In [55]:
print(classification_report(llm2_test['label'], llm2['prediction'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.97571   0.98058   0.97814     20238
           1    0.98217   0.97769   0.97992     22140

    accuracy                        0.97907     42378
   macro avg    0.97894   0.97913   0.97903     42378
weighted avg    0.97908   0.97907   0.97907     42378



# 5. LLM2S3 (MF-ELL, MF-REB, MF-ELB, ML-ELL, ML-REB, ML-ELB)

 ## 5.1 dev set

In [56]:
llm2s3_dev = pd.read_json("data/subtaskA_dev_multilingual.jsonl", lines=True)
llm2s3_dev['language'] = [get_language(text) for text in llm2s3_dev['text']]

file = "prediction/rank_entropy_ll_logrank_dev_statistic_metric.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_dev[['entropy', 'likelihood', 'log_rank', 'rank']] = temp[['entropy', 'likelihood', 'log_rank', 'rank']]

file = "prediction/binocular_metric_dev.csv"
temp = pd.read_csv(file)
llm2s3_dev[['binocular']] = temp[['binocular']]

file = "prediction/falcon_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_dev['falcon'] = temp['probs']

file = "prediction/mistral_dev_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_dev['mistral'] = temp['probs']

file = "prediction/llama_multi_dev.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_dev['llama'] = temp['probs']

### 5.1.1 Optimal classification threshold

In [57]:
#optimal classification threshold calculations
languages = ['ru', 'de', 'ar']
llm2S3_dev_auc_dict = {}

for model in [x for x in llm2s3_dev.columns.to_list()[6:]]:
    labels = llm2s3_dev['label']
    fpr, tpr, thresholds = roc_curve(labels, llm2s3_dev[model])
    llm2S3_dev_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = llm2s3_dev[llm2s3_dev.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        llm2S3_dev_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

llm2S3_dev_auc_dict

{'entropy': {'auc': 0.54484175,
  'th_optim': 1.805957436561584,
  'ru': {'auc': 0.7248231707317074, 'th_optim': 1.805957436561584},
  'de': {'auc': 0.3933, 'th_optim': 2.955816268920898},
  'ar': {'auc': 0.3155582329317269, 'th_optim': 1.7562482357025142}},
 'likelihood': {'auc': 0.5027961249999999,
  'th_optim': -2.023075819015503,
  'ru': {'auc': 0.34253607723577234, 'th_optim': -2.04640245437622},
  'de': {'auc': 0.74556, 'th_optim': -3.606225967407226},
  'ar': {'auc': 0.8104979919678715, 'th_optim': -2.023049831390381}},
 'log_rank': {'auc': 0.50150075,
  'th_optim': -0.9323371648788451,
  'ru': {'auc': 0.34744207317073167, 'th_optim': -0.9259398579597471},
  'de': {'auc': 0.746356, 'th_optim': -2.008877754211426},
  'ar': {'auc': 0.7871847389558233, 'th_optim': -0.9462776184082031}},
 'rank': {'auc': 0.5227096250000001,
  'th_optim': -194.61175537109375,
  'ru': {'auc': 0.3832794715447154, 'th_optim': -64.93841552734375},
  'de': {'auc': 0.7299720000000001, 'th_optim': -194.6117

### 5.1.2 Dev set predictions

In [58]:
use_th = 'th_optim'

#### Statistical detection

In [59]:
s5 = pd.DataFrame() 
selected = 'entropy'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_dev['language'], llm2s3_dev[selected])]
selected = 'rank'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_dev['language'], llm2s3_dev[selected])]
selected = 'binocular'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_dev['language'], llm2s3_dev[selected])]

#### LLMs

In [60]:
llm2s3 = pd.DataFrame()
selected = 'llama'
llm2s3[selected] = [1 if (lang in languages) and (prob>=llm2S3_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_dev['language'], llm2s3_dev[selected])]
selected = 'mistral'
llm2s3[selected] = [1 if (lang in languages) and (prob>=llm2S3_dev_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_dev_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_dev['language'], llm2s3_dev[selected])]

#### Two-step majority voting

In [61]:
#statistical part majority voting
llm2s3_dev['s3'] = [1 if x+y+z>=2 else 0 for x,y,z in zip(s5['entropy'], s5['rank'], s5['binocular'])]

In [62]:
#final majority voting
llm2s3_dev['llm2s3'] = [1 if x+y+z>=2 else 0 for x,y,z in zip(llm2s3['llama'], llm2s3['mistral'], llm2s3_dev['s3'])]

### 5.1.3 Accuracy for LLM2S3

In [63]:
print(classification_report(llm2s3_dev['label'], llm2s3_dev['llm2s3'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.92857   0.88400   0.90574      2000
           1    0.88931   0.93200   0.91016      2000

    accuracy                        0.90800      4000
   macro avg    0.90894   0.90800   0.90795      4000
weighted avg    0.90894   0.90800   0.90795      4000



## 5.2 test set

In [64]:
llm2s3_test = pd.read_json("data/subtaskA_multilingual.jsonl", lines=True)
llm2s3_test['language'] = [get_language(text) for text in llm2s3_test['text']]

file = "prediction/rank_entropy_ll_logrank_test_statistic_metric.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_test[['entropy', 'likelihood', 'log_rank', 'rank']] = temp[['entropy', 'likelihood', 'log_rank', 'rank']]

file = "prediction/binocular_metric_test.csv"
temp = pd.read_csv(file)
llm2s3_test[['binocular']] = temp[['binocular']]

file = "prediction/falcon_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_test['falcon'] = temp['probs']

file = "prediction/mistral_test_multi.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_test['mistral'] = temp['probs']

file = "prediction/llama_multi_test.jsonl"
temp = pd.read_json(file, lines=True)
llm2s3_test['llama'] = temp['probs']

### 5.2.1 Optimal classification threshold

In [65]:
#optimal classification threshold calculations
languages = ['en', 'it', 'de', 'ar']
llm2S3_test_auc_dict = {}

for model in [x for x in llm2s3_test.columns.to_list()[4:]]:
    labels = llm2s3_test['label']
    fpr, tpr, thresholds = roc_curve(labels, llm2s3_test[model])
    llm2S3_test_auc_dict[model] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)]}

    for test_language in languages:
        filtered = llm2s3_test[llm2s3_test.language == test_language]
        if filtered.empty:
            fpr, tpr, thresholds = np.array([0, 0]), np.array([0, 0]), np.array([0, 0])
        else:
            fpr, tpr, thresholds = roc_curve(filtered['label'], filtered[model])
        llm2S3_test_auc_dict[model][test_language] = {'auc': auc(fpr, tpr), 'th_optim': thresholds[np.argmax(tpr - fpr)] if len(thresholds) > 0 else 0}

llm2S3_test_auc_dict

{'entropy': {'auc': 0.23592029019081243,
  'th_optim': inf,
  'en': {'auc': 0.05007472374873752, 'th_optim': inf},
  'it': {'auc': 0.12014288120423056, 'th_optim': 3.541979074478149},
  'de': {'auc': 0.28796329001443965, 'th_optim': inf},
  'ar': {'auc': 0.14586139813000068, 'th_optim': inf}},
 'likelihood': {'auc': 0.7904263864796635,
  'th_optim': -2.735857248306274,
  'en': {'auc': 0.9652668069951141, 'th_optim': -2.718686580657959},
  'it': {'auc': 0.8302371475957161, 'th_optim': -4.318907737731934},
  'de': {'auc': 0.9068092857936243, 'th_optim': -3.423327445983886},
  'ar': {'auc': 0.936290569684399, 'th_optim': -2.009902715682983}},
 'log_rank': {'auc': 0.7789901203233464,
  'th_optim': -1.307896137237548,
  'en': {'auc': 0.9656900407315608, 'th_optim': -1.307896137237548},
  'it': {'auc': 0.8131632187007596, 'th_optim': -2.570108652114868},
  'de': {'auc': 0.9150423192269243, 'th_optim': -1.914580821990966},
  'ar': {'auc': 0.9222891446121936, 'th_optim': -0.936908066272735}},


### 5.2.2 Test set predictions

In [66]:
use_th = 'th_optim'

#### Statistical detection

In [67]:
s5 = pd.DataFrame() 
selected = 'entropy'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_test['language'], llm2s3_test[selected])]
selected = 'rank'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_test['language'], llm2s3_test[selected])]
selected = 'binocular'
s5[selected] = [1 if (lang in languages) and (prob>=llm2S3_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_test['language'], llm2s3_test[selected])]

#### LLM

In [68]:
llm2s3 = pd.DataFrame()
selected = 'llama'
llm2s3[selected] = [1 if (lang in languages) and (prob>=llm2S3_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_test['language'], llm2s3_test[selected])]
selected = 'mistral'
llm2s3[selected] = [1 if (lang in languages) and (prob>=llm2S3_test_auc_dict[selected][lang][use_th]) else 1 if (lang not in languages) and (prob>=llm2S3_test_auc_dict[selected][use_th]) else 0 for lang, prob in zip(llm2s3_test['language'], llm2s3_test[selected])]

#### Two-step majority voting

In [69]:
#statistical part majority voting
llm2s3_test['s3'] = [1 if x+y+z>=2 else 0 for x,y,z in zip(s5['entropy'], s5['rank'], s5['binocular'])]

In [70]:
#final majority voting
llm2s3_test['llm2s3'] = [1 if x+y+z>=2 else 0 for x,y,z in zip(llm2s3['llama'], llm2s3['mistral'], llm2s3_test['s3'])]

### 5.2.3 Accuracy for LLM2S3

In [71]:
print(classification_report(llm2s3_test['label'], llm2s3_test['llm2s3'], digits=5, output_dict=False))

              precision    recall  f1-score   support

           0    0.98410   0.97534   0.97970     20238
           1    0.97764   0.98559   0.98160     22140

    accuracy                        0.98070     42378
   macro avg    0.98087   0.98047   0.98065     42378
weighted avg    0.98072   0.98070   0.98069     42378

