In [None]:
import os
import json
import numpy as np
from collections import defaultdict
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset #load_dataset from Huggingface
from scipy import stats
from scipy.stats import rankdata, spearmanr, pearsonr
import statsmodels.stats.proportion as smp

In [None]:
# Notebook-wide matplotlib defaults: figures are saved as PDF and all text
# is rendered in the Palatino font family.
plt.rcParams.update({
    "savefig.format": 'pdf',
    'font.family': 'Palatino',
})

In [None]:
# Maps the lower-case English language name used throughout this notebook to
# the language code that appears in the result directory / file names.
LANG_DICT = {
    'afrikaans': 'afr_Latn',
    'english': 'eng_Latn',
    'amharic': 'amh_Ethi',
    'armenian': 'hye_Armn',
    'assamese': 'asm_Beng',
    'basque': 'eus_Latn',
    'bengali': 'ben_Beng',
    'bulgarian': 'bul_Cyrl',
    'burmese': 'mya_Mymr',
    'catalan': 'cat_Latn',
    'central kurdish': 'ckb_Arab',
    'croatian': 'hrv_Latn',
    'dutch': 'nld_Latn',
    'xhosa': 'xho_Latn',
    'macedonian': 'mkd_Cyrl',
    'czech': 'ces_Latn',
    'danish': 'dan_Latn',
    'eastern panjabi': 'pan_Guru',
    'egyptian arabic': 'arz_Arab',
    'estonian': 'est_Latn',
    'finnish': 'fin_Latn',
    'french': 'fra_Latn',
    'georgian': 'kat_Geor',
    'german': 'deu_Latn',
    'greek': 'ell_Grek',
    'gujarati': 'guj_Gujr',
    'hausa': 'hau_Latn',
    'hebrew': 'heb_Hebr',
    'hindi': 'hin_Deva',
    'hungarian': 'hun_Latn',
    'icelandic': 'isl_Latn',
    'indonesian': 'ind_Latn',
    'italian': 'ita_Latn',
    'japanese': 'jpn_Jpan',
    'javanese': 'jav_Latn',
    'kannada': 'kan_Knda',
    'kazakh': 'kaz_Cyrl',
    'khmer': 'khm_Khmr',
    'korean': 'kor_Hang',
    'kyrgyz': 'kir_Cyrl',
    'lao': 'lao_Laoo',
    'lithuanian': 'lit_Latn',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'mesopotamian arabic': 'acm_Arab',
    'modern standard arabic': 'arb_Arab',
    # NOTE(review): 'ary_arab' is the only code whose script tag is lower-case;
    # every other entry capitalises it. The value is kept unchanged because it
    # is interpolated into file paths — confirm against the names on disk.
    'moroccan arabic': 'ary_arab',
    'najdi arabic': 'ars_Arab',
    'nepali': 'npi_Deva',
    'north azerbaijani': 'azj_Latn',
    'north levantine arabic': 'apc_Arab',
    'northern uzbek': 'uzn_Latn',
    'norwegian bokmal': 'nob_Latn',
    'odia': 'ory_Orya',
    'polish': 'pol_Latn',
    'portuguese': 'por_Latn',
    'romanian': 'ron_Latn',
    'russian': 'rus_Cyrl',
    'serbian': 'srp_Cyrl',
    'simplified chinese': 'zho_Hans',
    'sindhi': 'snd_Arab',
    'sinhala': 'sin_Sinh',
    'slovak': 'slk_Latn',
    'slovenian': 'slv_Latn',
    'somali': 'som_Latn',
    'southern pashto': 'pbt_Arab',
    'spanish': 'spa_Latn',
    'standard latvian': 'lvs_Latn',
    'standard malay': 'zsm_Latn',
    'sundanese': 'sun_Latn',
    'swahili': 'swh_Latn',
    'swedish': 'swe_Latn',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'thai': 'tha_Thai',
    'tosk albanian': 'als_Latn',
    'traditional chinese': 'zho_Hant',
    'turkish': 'tur_Latn',
    'ukrainian': 'ukr_Cyrl',
    'urdu': 'urd_Arab',
    'vietnamese': 'vie_Latn',
    'western persian': 'pes_Arab',
}

# All language names in dictionary order, with and without the English pivot.
LANGUAGE = list(LANG_DICT)
LANGUAGE_wo_ENGLISH = [name for name in LANG_DICT if name != 'english']

In [None]:
def get_accuracy_outputs(lang, dataset='belebele', model='Llama3.1'):
    """Load per-sample accuracy and prediction margins for one language.

    Reads the alphabetically first ``.jsonl`` file found in
    ``../../accuracy_outputs/{model}/{dataset}_5shot/{lang_code}/`` (relative
    to the current working directory) and parses one JSON record per line.

    Args:
        lang: Language name; must be a key of ``LANG_DICT``.
        dataset: Dataset name used to build the directory name.
        model: Model name; must be a key of the supported-model table below and
            is also used verbatim as a directory name.

    Returns:
        A tuple ``(score_diff, accuracy)`` of two lists aligned by record.
        ``score_diff[i]`` is the log-probability of the highest-scoring answer
        option minus the best log-probability among the remaining options;
        ``accuracy[i]`` is the record's ``'acc'`` value.

    Raises:
        KeyError: If ``lang`` or ``model`` is unknown.
        FileNotFoundError: If the directory is missing or holds no ``.jsonl`` file.
    """
    model_dict = {'Llama3.1': 'meta-llama__Llama-3.1-8B'}
    lang_code = LANG_DICT[lang]
    # The mapped model id is not part of any path below; the membership check
    # is kept so that an unsupported model still fails fast with a KeyError.
    if model not in model_dict:
        raise KeyError(model)

    accuracy_data_path = f'../../accuracy_outputs/{model}/{dataset}_5shot/{lang_code}/'

    # Sort so the chosen file is deterministic when several .jsonl files exist.
    jsonl_files = sorted(f for f in os.listdir(accuracy_data_path) if f.endswith('.jsonl'))
    if not jsonl_files:
        raise FileNotFoundError(f'No .jsonl file found in {accuracy_data_path}')
    file_path = os.path.join(accuracy_data_path, jsonl_files[0])

    # One JSON record per line; blank lines are skipped.
    with open(file_path, 'r', encoding='utf-8') as f:
        accuracy_results = [json.loads(line) for line in f if line.strip()]

    accuracy = [item['acc'] for item in accuracy_results]

    score_diff = []
    for item in accuracy_results:
        # assumes every entry of 'resps' is nested as [[logprob, ...]] for one
        # answer option — TODO confirm against the evaluation output format
        logprob = [float(option[0][0]) for option in item['resps']]
        model_pred_idx = int(np.argmax(logprob))
        # Runner-up is searched over ALL other options (the previous
        # `range(3)` silently ignored the last option).
        next_best_logprob = max(
            lp for idx, lp in enumerate(logprob) if idx != model_pred_idx
        )
        score_diff.append(logprob[model_pred_idx] - next_best_logprob)
    return score_diff, accuracy


In [None]:
# Per-language Belebele results for Llama3.1, keyed by language name:
# acc_dict[lang] holds the per-sample accuracies and score_diff_dict[lang]
# the per-sample prediction margins returned by get_accuracy_outputs.
acc_dict = defaultdict(dict)
score_diff_dict = defaultdict(dict)

# Placeholder for per-language confusion matrices; nothing fills it in this cell.
# A disabled draft that compared every language against English was removed
# from the loop below: it was a no-op string literal and relied on a
# `compute_confusion_matrix` helper that is not defined in the visible notebook.
conf_matrices = defaultdict(dict)

# LANGUAGE already contains 'english', so one pass loads every language
# exactly once (the former separate English call read the same file twice).
for lang in LANGUAGE:
    score_diff_dict[lang], acc_dict[lang] = get_accuracy_outputs(lang, 'belebele', 'Llama3.1')
    


In [None]:
def plot_DALI(dataset, lang, model, mode):
    """Load the stored alignment scores of one Belebele language.

    Despite its name this function draws nothing: it resolves the JSON file
    that belongs to ``mode`` under ``../../alignment_outputs/{model}/`` and
    returns its parsed content.

    Args:
        dataset: Dataset name; only ``'belebele'`` is supported, because that is
            the only dataset for which a language code is resolved.
        lang: Language name; must be a key of ``LANG_DICT``.
        model: Model directory name.
        mode: One of ``'DALI'``, ``'DALIStrong'``, ``'MEXAFlores'``, ``'MEXATask'``.

    Returns:
        The object decoded from the JSON file.

    Raises:
        ValueError: If ``mode`` or ``dataset`` is not supported (previously an
            UnboundLocalError surfaced further down).
        KeyError: If ``lang`` is not in ``LANG_DICT``.
        FileNotFoundError: If the resolved file does not exist.
    """
    path_templates = {
        'DALI': '../../alignment_outputs/{model}/{dataset}_dali/DALI_{lang_code}_lasttoken.json',
        'DALIStrong': '../../alignment_outputs/{model}/{dataset}_dali_strong/DALI_{lang_code}_lasttoken.json',
        'MEXAFlores': '../../alignment_outputs/{model}/flores_mexa/{lang_code}.json',
        'MEXATask': '../../alignment_outputs/{model}/{dataset}_mexa/{lang_code}.json',
    }
    if mode not in path_templates:
        raise ValueError(
            f'Unknown mode {mode!r}; expected one of {sorted(path_templates)}'
        )
    if dataset != 'belebele':
        raise ValueError(
            f"Unsupported dataset {dataset!r}; plot_DALI only handles 'belebele'"
        )

    lang_code = LANG_DICT[lang]
    DAS_path = path_templates[mode].format(
        model=model, dataset=dataset, lang_code=lang_code
    )
    with open(DAS_path, encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Plot layer-wise alignment scores and collect per-language max / mean (Belebele)
def plot_alignment_by_layers(mode, num_layers=32):
    """Plot the mean alignment score per layer for every non-English Belebele language.

    For each language in ``LANGUAGE_wo_ENGLISH`` the scores are loaded with
    ``plot_DALI('belebele', lang, 'Llama3.1', mode)``, averaged per layer and
    drawn as one line on a new 10x6 figure. The figure is neither shown nor
    saved here.

    Args:
        mode: Score type passed on to ``plot_DALI``. ``'DALI'`` and
            ``'DALIStrong'`` are read as a nested mapping indexed
            ``[sample][layer]``; any other mode as a mapping from layer to a
            collection of scores.
        num_layers: Number of layers (indices ``0 .. num_layers - 1``) to
            average and plot. Defaults to 32, the value that was previously
            hard-coded.

    Returns:
        A tuple ``(accuracies, max_das_scores, mean_das_scores)``.
        ``accuracies`` is always an empty dict (no accuracy is computed here; it
        is kept so existing three-value unpacking keeps working). The other two
        map each language to the maximum / mean of its per-layer averages.

    Raises:
        KeyError: If a requested layer index is missing from the loaded scores.
    """
    accuracies = {}
    max_das_scores = {}
    mean_das_scores = {}

    fig, ax = plt.subplots(figsize=(10, 6))
    plt.rcParams['font.family'] = 'Palatino'

    per_sample_mode = mode in ('DALI', 'DALIStrong')
    layers = range(num_layers)

    for lang in LANGUAGE_wo_ENGLISH:
        lang_DAS = plot_DALI('belebele', lang, 'Llama3.1', mode)

        if per_sample_mode:
            # JSON keys are strings; convert both levels to int before indexing.
            lang_DAS = {
                int(sample): {int(layer): score for layer, score in scores.items()}
                for sample, scores in lang_DAS.items()
            }
            layer_avgs = [
                np.mean([scores[layer] for scores in lang_DAS.values()])
                for layer in layers
            ]
        else:
            lang_DAS = {int(layer): scores for layer, scores in lang_DAS.items()}
            layer_avgs = [np.mean(lang_DAS[layer]) for layer in layers]

        # Summary statistics and the line plot are identical for both layouts.
        max_das_scores[lang] = np.max(layer_avgs)
        mean_das_scores[lang] = np.mean(layer_avgs)
        ax.plot(layers, layer_avgs,
                label=f'{lang.capitalize()}',
                marker='o', markersize=2)

    ax.set_xlabel('Layer')
    ax.tick_params(axis='both', labelsize=14)
    ax.set_ylabel(f'{mode}')
    ax.grid(True, alpha=0.3)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylim(0, 1)
    return accuracies, max_das_scores, mean_das_scores

In [None]:
# Belebele / DALI: draw the per-layer curves and keep the per-language max and mean.
acc, max_dali, mean_dali = plot_alignment_by_layers(mode='DALI')

In [None]:
# Belebele / DALIStrong: draw the per-layer curves and keep the per-language max and mean.
acc, max_dali_strong, mean_dali_strong = plot_alignment_by_layers(mode='DALIStrong')

In [None]:
# Belebele / MEXATask: draw the per-layer curves and keep the per-language max and mean.
acc, max_mexa_t, mean_mexa_t = plot_alignment_by_layers(mode='MEXATask')

In [None]:
# Belebele / MEXAFlores: draw the per-layer curves and keep the per-language max and mean.
acc, max_mexa_flores, mean_mexa_flores = plot_alignment_by_layers(mode='MEXAFlores')

In [None]:
# Belebele summary table: one row per non-English language, one column per
# pooled (max / mean over layers) alignment metric.
data = {
    'Language': LANGUAGE_wo_ENGLISH,
    **{
        column: [scores[name] for name in LANGUAGE_wo_ENGLISH]
        for column, scores in (
            ('Max_DALI', max_dali),
            ('Mean_DALI', mean_dali),
            ('Max_DALI_Strong', max_dali_strong),
            ('Mean_DALI_Strong', mean_dali_strong),
            ('Max_MEXA_T', max_mexa_t),
            ('Mean_MEXA_T', mean_mexa_t),
            ('Max_MEXA_Flores', max_mexa_flores),
            ('Mean_MEXA_Flores', mean_mexa_flores),
        )
    },
}

df_metrics = pd.DataFrame(data)

In [None]:
# Write the Belebele summary table to an Excel workbook (path is relative to
# the working directory; the row index is not exported).
df_metrics.to_excel(
    excel_writer="../../../../Images_DALI/belebele_pooled_alignment.xlsx",
    index=False,
)

In [None]:
def plot_DALI_xstorycloze(dataset, lang, model, mode):
    """Load the stored alignment scores of one XStoryCloze language.

    Despite its name this function draws nothing: it resolves the JSON file
    that belongs to ``mode`` under ``../../alignment_outputs/{model}/`` and
    returns its parsed content.

    Args:
        dataset: Dataset name used in the directory name.
        lang: Plain language name (e.g. ``'hindi'``), used verbatim in the file
            name except for ``'MEXAFlores'``, where it is mapped to a FLORES code.
        model: Model directory name.
        mode: One of ``'DALI'``, ``'DALIStrong'``, ``'MEXAFlores'``, ``'MEXATask'``.

    Returns:
        The object decoded from the JSON file.

    Raises:
        ValueError: If ``mode`` is not one of the supported values (previously
            an UnboundLocalError surfaced at ``open``).
        KeyError: If ``mode`` is ``'MEXAFlores'`` and ``lang`` has no FLORES code.
        FileNotFoundError: If the resolved file does not exist.
    """
    flores_dict = {
        'arabic': 'arb_Arab',
        'spanish': 'spa_Latn',
        'basque': 'eus_Latn',
        'hindi': 'hin_Deva',
        'indonesian': 'ind_Latn',
        'burmese': 'mya_Mymr',
        'russian': 'rus_Cyrl',
        'telugu': 'tel_Telu',
        'chinese': 'zho_Hans',
        'swahili': 'swh_Latn',
    }
    base = f'../../alignment_outputs/{model}'

    if mode == 'DALI':
        # NOTE(review): this is the only alignment path in the visible notebook
        # with a 'BAS_' prefix (the others use 'DALI_'); kept unchanged —
        # confirm it matches the file names on disk.
        DAS_path = f'{base}/{dataset}_dali/BAS_{lang}_lasttoken.json'
    elif mode == 'DALIStrong':
        DAS_path = f'{base}/{dataset}_dali_strong/DALI_{lang}_lasttoken.json'
    elif mode == 'MEXAFlores':
        lang_code = flores_dict[lang]
        DAS_path = f'{base}/flores_mexa/{lang_code}.json'
    elif mode == 'MEXATask':
        DAS_path = f'{base}/{dataset}_mexa/{lang}.json'
    else:
        raise ValueError(
            f"Unknown mode {mode!r}; expected 'DALI', 'DALIStrong', "
            "'MEXAFlores' or 'MEXATask'"
        )

    with open(DAS_path, encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Plot layer-wise alignment scores and collect per-language max / mean (XStoryCloze)
def plot_alignment_by_layers_xstorycloze(mode, num_layers=32):
    """Plot the mean alignment score per layer for the ten XStoryCloze languages.

    For each language the scores are loaded with
    ``plot_DALI_xstorycloze('xstorycloze', lang, 'Llama3.1', mode)``, averaged
    per layer and drawn as one line on a new 10x6 figure. The figure is neither
    shown nor saved here.

    Args:
        mode: Score type passed on to ``plot_DALI_xstorycloze``. ``'DALI'`` and
            ``'DALIStrong'`` are read as a nested mapping indexed
            ``[sample][layer]``; any other mode as a mapping from layer to a
            collection of scores.
        num_layers: Number of layers (indices ``0 .. num_layers - 1``) to
            average and plot. Defaults to 32, the value that was previously
            hard-coded.

    Returns:
        A tuple ``(max_das_scores, mean_das_scores)`` mapping each language to
        the maximum / mean of its per-layer averages.

    Raises:
        KeyError: If a requested layer index is missing from the loaded scores.
    """
    max_das_scores = {}
    mean_das_scores = {}

    fig, ax = plt.subplots(figsize=(10, 6))
    plt.rcParams['font.family'] = 'Palatino'

    lang_list = ['arabic', 'spanish', 'basque', 'hindi', 'indonesian',
                 'burmese', 'russian', 'telugu', 'chinese', 'swahili']
    per_sample_mode = mode in ('DALI', 'DALIStrong')
    layers = range(num_layers)

    for lang in lang_list:
        lang_DAS = plot_DALI_xstorycloze('xstorycloze', lang, 'Llama3.1', mode)

        if per_sample_mode:
            # JSON keys are strings; convert both levels to int before indexing.
            lang_DAS = {
                int(sample): {int(layer): score for layer, score in scores.items()}
                for sample, scores in lang_DAS.items()
            }
            layer_avgs = [
                np.mean([scores[layer] for scores in lang_DAS.values()])
                for layer in layers
            ]
        else:
            lang_DAS = {int(layer): scores for layer, scores in lang_DAS.items()}
            layer_avgs = [np.mean(lang_DAS[layer]) for layer in layers]

        # Summary statistics and the line plot are identical for both layouts.
        max_das_scores[lang] = np.max(layer_avgs)
        mean_das_scores[lang] = np.mean(layer_avgs)
        ax.plot(layers, layer_avgs,
                label=f'{lang.capitalize()}',
                marker='o', markersize=2)

    ax.set_xlabel('Layer')
    ax.tick_params(axis='both', labelsize=14)
    ax.set_ylabel(f'{mode}')
    ax.grid(True, alpha=0.3)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylim(0, 1)
    return max_das_scores, mean_das_scores

In [None]:
# XStoryCloze: one figure per score type; keep the per-language max and mean of each.
max_dali_story, mean_dali_story = plot_alignment_by_layers_xstorycloze(mode='DALI')
max_dalistrong_story, mean_dalistrong_story = plot_alignment_by_layers_xstorycloze(mode='DALIStrong')
max_mexa_story, mean_mexa_story = plot_alignment_by_layers_xstorycloze(mode='MEXATask')


In [None]:
# XStoryCloze summary table: one row per language, one column per pooled
# (max / mean over layers) alignment metric.
lang_list = ['arabic', 'spanish', 'basque', 'hindi', 'indonesian', 'burmese', 'russian', 'telugu', 'chinese', 'swahili']

data = {
    'Language': lang_list,
    **{
        column: [scores[name] for name in lang_list]
        for column, scores in (
            ('Max_DALI', max_dali_story),
            ('Mean_DALI', mean_dali_story),
            ('Max_DALI_Strong', max_dalistrong_story),
            ('Mean_DALI_Strong', mean_dalistrong_story),
            ('Max_MEXA_T', max_mexa_story),
            ('Mean_MEXA_T', mean_mexa_story),
        )
    },
}

df_metrics = pd.DataFrame(data)

In [None]:
# Write the XStoryCloze summary table to an Excel workbook (path is relative
# to the working directory; the row index is not exported).
df_metrics.to_excel(
    excel_writer="../../../../Images_DALI/xstorycloze_pooled_alignment.xlsx",
    index=False,
)

In [None]:
def plot_DALI_xcopa(dataset, lang, model, mode):
    """Load the stored alignment scores of one XCOPA language.

    Despite its name this function draws nothing: it resolves the JSON file
    that belongs to ``mode`` under ``../../alignment_outputs/{model}/`` and
    returns its parsed content.

    Args:
        dataset: Dataset name used in the directory name.
        lang: Plain language name (e.g. ``'tamil'``), used verbatim in the file
            name except for ``'MEXAFlores'``, where it is mapped to a FLORES code.
        model: Model directory name.
        mode: One of ``'DALI'``, ``'DALIStrong'``, ``'MEXAFlores'``, ``'MEXATask'``.

    Returns:
        The object decoded from the JSON file.

    Raises:
        ValueError: If ``mode`` is not one of the supported values (previously
            an UnboundLocalError surfaced at ``open``).
        KeyError: If ``mode`` is ``'MEXAFlores'`` and ``lang`` has no FLORES code.
        FileNotFoundError: If the resolved file does not exist.
    """
    flores_dict = {
        'chinese': 'zho_Hans',
        'indonesian': 'ind_Latn',
        'italian': 'ita_Latn',
        'swahili': 'swh_Latn',
        'tamil': 'tam_Taml',
        'thai': 'tha_Thai',
        'turkish': 'tur_Latn',
        'vietnamese': 'vie_Latn',
    }
    base = f'../../alignment_outputs/{model}'

    if mode == 'DALI':
        DAS_path = f'{base}/{dataset}_dali/DALI_{lang}_lasttoken.json'
    elif mode == 'DALIStrong':
        DAS_path = f'{base}/{dataset}_dali_strong/DALI_{lang}_lasttoken.json'
    elif mode == 'MEXAFlores':
        lang_code = flores_dict[lang]
        DAS_path = f'{base}/flores_mexa/{lang_code}.json'
    elif mode == 'MEXATask':
        DAS_path = f'{base}/{dataset}_mexa/{lang}.json'
    else:
        raise ValueError(
            f"Unknown mode {mode!r}; expected 'DALI', 'DALIStrong', "
            "'MEXAFlores' or 'MEXATask'"
        )

    with open(DAS_path, encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Plot layer-wise alignment scores and collect per-language max / mean (XCOPA)
def plot_alignment_by_layers_xcopa(mode, num_layers=32):
    """Plot the mean alignment score per layer for the eight XCOPA languages.

    For each language the scores are loaded with
    ``plot_DALI_xcopa('xcopa', lang, 'Llama3.1', mode)``, averaged per layer and
    drawn as one line on a new 10x6 figure. The figure is neither shown nor
    saved here.

    Args:
        mode: Score type passed on to ``plot_DALI_xcopa``. ``'DALI'`` and
            ``'DALIStrong'`` are read as a nested mapping indexed
            ``[sample][layer]``; any other mode as a mapping from layer to a
            collection of scores.
        num_layers: Number of layers (indices ``0 .. num_layers - 1``) to
            average and plot. Defaults to 32, the value that was previously
            hard-coded.

    Returns:
        A tuple ``(max_das_scores, mean_das_scores)`` mapping each language to
        the maximum / mean of its per-layer averages.

    Raises:
        KeyError: If a requested layer index is missing from the loaded scores.
    """
    max_das_scores = {}
    mean_das_scores = {}

    fig, ax = plt.subplots(figsize=(10, 6))
    plt.rcParams['font.family'] = 'Palatino'

    lang_list = ['chinese', 'indonesian', 'italian', 'swahili',
                 'tamil', 'thai', 'turkish', 'vietnamese']
    per_sample_mode = mode in ('DALI', 'DALIStrong')
    layers = range(num_layers)

    for lang in lang_list:
        lang_DAS = plot_DALI_xcopa('xcopa', lang, 'Llama3.1', mode)

        if per_sample_mode:
            # JSON keys are strings; convert both levels to int before indexing.
            lang_DAS = {
                int(sample): {int(layer): score for layer, score in scores.items()}
                for sample, scores in lang_DAS.items()
            }
            layer_avgs = [
                np.mean([scores[layer] for scores in lang_DAS.values()])
                for layer in layers
            ]
        else:
            lang_DAS = {int(layer): scores for layer, scores in lang_DAS.items()}
            layer_avgs = [np.mean(lang_DAS[layer]) for layer in layers]

        # Summary statistics and the line plot are identical for both layouts.
        max_das_scores[lang] = np.max(layer_avgs)
        mean_das_scores[lang] = np.mean(layer_avgs)
        ax.plot(layers, layer_avgs,
                label=f'{lang.capitalize()}',
                marker='o', markersize=2)

    ax.set_xlabel('Layer')
    ax.tick_params(axis='both', labelsize=14)
    ax.set_ylabel(f'{mode}')
    ax.grid(True, alpha=0.3)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylim(0, 1)
    return max_das_scores, mean_das_scores

In [None]:
# XCOPA: one figure per score type; keep the per-language max and mean of each.
max_dali_copa, mean_dali_copa = plot_alignment_by_layers_xcopa(mode='DALI')
max_dalistrong_copa, mean_dalistrong_copa = plot_alignment_by_layers_xcopa(mode='DALIStrong')
max_mexa_copa, mean_mexa_copa = plot_alignment_by_layers_xcopa(mode='MEXATask')

In [None]:
# XCOPA summary table: one row per language, one column per pooled
# (max / mean over layers) alignment metric.
lang_list = ['chinese', 'indonesian', 'italian', 'swahili', 'tamil', 'thai', 'turkish', 'vietnamese']

data = {
    'Language': lang_list,
    **{
        column: [scores[name] for name in lang_list]
        for column, scores in (
            ('Max_DALI', max_dali_copa),
            ('Mean_DALI', mean_dali_copa),
            ('Max_DALI_Strong', max_dalistrong_copa),
            ('Mean_DALI_Strong', mean_dalistrong_copa),
            ('Max_MEXA_T', max_mexa_copa),
            ('Mean_MEXA_T', mean_mexa_copa),
        )
    },
}

df_metrics = pd.DataFrame(data)

In [None]:
# Write the XCOPA summary table to an Excel workbook (path is relative to the
# working directory; the row index is not exported).
df_metrics.to_excel(
    excel_writer="../../../../Images_DALI/xcopa_pooled_alignment.xlsx",
    index=False,
)