In [17]:
import os
import json
import pandas as pd
from bert_score import score

def _load_json_frames(folder_path):
    """Read every ``*.json`` file directly inside ``folder_path`` into one DataFrame.

    Each file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``; the per-file frames are concatenated with a fresh
    0..n-1 index.

    Raises:
        ValueError: if no file could be loaded.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting frame reproducible across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            frames.append(pd.json_normalize(data))
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and keep going.
            print(f"Error reading {file_path}: {e}")

    if not frames:
        raise ValueError("No valid JSON files found in the specified folder.")

    return pd.concat(frames, ignore_index=True)


def compute_bert(
    folder_path,
    convert,
    model_type="bert-base-multilingual-uncased",
    device=3,
    batch_size=32,
    num_layers=12,
):
    """Score model outputs stored as JSON files with BERTScore.

    Loads all ``*.json`` files in ``folder_path``, scores the ``"response"``
    column (candidates) against the ``"output"`` column (references), prints
    the mean precision / recall / F1, and returns the records with a per-row
    ``"f1"`` column added.

    Args:
        folder_path: Directory containing the JSON result files.
        convert: If true, the records are first remapped: ``"output"`` is
            copied into ``"response"`` and ``"label"`` into ``"output"``.
            The frame must then contain ``"output"`` and ``"label"``;
            otherwise it must contain ``"response"`` and ``"output"``.
        model_type: Forwarded to ``bert_score.score`` as ``model_type``.
        device: Forwarded to ``bert_score.score`` as ``device``. The default
            ``3`` is the value that used to be hard-coded here (presumably a
            CUDA device index - confirm it exists on your machine); pass
            e.g. ``"cpu"`` to run elsewhere.
        batch_size: Forwarded to ``bert_score.score`` as ``batch_size``.
        num_layers: Forwarded to ``bert_score.score`` as ``num_layers``.

    Returns:
        pandas.DataFrame: the concatenated records plus an ``"f1"`` column.

    Raises:
        ValueError: if no JSON file in ``folder_path`` could be loaded.
        KeyError: if a column required for scoring is missing.
    """
    df = _load_json_frames(folder_path)

    # Fail early with a message naming the missing columns instead of a bare
    # KeyError raised from the middle of the column shuffling below.
    required = ("output", "label") if convert else ("response", "output")
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing} in the JSON records under {folder_path}"
        )

    if convert:
        df["response"] = df["output"]
        df["output"] = df["label"]

    predictions = df["response"].to_list()
    references = df["output"].to_list()

    precision, recall, f1 = score(
        cands=predictions,
        refs=references,
        batch_size=batch_size,
        model_type=model_type,
        device=device,
        num_layers=num_layers,
    )

    # Report corpus-level means, formatted to four decimals.
    precision_ = f"{precision.mean():.4f}"
    recall_ = f"{recall.mean():.4f}"
    f1_ = f"{f1.mean():.4f}"
    print(f"precision: {precision_}\nrecall: {recall_}\nf1: {f1_}")

    # Keep the per-row F1 so callers can run paired comparisons on it.
    df["f1"] = f1
    return df


In [71]:
# Score the result files under ".../results/llama-3.2-ft". convert=False, so
# compute_bert scores the "response" column against "output" as stored.
# NOTE(review): absolute, user-specific path - this cell only runs where that directory exists.
ft_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/Arabic-LJP/results/llama-3.2-ft",convert = False)

precision: 0.7649
recall: 0.7278
f1: 0.7401


In [72]:
# Score the result files under ".../Llama-3.2-3b-instruct/LJP". convert=True, so
# compute_bert first copies "output" into "response" and "label" into "output".
# NOTE(review): absolute, user-specific path - this cell only runs where that directory exists.
base_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/results/Llama-3.2-3b-instruct/LJP", convert = True)

precision: 0.5262
recall: 0.5690
f1: 0.5439


# Statistical Significance

## Llama-3.2

In [92]:
import pandas as pd
from scipy.stats import wilcoxon

def compare_systems(base_df, tuned_df, alpha=0.05):
    """Test whether ``tuned_df`` scores higher than ``base_df`` per instruction.

    Both frames are reduced to one mean ``"f1"`` per distinct ``"Instruction"``
    value, paired on ``"Instruction"``, and compared with a one-sided Wilcoxon
    signed-rank test (``alternative='greater'`` on tuned minus base). The
    paired table, the test statistic, the p-value and a verdict are printed.

    Args:
        base_df: DataFrame with ``"Instruction"`` and ``"f1"`` columns for the
            reference system.
        tuned_df: DataFrame with the same columns for the system expected to
            score higher.
        alpha: Significance level used for the printed verdict.

    Returns:
        None. All results are printed.

    Raises:
        KeyError: if either frame lacks ``"Instruction"`` or ``"f1"``.
        ValueError: if no instruction has a score in both frames.
    """
    for arg_name, frame in (("base_df", base_df), ("tuned_df", tuned_df)):
        missing = sorted({"Instruction", "f1"} - set(frame.columns))
        if missing:
            raise KeyError(f"{arg_name} is missing required column(s): {missing}")

    # Group by "Instruction" and calculate the mean F1 scores for both systems
    old_system = base_df.groupby("Instruction", as_index=False)["f1"].mean()
    new_system = tuned_df.groupby("Instruction", as_index=False)["f1"].mean()

    # Outer merge so the printed table also shows instructions present in only
    # one system (as NaN); f1_x is the base score, f1_y the tuned score.
    merged = pd.merge(old_system, new_system, on="Instruction", how="outer")
    print(merged)

    # Only instructions scored by both systems can form a pair.
    merged = merged.dropna(subset=["f1_x", "f1_y"])
    if merged.empty:
        raise ValueError(
            "No instruction has an F1 score in both systems; nothing to compare."
        )

    old_system_scores = merged["f1_x"]
    new_system_scores = merged["f1_y"]

    # One-sided test: are the (new - old) differences shifted above zero?
    wilcoxon_statistic, wilcoxon_p_value = wilcoxon(
        new_system_scores, old_system_scores, alternative='greater'
    )

    print(f"Wilcoxon test statistic: {wilcoxon_statistic}")
    print(f"P-value: {wilcoxon_p_value}")
    # Four significant digits rather than four decimals: with ".4f" a very
    # small p-value was reported as the misleading "p-value = 0.0000".
    if wilcoxon_p_value < alpha:
        print(f"The result is statistically significant (p-value = {wilcoxon_p_value:.4g}).")
    else:
        print(f"The result is not statistically significant (p-value = {wilcoxon_p_value:.4g}).")


In [74]:
# Argument order is (base_df, tuned_df): the one-sided test asks whether the
# second frame's per-instruction F1 is higher than the first's.
# Relies on base_llama / ft_llama as left by the most recently run scoring cell.
compare_systems(base_llama, ft_llama)


Wilcoxon test statistic: 2850.0
P-value: 2.6401864586600792e-14
The result is statistically significant (p-value = 0.0000).


## Llama-3.1

In [75]:
# Score the ".../llama-3.1-ft" results (columns used as stored) and the
# ".../Meta-Llama-3.1-8B-Instruct/LJP" results (convert=True remaps
# "output" -> "response" and "label" -> "output" before scoring).
# NOTE(review): this rebinds ft_llama / base_llama, which earlier cells assigned
# for Llama-3.2; later cells see whichever pair was computed last.
ft_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/Arabic-LJP/results/llama-3.1-ft",convert = False)
base_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/results/Meta-Llama-3.1-8B-Instruct/LJP", convert = True)

precision: 0.7802
recall: 0.7416
f1: 0.7550
precision: 0.5481
recall: 0.6189
f1: 0.5795


In [93]:
# Same call as in the Llama-3.2 section; which models are compared depends on
# the current bindings of base_llama / ft_llama (base first, tuned second).
compare_systems(base_llama, ft_llama)

                                          Instruction      f1_x      f1_y
0   إذا كانت هذه هي الأسباب وهذه هي الوقائع، ما نص...  0.575977  0.758273
1   ابدأ بتحليل الأسباب، ثم اكتب نص الحكم بناءً عل...  0.582247  0.775244
2   استخدم الأسباب لتحليل الوقائع وصياغة نص الحكم ...  0.588537  0.778784
3   استخدم الأسباب لتحليل الوقائع وصياغة نص الحكم ...  0.568490  0.733270
4   استخدم التحليل المنطقي والقانوني للأسباب لصياغ...  0.574055  0.753575
..                                                ...       ...       ...
70  ما هو نص الحكم الذي يمكن استنتاجه من الوقائع و...  0.581248  0.777551
71  ما هو نص الحكم العادل الذي يجب إصداره وفقًا لل...  0.605633  0.744589
72             ما هو نص الحكم المتوقع من هذه الوقائع؟  0.571130  0.696606
73  ما هو نص الحكم المناسب الذي يمكن استنباطه من ا...  0.562017  0.747094
74  ولد نص الحكم النهائي باستخدام التحليل القانوني...  0.610523  0.801367

[75 rows x 3 columns]
Wilcoxon test statistic: 2850.0
P-value: 2.6401864586600792e-14
The result is statistical

## Both Llama versions

In [82]:
# Score both result folders with convert=False (columns used as stored):
# large_llama <- ".../llama-3.1-ft", small_llama <- ".../llama-3.2-ft".
# NOTE(review): these repeat the scoring already done for the same folders in
# earlier cells; each call reloads the files and reruns BERTScore.
large_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/Arabic-LJP/results/llama-3.1-ft",convert = False)
small_llama = compute_bert(folder_path = "/export/home/mohamedbayan/Bayan/LJP/Arabic-LJP/results/llama-3.2-ft", convert = False)

precision: 0.7802
recall: 0.7416
f1: 0.7550
precision: 0.7649
recall: 0.7278
f1: 0.7401


In [84]:
# small_llama is passed as base_df and large_llama as tuned_df, so the
# one-sided test asks whether large_llama's per-instruction F1 is higher.
compare_systems(small_llama, large_llama)

Wilcoxon test statistic: 2262.0
P-value: 4.9390190229004445e-06
The result is statistically significant (p-value = 0.0000).
