In [1]:
from datasets import load_dataset

# Load the 'train' split of the Kurosawama/EVALUATION_trans_Llama-3.1-8B dataset
# through `datasets.load_dataset`.
llama1 = load_dataset('Kurosawama/EVALUATION_trans_Llama-3.1-8B', split = 'train')
# Bare expression: the notebook displays the dataset summary (features, num_rows).
llama1

Dataset({
    features: ['FOLIO', "meta-llama/Llama-3.1-8B's Answer"],
    num_rows: 203
})

In [2]:
import re
# Scratch samples taken from row 0: the 'FOLIO' column entry and the
# Llama-3.1-8B answer stored in the same row.
aux_gs = llama1['FOLIO'][0]
aux_llm = llama1['meta-llama/Llama-3.1-8B\'s Answer'][0]
# Segment that follows the first 'Premises:' marker (up to the next marker, if any).
# Raises IndexError when the answer contains no 'Premises:' marker.
# NOTE(review): aux_llm1 is not used by the cells visible here; the later
# compare_answers call passes the unsplit aux_llm - confirm which one is intended.
aux_llm1 = re.split('Premises:', aux_llm)[1]

In [3]:
def premise_per_answer(ds_answer, llm):
    """
    Extract the predicate applications (e.g. ``Loves(x, y)``) found in one answer.

    Parameters
    ----------
    ds_answer : str
        One element of the dataset (``ds[column][row]``); it is split on
        newlines and every resulting line is scanned.
    llm : bool
        True when the text comes from an LLM. In that case, on every line the
        ``:::`` marker plus the run of letters, spaces, underscores and dots
        that follows it is removed, and runs of double spaces are deleted,
        before predicates are extracted.

    Returns
    -------
    tuple
        ``(function_list, line_count, appearance_count, unique_count)`` where

        * ``function_list`` is the list of matched predicate applications in
          order of appearance, duplicates included;
        * ``line_count`` is the number of lines in the answer (every line
          counts, including blank ones);
        * ``appearance_count`` is ``len(function_list)``;
        * ``unique_count`` is the number of distinct entries in
          ``function_list``.
    """
    # Letters are spelled out as A-Za-z (plus underscore). The range `A-z`
    # spans ASCII 65-122 and therefore also matches `[ \ ] ^` and the backtick,
    # which made e.g. "^B(y)" count as a predicate named "^B".
    comment_pattern = re.compile(r'(:::)+([ A-Za-z_.]+)')
    double_space_pattern = re.compile(r'(  )+')
    # name(arg) or name(arg, arg ...): further arguments must be preceded by a
    # space, optionally with a comma before it.
    function_pattern = re.compile(r'[A-Za-z_]+\(([A-Za-z_]+(,? [A-Za-z_]+)*)\)')

    instance = ds_answer.split('\n')
    if llm:
        # Drop the ':::' annotations first, then the leftover double spaces.
        instance = [
            double_space_pattern.sub('', comment_pattern.sub('', line))
            for line in instance
        ]

    function_list = [
        match.group()
        for line in instance
        for match in function_pattern.finditer(line)
    ]
    function_set = set(function_list)
    return function_list, len(instance), len(function_list), len(function_set)

In [12]:
def compare_answers(gs, llm):
    """
    Compare a gold-standard answer with an LLM answer and score the pair.

    Both inputs are single answer strings taken from the dataset. Each one is
    parsed with ``premise_per_answer`` (``gs`` as gold standard, ``llm`` as
    LLM output).

    The coefficient is the sum of:

    * the intersection-over-union of the two sets of predicate applications
      (defined as 0.0 when neither answer contains any predicate, so the
      division cannot fail);
    * the absolute difference in line counts;
    * the absolute difference in predicate appearances;
    * the absolute difference in distinct predicate counts.

    NOTE(review): the IoU grows with similarity while the three differences
    grow with dissimilarity, so the sum mixes both directions - confirm this
    is the intended metric.

    The coefficient, rounded to 3 decimals, is printed and returned.
    """
    # Parse both answers into (predicates, line count, appearances, distinct count).
    gs_premises, gs_prem_count, gs_function_appearances, gs_total_functions = premise_per_answer(gs, False)
    llm_premises, llm_prem_count, llm_function_appearances, llm_total_functions = premise_per_answer(llm, True)

    # Set operations
    gs_functions = set(gs_premises)
    llm_functions = set(llm_premises)
    union_of_premises = len(gs_functions | llm_functions)
    intersection_of_premises = len(gs_functions & llm_functions)
    # An empty union means no predicate was found on either side.
    iou = intersection_of_premises / union_of_premises if union_of_premises else 0.0

    # Absolute differences between the per-answer counts
    prem_dif = abs(gs_prem_count - llm_prem_count)
    func_apps_dif = abs(gs_function_appearances - llm_function_appearances)
    total_func_dif = abs(gs_total_functions - llm_total_functions)

    logic_coefficient = round(iou + prem_dif + func_apps_dif + total_func_dif, 3)
    print(logic_coefficient)
    return logic_coefficient
    
# Smoke test on the row-0 samples defined in an earlier cell (aux_gs / aux_llm).
# Note that the full LLM answer (aux_llm) is passed, not the split aux_llm1.
compare_answers(aux_gs, aux_llm)

29.0


In [13]:
def full_coefficient(dataset, column_name):
    """
    Run ``compare_answers`` on every row of ``dataset``.

    The 'FOLIO' column supplies the gold-standard answer and ``column_name``
    selects the column holding the LLM answer for the same row. Nothing is
    returned; the per-row output comes from ``compare_answers`` itself.
    """
    gold_answers = dataset['FOLIO']
    llm_answers = dataset[column_name]
    for row, gold_answer in enumerate(gold_answers):
        compare_answers(gold_answer, llm_answers[row])

In [14]:
# Score every row of the dataset against its Llama-3.1-8B answer column.
full_coefficient(llama1, 'meta-llama/Llama-3.1-8B\'s Answer')

29.0
26.0
24.037
11.1176
10.1
11.1875
25.2609
17.0
20.2941
16.0
27.0741
17.08
21.1333
14.0769
14.0
48.0
15.2143
17.5
18.5833
16.2857
18.2857
24.0833
17.0833
22.1053
28.05
12.05
25.0667
11.3333
13.2667
17.2857
26.2
26.2
26.3333
43.0
33.0588
25.0
29.0526
36.125
16.3
14.2353
22.3077
17.2941
10.1538
7.1333
10.1538
45.0
36.0435
33.0
15.0
19.0
20.0
20.0588
21.0
19.087
21.1667
17.25
21.0909
6.2143
17.0
14.2273
12.2308
15.2143
31.3636
34.3636
28.0
20.5833
13.0
24.5
27.0
27.0
16.1667
30.0
13.0
10.2
14.0
22.0
16.0
16.0
18.25
24.4545
19.3636
19.3333
20.2143
15.0
18.0
14.0
16.0
23.0
18.0769
26.0769
20.0
29.0
11.0667
36.0833
21.0385
20.4545
22.0
21.75
21.4
29.4167
17.4
12.0
14.0
13.0
23.0
18.0
16.0
15.1
16.1053
12.0455
23.2
10.1053
16.1
13.0526
15.0909
17.15
20.1667
19.2
11.0
17.0
41.5
21.2727
24.1818
14.0
16.2381
17.35
17.2273
12.0909
15.0952
14.0909
14.0909
17.0
38.0714
38.1
20.2727
37.0
22.2308
21.0
17.0
19.05
19.0526
12.087
16.0357
17.1364
14.1053
14.0
11.0
11.0
14.1111
25.0
32.0435
17.0588
16.