In [1]:
from datasets import load_dataset
import re

In [43]:
class Evaluation:
	"""
	Evaluates model answers stored in a HuggingFace dataset against the
	reference answers held in the same dataset's 'FOLIO' column.

	The requested split is loaded once in the constructor and kept in `self.ds`.
	"""

	# Removes a ':::' marker together with the text that follows it on a line.
	# NOTE(review): '[A-z]' also spans the ASCII punctuation between 'Z' and 'a';
	# it is left untouched here because it only widens what gets stripped.
	_COMMENT_RE = re.compile('(:::)+([ A-z.]+)')
	# Removes runs of two-space pairs (a single space is left alone).
	_DOUBLE_SPACE_RE = re.compile('(  )+')
	# One predicate application such as `Likes(x, y)`.
	# The character class is spelled out as [A-Za-z_] instead of [A-z] so that
	# '[', '\\', ']', '^' and '`' can no longer be swallowed into a name
	# (e.g. 'A(x)^B(x)' used to produce the premise '^B(x)').
	# NOTE(review): arguments must be separated by a space (", " or " "), so
	# 'P(x,y)' is not matched - confirm this is intended.
	_PREMISE_RE = re.compile(r'[A-Za-z_]+\(([A-Za-z_]+(,? [A-Za-z_]+)*)\)')

	def __init__(self, dataset, split):
		"""
		dataset = str ; Name of the dataset on HuggingFace.
		split = str ; Split to work with.
		"""
		self.ds = load_dataset(dataset, split=split)

	def premises_per_answer(self, ds_answer, llm):
		"""
		Split an answer into lines and extract the predicate applications
		needed to compute LogicSim.

		ds_answer = str ; Raw answer text, lines separated by '\\n'.
		llm = bool ; True when the answer was generated by an LLM; each line is
			then cleaned (':::' annotations and double spaces removed) first.

		Returns a 4-tuple:
			- list with every match, in order of appearance, duplicates kept
			- number of lines in the answer
			- number of matches
			- number of distinct matches
		"""
		lines = ds_answer.split('\n')
		if llm:
			lines = [
				self._DOUBLE_SPACE_RE.sub('', self._COMMENT_RE.sub('', line))
				for line in lines
			]

		function_list = [
			match.group()
			for line in lines
			for match in self._PREMISE_RE.finditer(line)
		]

		return function_list, len(lines), len(function_list), len(set(function_list))

	def compare_answers(self, folio, llm):
		"""
		Compute LogicSim(x, y) for one pair of entries of the same dataset row.

		folio = str ; Reference answer (parsed without LLM cleaning).
		llm = str ; Model answer (parsed with LLM cleaning).

		Returns the score rounded to two decimals: the intersection-over-union
		of the extracted premises plus the absolute differences in line count,
		match count and distinct-match count.
		NOTE(review): IoU grows with similarity while the three differences grow
		with dissimilarity; confirm that summing them is the intended definition.
		"""
		gs_prem, gs_prem_count, gs_funcs_apps, gs_total_funcs = self.premises_per_answer(folio, False)
		llm_prem, llm_prem_count, llm_funcs_apps, llm_total_funcs = self.premises_per_answer(llm, True)

		# Set operations
		gs_set, llm_set = set(gs_prem), set(llm_prem)
		union_prem = len(gs_set | llm_set)
		intersection_prem = len(gs_set & llm_set)
		# When neither answer yields a premise the union is empty; treat the
		# overlap as 0 instead of dividing by zero.
		iou = intersection_prem / union_prem if union_prem else 0.0

		# Absolute differences
		prem_dif = abs(gs_prem_count - llm_prem_count)
		func_apps_dif = abs(gs_funcs_apps - llm_funcs_apps)
		func_total_dif = abs(gs_total_funcs - llm_total_funcs)

		return round(iou + prem_dif + func_apps_dif + func_total_dif, 2)

	def logic_sim(self, column_name):
		"""
		Print the average LogicSim(x, y) with x = FOLIO answer, y = LLM answer.

		column_name = str ; Name of the column that stores the LLM answers.

		Nothing is returned; the rounded average is printed. An empty split
		prints an average of 0.0 instead of raising ZeroDivisionError.
		"""
		folio_column = self.ds['FOLIO']  # The dataset already has a column named 'FOLIO'.

		# TODO (from the original author): the column names are unreliable and
		# this lookup should be revisited.
		llm_column = self.ds[column_name]

		row_count = len(folio_column)
		# Indexing (rather than zip) keeps the original behaviour of failing
		# loudly if the LLM column is shorter than the FOLIO column.
		total = sum(
			self.compare_answers(folio_column[i], llm_column[i])
			for i in range(row_count)
		)
		average = total / row_count if row_count else 0.0
		print("LogicSim promedio: {}".format(round(average, 2)))

In [37]:
# Single-model run on the 'trans' split of the Llama-3.1-8B evaluation dataset.
a = Evaluation('Kurosawama/EVAL_Llama-3.1-8B', 'trans')
# The answers column is picked by position (index 1 of the split's features).
# NOTE(review): presumably that is the column holding the model answers - confirm.
column = [*a.ds.features.keys()][1]
a.logic_sim(column)

LogicSim promedio: 20.443743842364533


In [44]:
def trans_logic_sim(ds_name):
    """
    Print the average LogicSim of a dataset for its 'trans' and 'inference' splits.

    ds_name = str ; Name of the dataset on HuggingFace.

    For each split, in that order, the split is loaded and the column at
    index 1 of its own features is used as the LLM-answer column. Previously
    the 'inference' column was looked up on the 'trans' split by mistake.
    NOTE(review): presumably index 1 is the column with the model answers
    in both splits - confirm against the dataset schema.
    """
    for split in ('trans', 'inference'):
        evaluation = Evaluation(ds_name, split)
        column = list(evaluation.ds.features.keys())[1]
        evaluation.logic_sim(column)

# Evaluation datasets to score, one per model.
dataset_name = [
    'Kurosawama/EVAL_gemma-3-1b-it',
    'Kurosawama/EVAL_Llama-3.2-3B',
    'Kurosawama/EVAL_Llama-3.1-8B',
    'Kurosawama/EVAL_Llama-3.2-3B-Instruct',
    'Kurosawama/EVAL_Llama-3.1-8B-Instruct'
]

# A named loop variable is used instead of `_`: the value is actually read, and
# assigning to `_` at notebook top level overrides IPython's last-output shortcut.
for model_name in dataset_name:
    print("=====================")
    print("Modelo: {}".format(model_name))
    trans_logic_sim(model_name)

Modelo: Kurosawama/EVAL_gemma-3-1b-it
LogicSim promedio: 29.08
LogicSim promedio: 46.7
Modelo: Kurosawama/EVAL_Llama-3.2-3B
LogicSim promedio: 24.37
LogicSim promedio: 47.45
Modelo: Kurosawama/EVAL_Llama-3.1-8B
LogicSim promedio: 20.44
LogicSim promedio: 47.02
Modelo: Kurosawama/EVAL_Llama-3.2-3B-Instruct
LogicSim promedio: 25.76
LogicSim promedio: 56.58
Modelo: Kurosawama/EVAL_Llama-3.1-8B-Instruct
LogicSim promedio: 22.86
LogicSim promedio: 58.52


In [None]:
# Lo de arriba es de los modelos -LA

In [45]:
# Evaluation datasets with the '_BASE' suffix; only the uncommented entries are scored.
dataset_name = [
    #'Kurosawama/EVAL_gemma-3-1b-it_BASE',
    #'Kurosawama/EVAL_Llama-3.2-3B_BASE',
    'Kurosawama/EVAL_Llama-3.1-8B_BASE',
    #'Kurosawama/EVAL_Llama-3.2-3B-Instruct_BASE',
    #'Kurosawama/EVAL_Llama-3.1-8B-Instruct_BASE'
]

# A named loop variable is used instead of `_`: the value is actually read, and
# assigning to `_` at notebook top level overrides IPython's last-output shortcut.
for model_name in dataset_name:
    print("=====================")
    print("Modelo: {}".format(model_name))
    trans_logic_sim(model_name)

Modelo: Kurosawama/EVAL_Llama-3.1-8B_BASE


README.md:   0%|          | 0.00/444 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


trans-00000-of-00001.parquet:   0%|          | 0.00/71.4k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


inference-00000-of-00001.parquet:   0%|          | 0.00/42.8k [00:00<?, ?B/s]

Generating trans split:   0%|          | 0/203 [00:00<?, ? examples/s]

Generating inference split:   0%|          | 0/203 [00:00<?, ? examples/s]

LogicSim promedio: 20.69
LogicSim promedio: 45.9
