In [1]:
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM, TaskType, PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, logging, pipeline
from torch import cuda, bfloat16
import transformers
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer
import pandas as pd
from textwrap import dedent
from datasets import Dataset, load_dataset, concatenate_datasets
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
from metrics import  calculate_metrics, calculate_metrics2, calc_mets_my
import gc
import sys
import time


In [2]:
# Label of this experiment series (not referenced by the cells shown in this notebook).
PROJECT = 'Llama3-8B-QLora-FineTune-Omni'
# Hugging Face Hub id of the checkpoint that every evaluation cell loads.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

In [3]:
# Quantization settings handed to from_pretrained by the evaluation cells:
# 4-bit NF4 weights with double quantization, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Model configuration fetched once and reused for every model load.
# token=True is passed through to the Hub client (authenticated download).
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=True)

In [4]:
# LoRA adapter settings. The evaluation cells shown in this notebook keep their
# get_peft_model(...) calls commented out, so this object is only defined here.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Attention projections first, then the MLP projections.
    target_modules=(
        [f"self_attn.{name}_proj" for name in ("q", "k", "v", "o")]
        + [f"mlp.{name}_proj" for name in ("gate", "up", "down")]
    ),
)

In [5]:
def format_test_distance(row, examples):
    """Build the few-shot yes/no matching prompt that includes the distance.

    Args:
        row: Mapping with the keys "e1", "e2" and "distance" describing the
            pair to classify.
        examples: Iterable of mappings with the keys "e1", "e2", "distance"
            and "answer"; each one is rendered as a solved demonstration.

    Returns:
        str: The instruction, one solved block per example, then the unsolved
        block for ``row``, ending with "Answer: ".
    """
    instruction = "Two place descriptions and the geographic distance between them is provided. Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not."

    def render_pair(pair):
        # The lines are assembled directly instead of dedenting an interpolated
        # template: dedent() computes its margin from the final text, so a field
        # value containing a line break used to leave every line indented.
        return (
            f"\nPlace 1: '{pair['e1']}'"
            f"\nPlace 2: '{pair['e2']}'"
            f"\nDistance: {pair['distance']}\n"
        )

    samples = "".join(
        render_pair(shot) + f"Answer: {shot['answer']}\n" for shot in examples
    )
    return instruction + samples + render_pair(row) + "Answer: "
    

In [6]:
def format_test_gtminer_distance(row, examples):
    """Build the few-shot relation prompt (same_as/part_of/serves/unknown) with distance.

    Args:
        row: Mapping with the keys "e1", "e2" and "distance" describing the
            pair to classify.
        examples: Iterable of mappings with the keys "e1", "e2", "distance"
            and "answer"; each one is rendered as a solved demonstration.

    Returns:
        str: The instruction, one solved block per example, then the unsolved
        block for ``row``, ending with "Answer: ".
    """
    instruction = "Two place descriptions and the geographic distance between them is provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations."

    def render_pair(pair):
        # The lines are assembled directly instead of dedenting an interpolated
        # template: dedent() computes its margin from the final text, so a field
        # value containing a line break used to leave every line indented.
        return (
            f"\nPlace 1: '{pair['e1']}'"
            f"\nPlace 2: '{pair['e2']}'"
            f"\nDistance: {pair['distance']}\n"
        )

    samples = "".join(
        render_pair(shot) + f"Answer: {shot['answer']}\n" for shot in examples
    )
    return instruction + samples + render_pair(row) + "Answer: "
    

In [7]:
def format_test_att_val(row, examples):
    """Build the few-shot yes/no matching prompt from the two descriptions only.

    Args:
        row: Mapping with the keys "e1" and "e2" describing the pair to
            classify.
        examples: Iterable of mappings with the keys "e1", "e2" and "answer";
            each one is rendered as a solved demonstration.

    Returns:
        str: The instruction, one solved block per example, then the unsolved
        block for ``row``, ending with "Answer: ".
    """
    instruction = "Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not."

    def render_pair(pair):
        # Every line starts at column 0. The previous dedent-based template had
        # its "Answer" line indented one space less than the "Place" lines, so
        # the demonstrations came out as " Place 1: ..." (stray leading space)
        # and no longer matched the layout of the block to be answered.
        return f"\nPlace 1: '{pair['e1']}'" f"\nPlace 2: '{pair['e2']}'\n"

    samples = "".join(
        render_pair(shot) + f"Answer: {shot['answer']}\n" for shot in examples
    )
    return instruction + samples + render_pair(row) + "Answer: "

In [8]:
def format_test_gtminer_att_val(row, examples):
    """Build the few-shot relation prompt (same_as/part_of/serves/unknown) without distance.

    Args:
        row: Mapping with the keys "e1" and "e2" describing the pair to
            classify.
        examples: Iterable of mappings with the keys "e1", "e2" and "answer";
            each one is rendered as a solved demonstration.

    Returns:
        str: The instruction, one solved block per example, then the unsolved
        block for ``row``, ending with "Answer: ".
    """
    instruction = "Two place descriptions are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations."

    def render_pair(pair):
        # The lines are assembled directly instead of dedenting an interpolated
        # template: dedent() computes its margin from the final text, so a field
        # value containing a line break used to leave every line indented.
        return f"\nPlace 1: '{pair['e1']}'" f"\nPlace 2: '{pair['e2']}'\n"

    samples = "".join(
        render_pair(shot) + f"Answer: {shot['answer']}\n" for shot in examples
    )
    return instruction + samples + render_pair(row) + "Answer: "

In [9]:
# Folders of the yes/no "distance" datasets, written as
# datasets\<collection>\<area>\ (backslash-separated, trailing separator kept
# because the evaluation cell appends the file name directly).
dataset_folder_path = [
    "\\".join(("datasets", collection, area, ""))
    for collection, areas in (
        ("my_data_distance", ("auck", "hope", "norse", "north", "palm")),
        ("osm_fsq_distance", ("edi", "pit", "sin", "tor")),
        ("osm_yelp_distance", ("edi", "pit", "sin", "tor")),
        ("acheson_distance", ("swiss",)),
    )
    for area in areas
]

In [None]:
# Class-balanced few-shot evaluation of the yes/no matching task (prompts with
# distance). For every folder in dataset_folder_path: load the JSON splits, draw
# up to two "Yes" and two "No" demonstrations from train, generate one answer per
# test row, score it and append a report to class_balanced_distance_results.txt.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Inference never modifies the weights, so the quantized model and its tokenizer
# are loaded once for all datasets instead of once per dataset.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=True
)
model.eval()
print("loaded model........")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, token=True)
# Reuse EOS as the pad token so generate() receives an explicit pad id.
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"
print("loaded tokenizer........")

try:
    for dataset_folder in dataset_folder_path:
        # Accept either path separator; the last two components name the
        # dataset collection and the area.
        folder = dataset_folder.replace("\\", "/").rstrip("/") + "/"
        path_parts = folder.strip("/").split("/")
        dataset_output_path_1, dataset_output_path_2 = path_parts[-2], path_parts[-1]
        print(path_parts)

        dataset = load_dataset(
            "json",
            data_files={split: f"{folder}{split}.json" for split in ("train", "valid", "test")},
        )
        print("successfully loaded dataset.......")

        train_data = dataset["train"]
        print(train_data[0]['text'])

        # Two demonstrations per class (fewer if a class is that small), then
        # shuffled so the classes are interleaved in the prompt.
        yes_class = train_data.filter(lambda x: x["answer"] == "Yes")
        no_class = train_data.filter(lambda x: x["answer"] == "No")
        yes_samples = yes_class.shuffle(seed=42).select(range(min(2, len(yes_class))))
        no_samples = no_class.shuffle(seed=42).select(range(min(2, len(no_class))))
        random_samples = concatenate_datasets([yes_samples, no_samples]).shuffle(seed=42)

        test_prompts = [format_test_distance(x, random_samples) for x in dataset['test']]
        if test_prompts:
            print(test_prompts[0])

        results = []      # prompt + answer, as written to the report
        raw_answers = []  # newly generated text only, used for scoring
        start_time_test = time.time()
        with torch.no_grad():
            for prompt in test_prompts:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                # - max_length is not passed: it conflicts with max_new_tokens.
                # - Greedy decoding is requested explicitly: temperature/top_k
                #   only matter when sampling, and an evaluation must be repeatable.
                # - no_repeat_ngram_size is not passed: the ban also counts
                #   n-grams of the prompt, which already contains every label.
                outputs = model.generate(
                    **inputs,  # input_ids and attention_mask
                    max_new_tokens=1,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
                prompt_length = inputs["input_ids"].shape[1]
                results.append(tokenizer.decode(outputs[0], skip_special_tokens=True).strip())
                raw_answers.append(
                    tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
                )
        elapsed_time_test = time.time() - start_time_test
        print("testing completed........")

        # 1 = Yes, 0 = No, 2 = anything else the model produced.
        predictions = [{"Yes": 1, "No": 0}.get(answer, 2) for answer in raw_answers]
        labels = [1 if label == "Yes" else 0 for label in dataset['test']['answer']]
        print(path_parts)

        # Each metric is best effort: a failure (e.g. 'binary' averaging when an
        # unparseable prediction adds a third class) is reported, not fatal.
        metric_reports = []
        for metric_name, compute in (
            ("my calc", lambda: calc_mets_my(predictions, labels)),
            ("binary", lambda: calculate_metrics(predictions, labels, 'binary')),
            ("micro", lambda: calculate_metrics(predictions, labels, 'micro')),
            ("macro", lambda: calculate_metrics(predictions, labels, 'macro')),
        ):
            try:
                report = compute()
            except Exception as e:
                print(e)
                report = f"{metric_name} failed"
            print(report)
            metric_reports.append(report)
        my_mets, bin_mets, micro_mets, macro_mets = metric_reports

        # Same line layout as before, including the blank separator lines.
        report_lines = [
            f"{dataset_output_path_1}{dataset_output_path_2}",
            str(train_data[0]['text']),
            results[0] if results else "",
            "",
            str(my_mets),
            str(bin_mets),
            str(micro_mets),
            str(macro_mets),
            "",
            str(elapsed_time_test),
            "",
            "********************************",
            "",
        ]
        with open("class_balanced_distance_results.txt", "a", encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")
finally:
    # Release the GPU memory even if one dataset fails.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    


In [11]:
# Folders of the yes/no "att_val" datasets, written as
# datasets\<collection>\<area>\ (backslash-separated, trailing separator kept
# because the evaluation cell appends the file name directly).
dataset_folder_path = [
    "\\".join(("datasets", collection, area, ""))
    for collection, areas in (
        ("my_data_att_val", ("auck", "hope", "norse", "north", "palm")),
        ("osm_fsq_att_val", ("edi", "pit", "sin", "tor")),
        ("osm_yelp_att_val", ("edi", "pit", "sin", "tor")),
        ("acheson_att_val", ("swiss",)),
    )
    for area in areas
]

In [None]:
# Class-balanced few-shot evaluation of the yes/no matching task (prompts
# without distance). For every folder in dataset_folder_path: load the JSON
# splits, draw up to two "Yes" and two "No" demonstrations from train, generate
# one answer per test row, score it and append a report to
# class_balanced_att_val_results.txt.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Inference never modifies the weights, so the quantized model and its tokenizer
# are loaded once for all datasets instead of once per dataset.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=True
)
model.eval()
print("loaded model........")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, token=True)
# Reuse EOS as the pad token so generate() receives an explicit pad id.
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"
print("loaded tokenizer........")

try:
    for dataset_folder in dataset_folder_path:
        # Accept either path separator; the last two components name the
        # dataset collection and the area.
        folder = dataset_folder.replace("\\", "/").rstrip("/") + "/"
        path_parts = folder.strip("/").split("/")
        dataset_output_path_1, dataset_output_path_2 = path_parts[-2], path_parts[-1]
        print(path_parts)

        dataset = load_dataset(
            "json",
            data_files={split: f"{folder}{split}.json" for split in ("train", "valid", "test")},
        )
        print("successfully loaded dataset.......")

        train_data = dataset["train"]
        print(train_data[0]['text'])

        # Two demonstrations per class (fewer if a class is that small), then
        # shuffled so the classes are interleaved in the prompt.
        yes_class = train_data.filter(lambda x: x["answer"] == "Yes")
        no_class = train_data.filter(lambda x: x["answer"] == "No")
        yes_samples = yes_class.shuffle(seed=42).select(range(min(2, len(yes_class))))
        no_samples = no_class.shuffle(seed=42).select(range(min(2, len(no_class))))
        random_samples = concatenate_datasets([yes_samples, no_samples]).shuffle(seed=42)

        test_prompts = [format_test_att_val(x, random_samples) for x in dataset['test']]
        if test_prompts:
            print(test_prompts[0])

        results = []      # prompt + answer, as written to the report
        raw_answers = []  # newly generated text only, used for scoring
        start_time_test = time.time()
        with torch.no_grad():
            for prompt in test_prompts:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                # - max_length is not passed: it conflicts with max_new_tokens.
                # - Greedy decoding is requested explicitly: temperature/top_k
                #   only matter when sampling, and an evaluation must be repeatable.
                # - no_repeat_ngram_size is not passed: the ban also counts
                #   n-grams of the prompt, which already contains every label.
                outputs = model.generate(
                    **inputs,  # input_ids and attention_mask
                    max_new_tokens=1,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
                prompt_length = inputs["input_ids"].shape[1]
                results.append(tokenizer.decode(outputs[0], skip_special_tokens=True).strip())
                raw_answers.append(
                    tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
                )
        elapsed_time_test = time.time() - start_time_test
        print("testing completed........")

        # 1 = Yes, 0 = No, 2 = anything else the model produced.
        predictions = [{"Yes": 1, "No": 0}.get(answer, 2) for answer in raw_answers]
        labels = [1 if label == "Yes" else 0 for label in dataset['test']['answer']]
        print(path_parts)

        # Each metric is best effort: a failure (e.g. 'binary' averaging when an
        # unparseable prediction adds a third class) is reported, not fatal.
        metric_reports = []
        for metric_name, compute in (
            ("my calc", lambda: calc_mets_my(predictions, labels)),
            ("binary", lambda: calculate_metrics(predictions, labels, 'binary')),
            ("micro", lambda: calculate_metrics(predictions, labels, 'micro')),
            ("macro", lambda: calculate_metrics(predictions, labels, 'macro')),
        ):
            try:
                report = compute()
            except Exception as e:
                print(e)
                report = f"{metric_name} failed"
            print(report)
            metric_reports.append(report)
        my_mets, bin_mets, micro_mets, macro_mets = metric_reports

        # Same line layout as before, including the blank separator lines.
        report_lines = [
            f"{dataset_output_path_1}{dataset_output_path_2}",
            str(train_data[0]['text']),
            results[0] if results else "",
            "",
            str(my_mets),
            str(bin_mets),
            str(micro_mets),
            str(macro_mets),
            "",
            str(elapsed_time_test),
            "",
            "********************************",
            "",
        ]
        with open("class_balanced_att_val_results.txt", "a", encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")
finally:
    # Release the GPU memory even if one dataset fails.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    


In [13]:
# Folders of the GTMiner "distance" datasets (backslash-separated, trailing
# separator kept because the evaluation cell appends the file name directly).
# Every backslash is escaped: the former 'datasets\g...' spelling relied on the
# invalid escape sequence "\g", which Python only tolerates with a warning.
dataset_folder_path = [
    'datasets\\gtminer_distance\\mel\\',
    'datasets\\gtminer_distance\\sea\\',
    'datasets\\gtminer_distance\\sin\\',
    'datasets\\gtminer_distance\\tor\\',
]

In [None]:
# Class-balanced few-shot evaluation of the GTMiner relation task (prompts with
# distance). For every folder in dataset_folder_path: load the JSON splits, draw
# up to two demonstrations per relation from train, generate one answer per test
# row, score it and append a report to gtminer_class_balanced_distance_results.txt.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Inference never modifies the weights, so the quantized model and its tokenizer
# are loaded once for all datasets instead of once per dataset.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=True
)
model.eval()
print("loaded model........")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, token=True)
# Reuse EOS as the pad token so generate() receives an explicit pad id.
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"
print("loaded tokenizer........")

try:
    for dataset_folder in dataset_folder_path:
        # Accept either path separator; the last two components name the
        # dataset collection and the area.
        folder = dataset_folder.replace("\\", "/").rstrip("/") + "/"
        path_parts = folder.strip("/").split("/")
        dataset_output_path_1, dataset_output_path_2 = path_parts[-2], path_parts[-1]
        print(path_parts)

        dataset = load_dataset(
            "json",
            data_files={split: f"{folder}{split}.json" for split in ("train", "valid", "test")},
        )
        print("successfully loaded dataset.......")

        train_data = dataset["train"]
        print(train_data[0]['text'])

        # Two demonstrations per relation (fewer if a class is that small),
        # then shuffled so the classes are interleaved in the prompt.
        same_as_class = train_data.filter(lambda x: x["answer"] == "same_as")
        part_of_class = train_data.filter(lambda x: x["answer"] == "part_of")
        serves_class = train_data.filter(lambda x: x["answer"] == "serves")
        unknown_class = train_data.filter(lambda x: x["answer"] == "unknown")
        random_samples = concatenate_datasets(
            [
                subset.shuffle(seed=42).select(range(min(2, len(subset))))
                for subset in (same_as_class, part_of_class, serves_class, unknown_class)
            ]
        ).shuffle(seed=42)

        test_prompts = [format_test_gtminer_distance(x, random_samples) for x in dataset['test']]
        if test_prompts:
            print(test_prompts[0])

        results = []      # prompt + answer, as written to the report
        raw_answers = []  # newly generated text only, used for scoring
        start_time_test = time.time()
        with torch.no_grad():
            for prompt in test_prompts:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                # - max_length is not passed: it conflicts with max_new_tokens.
                # - Greedy decoding is requested explicitly: temperature/top_k
                #   only matter when sampling, and an evaluation must be repeatable.
                # - no_repeat_ngram_size is not passed: the ban also counts
                #   n-grams of the prompt, and the prompt spells out every label,
                #   so a multi-token label such as "same_as" could not be
                #   completed the way the prompt writes it.
                outputs = model.generate(
                    **inputs,  # input_ids and attention_mask
                    max_new_tokens=2,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
                prompt_length = inputs["input_ids"].shape[1]
                results.append(tokenizer.decode(outputs[0], skip_special_tokens=True).strip())
                generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
                # Keep the first word only, so a one-token label followed by the
                # start of another line is still recognised.
                words = generated.split()
                raw_answers.append(words[0] if words else "")
        elapsed_time_test = time.time() - start_time_test
        print("testing completed........")

        # Truncated / re-spelled labels are still accepted as a tolerant fallback.
        prediction_ids = {
            "same_as": 1, "same": 1, "same-as": 1,
            "part_of": 2, "part-of": 2, "partof": 2,
            "serves": 3, "served": 3,
            "unknown": 0,
        }
        label_ids = {"same_as": 1, "part_of": 2, "serves": 3, "unknown": 0}
        # 4 marks an unparseable prediction, 5 an unexpected gold label.
        predictions = [prediction_ids.get(answer, 4) for answer in raw_answers]
        labels = [label_ids.get(label, 5) for label in dataset['test']['answer']]
        print(path_parts)

        # Each metric is best effort: a failure (e.g. 'binary' averaging on a
        # multi-class target) is reported, not fatal.
        metric_reports = []
        for metric_name, compute in (
            ("my calc", lambda: calculate_metrics2(predictions, labels)),
            ("binary", lambda: calculate_metrics(predictions, labels, 'binary')),
            ("micro", lambda: calculate_metrics(predictions, labels, 'micro')),
            ("macro", lambda: calculate_metrics(predictions, labels, 'macro')),
        ):
            try:
                report = compute()
            except Exception as e:
                print(e)
                report = f"{metric_name} failed"
            print(report)
            metric_reports.append(report)
        my_mets, bin_mets, micro_mets, macro_mets = metric_reports

        # Same line layout as before, including the blank separator lines.
        report_lines = [
            f"{dataset_output_path_1}{dataset_output_path_2}",
            str(train_data[0]['text']),
            results[0] if results else "",
            "",
            str(my_mets),
            str(bin_mets),
            str(micro_mets),
            str(macro_mets),
            "",
            str(elapsed_time_test),
            "",
            "********************************",
            "",
        ]
        with open("gtminer_class_balanced_distance_results.txt", "a", encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")
finally:
    # Release the GPU memory even if one dataset fails.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

In [15]:
# Folders of the GTMiner "att_val" datasets (backslash-separated, trailing
# separator kept because the evaluation cell appends the file name directly).
# Every backslash is escaped: the former 'datasets\g...' spelling relied on the
# invalid escape sequence "\g", which Python only tolerates with a warning.
dataset_folder_path = [
    'datasets\\gtminer_att_val\\mel\\',
    'datasets\\gtminer_att_val\\sea\\',
    'datasets\\gtminer_att_val\\sin\\',
    'datasets\\gtminer_att_val\\tor\\',
]


In [None]:
# Class-balanced few-shot evaluation of the GTMiner relation task (prompts
# without distance). For every folder in dataset_folder_path: load the JSON
# splits, draw up to two demonstrations per relation from train, generate one
# answer per test row, score it and append a report to
# gtminer_class_balanced_att_val_results.txt.
logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Inference never modifies the weights, so the quantized model and its tokenizer
# are loaded once for all datasets instead of once per dataset.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=True
)
model.eval()
print("loaded model........")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, token=True)
# Reuse EOS as the pad token so generate() receives an explicit pad id.
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"
print("loaded tokenizer........")

try:
    for dataset_folder in dataset_folder_path:
        # Accept either path separator; the last two components name the
        # dataset collection and the area.
        folder = dataset_folder.replace("\\", "/").rstrip("/") + "/"
        path_parts = folder.strip("/").split("/")
        dataset_output_path_1, dataset_output_path_2 = path_parts[-2], path_parts[-1]
        print(path_parts)

        dataset = load_dataset(
            "json",
            data_files={split: f"{folder}{split}.json" for split in ("train", "valid", "test")},
        )
        print("successfully loaded dataset.......")

        train_data = dataset["train"]
        print(train_data[0]['text'])

        # Two demonstrations per relation (fewer if a class is that small),
        # then shuffled so the classes are interleaved in the prompt.
        same_as_class = train_data.filter(lambda x: x["answer"] == "same_as")
        part_of_class = train_data.filter(lambda x: x["answer"] == "part_of")
        serves_class = train_data.filter(lambda x: x["answer"] == "serves")
        unknown_class = train_data.filter(lambda x: x["answer"] == "unknown")
        random_samples = concatenate_datasets(
            [
                subset.shuffle(seed=42).select(range(min(2, len(subset))))
                for subset in (same_as_class, part_of_class, serves_class, unknown_class)
            ]
        ).shuffle(seed=42)

        test_prompts = [format_test_gtminer_att_val(x, random_samples) for x in dataset['test']]
        if test_prompts:
            print(test_prompts[0])

        results = []      # prompt + answer, as written to the report
        raw_answers = []  # newly generated text only, used for scoring
        start_time_test = time.time()
        with torch.no_grad():
            for prompt in test_prompts:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                # - max_length is not passed: it conflicts with max_new_tokens.
                # - Greedy decoding is requested explicitly: temperature/top_k
                #   only matter when sampling, and an evaluation must be repeatable.
                # - no_repeat_ngram_size is not passed: the ban also counts
                #   n-grams of the prompt, and the prompt spells out every label,
                #   so a multi-token label such as "same_as" could not be
                #   completed the way the prompt writes it.
                outputs = model.generate(
                    **inputs,  # input_ids and attention_mask
                    max_new_tokens=2,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
                prompt_length = inputs["input_ids"].shape[1]
                results.append(tokenizer.decode(outputs[0], skip_special_tokens=True).strip())
                generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
                # Keep the first word only, so a one-token label followed by the
                # start of another line is still recognised.
                words = generated.split()
                raw_answers.append(words[0] if words else "")
        elapsed_time_test = time.time() - start_time_test
        print("testing completed........")

        # Truncated / re-spelled labels are still accepted as a tolerant fallback.
        prediction_ids = {
            "same_as": 1, "same": 1, "same-as": 1,
            "part_of": 2, "part-of": 2, "partof": 2,
            "serves": 3, "served": 3,
            "unknown": 0,
        }
        label_ids = {"same_as": 1, "part_of": 2, "serves": 3, "unknown": 0}
        # 4 marks an unparseable prediction, 5 an unexpected gold label.
        predictions = [prediction_ids.get(answer, 4) for answer in raw_answers]
        labels = [label_ids.get(label, 5) for label in dataset['test']['answer']]
        print(path_parts)

        # Each metric is best effort: a failure (e.g. 'binary' averaging on a
        # multi-class target) is reported, not fatal.
        metric_reports = []
        for metric_name, compute in (
            ("my calc", lambda: calculate_metrics2(predictions, labels)),
            ("binary", lambda: calculate_metrics(predictions, labels, 'binary')),
            ("micro", lambda: calculate_metrics(predictions, labels, 'micro')),
            ("macro", lambda: calculate_metrics(predictions, labels, 'macro')),
        ):
            try:
                report = compute()
            except Exception as e:
                print(e)
                report = f"{metric_name} failed"
            print(report)
            metric_reports.append(report)
        my_mets, bin_mets, micro_mets, macro_mets = metric_reports

        # Same line layout as before, including the blank separator lines.
        report_lines = [
            f"{dataset_output_path_1}{dataset_output_path_2}",
            str(train_data[0]['text']),
            results[0] if results else "",
            "",
            str(my_mets),
            str(bin_mets),
            str(micro_mets),
            str(macro_mets),
            "",
            str(elapsed_time_test),
            "",
            "********************************",
            "",
        ]
        with open("gtminer_class_balanced_att_val_results.txt", "a", encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")
finally:
    # Release the GPU memory even if one dataset fails.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()