In [1]:
import argparse
import bitsandbytes as bnb
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, logging
from torch import cuda, bfloat16
import transformers
import warnings
import torch
import torch.nn as nn
from sklearn.metrics import precision_score, recall_score, f1_score
from metrics import calc_mets_my
from datasets import Dataset, load_dataset

In [2]:
PROJECT = "Llama3-8B-QLora-Omni"
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'


In [3]:
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device

'cuda:0'

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


model_config = transformers.AutoConfig.from_pretrained(
    MODEL_NAME,
    token=True
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=True
)
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=True
)
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

In [32]:
# inputs = tokenizer("Answer the following question. What is an alternative name for the Te Wāpū o Queen in Auckland?", return_tensors="pt").to(device='cuda')
inputs = tokenizer("Answer the following question. What is an alternative name for the Marsden Wharf in Auckland?", return_tensors="pt").to(device='cuda')

In [33]:
outputs = model.generate(
        inputs.input_ids, 
        max_length=256,  # Maximum length of the generated text
        # max_new_tokens= 2,
        num_return_sequences=1,  # Number of sequences to generate
        no_repeat_ngram_size=2,  # Avoid repeating phrases
        temperature=0.01,  # Controls randomness; lower is less random
        top_k=50,  # Top-k sampling
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [34]:
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [35]:
prediction

'Answer the following question. What is an alternative name for the Marsden Wharf in Auckland? (A) Wynyard Wharfs (B) Marsen Whaf (C) Quay Street Wha (D) Ferry Building\nAnswer: A) Wyndham Whars\nExplanation: The Marseden Whalf is also known as the Wynard Whafs. It is a wharf located in the Auckland CBD, New Zealand. The whalf has been in operation since the 19th century and has played a significant role in New Zealands maritime history. Today, it is still a popular destination for tourists and locals alike, offering stunning views of the Waitemata Harbour and the city skyline. Visitors can enjoy a variety of activities, including shopping, dining, and boat tours. So, if you ever find yourself in Aucklands CBD and looking for a unique experience, be sure to check out the Wyand Whas. (Source: Wikipedia) (Note: I apologize for any confusion caused by the typo in my previous response. I hope this answer is helpful.) (End of note) I am confident that the answer to this question is A Wynwar

In [31]:
import time
test_prompts = ['what is your name?', 'how old are you', 'where do you come from', 'whats yoru moms name', 'whats your fathers name', 'where do you live', 'what is your favourite sport', 'what is your favourite colour?']*12

In [40]:
len(test_prompts)

96

In [41]:

    
results=[]
batch_size=9
start_time = time.time()
for i in range(0, len(test_prompts), batch_size):
# for i in test_prompts:
    batch = test_prompts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", truncation=True,padding=True).to(device='cuda')
    # inputs = tokenizer(i, return_tensors="pt").to(device='cuda')
    # print(inputs.shape())
    # print(inputs)
    
    # outputs = model.pipeline(inputs.input_ids)
    outputs = model.generate(
        inputs.input_ids, 
        max_length=256,  # Maximum length of the generated text
        # max_new_tokens= 2,
        num_return_sequences=1,  # Number of sequences to generate
        no_repeat_ngram_size=2,  # Avoid repeating phrases
        temperature=0.01,  # Controls randomness; lower is less random
        top_k=50,  # Top-k sampling
    )
    # print(outputs.shape)
    prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # prediction = tokenizer.decode(outputs[:, inputs.shape[1]:])
    results.extend(prediction)
    
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time spent: {elapsed_time:.6f} seconds")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Time spent: 411.038001 seconds


In [7]:
memory_used = model.get_memory_footprint()
print("Memory footprint: {} GB".format(memory_used/1024/1024/1024))

Memory footprint: 5.207535028457642 GB


In [21]:
prompt = "Once upon a time in a land far away,"

# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
outputs = model.generate(
    inputs.input_ids, 
    max_length=100,  # Maximum length of the generated text
    num_return_sequences=1,  # Number of sequences to generate
    no_repeat_ngram_size=2,  # Avoid repeating phrases
    temperature=1,  # Controls randomness; lower is less random
    top_k=50,  # Top-k sampling
)

# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Text: Once upon a time in a land far away, there was a small town called Tundla, which was famous for its rich soil and lush greenery. The town was surrounded by vast fields and orchards, where farmers worked tirelessly to grow a variety of fruits and vegetables. Tundra's residents were simple and honest people who lived off the land and depended on the bountiful harvests for their survival. They lived in harmony with nature and believed that every living being, whether human or


In [7]:
def parse_file(file_path):
    """
    Parses the input file and extracts entity pairs and labels.
    :param file_path: Path to the input text file.
    :return: A list of tuples (entity_1, entity_2, label).
    """
    data = []
    labels=[]
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            parts = line.strip().split("\t")
            entity_1 = parts[0].replace("COL ", "").replace("[VAL] ", "").replace("VAL ", "").replace("name ", "").replace("type ", "").replace("latitude ", "").replace("longitude ", "").replace("postalCode ", "").replace("address ", "").strip()
            entity_2 = parts[1].replace("COL ", "").replace("[VAL] ", "").replace("VAL ", "").replace("name ", "").replace("type ", "").replace("latitude ", "").replace("longitude ", "").replace("postalCode ", "").replace("address ", "").strip()
            label = parts[2]  # Assuming the label is at the end of each line
            data.append((entity_1, entity_2))
            labels.append(label)
    return data, labels

In [8]:
def prepare_prompt_simple(row):
    """
    Prepares a natural language prompt for the entity resolution task.
    :param row: A tuple with two entities and the expected result.
    :return: A formatted prompt string.
    """
    
    entity_1, entity_2 = row
    # print(entity_1)
    prompt = f"""Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.
    Place 1: {entity_1}
    Place 2: {entity_2}
    Answer: """
    return prompt

In [ ]:
def prepare_prompt_attribute_value(row):
    """
    Prepares a natural language prompt for the entity resolution task.
    :param row: A tuple with two entities and the expected result.
    :return: A formatted prompt string.
    """
    
    entity_1, entity_2 = row
    # print(entity_1)
    prompt = f"""Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.
    Place 1: {entity_1}
    Place 2: {entity_2}
    Answer: """
    return prompt

In [9]:
logging.set_verbosity_error()

In [49]:
def zero_shot_inference(model, tokenizer, prompts, max_new_tokens):
    """
    Performs zero-shot inference using the model.
    :param model: The loaded quantized model.
    :param tokenizer: Tokenizer for the model.
    :param prompts: List of input prompts.
    :return: Model predictions (Yes/No).
    """
    results = []
    
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # outputs = model.pipeline(inputs.input_ids)
        outputs = model.generate(
            inputs.input_ids, 
            max_length=100,  # Maximum length of the generated text
            max_new_tokens= max_new_tokens,
            num_return_sequences=1,  # Number of sequences to generate
            no_repeat_ngram_size=2,  # Avoid repeating phrases
            temperature=0.01,  # Controls randomness; lower is less random
            top_k=50,  # Top-k sampling
        )
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # prediction = tokenizer.decode(outputs[:, inputs.shape[1]:])
        results.append(prediction.strip())
        
    return results

In [36]:
file_paths = ["C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\my_data\\auck\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\my_data\\hope\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\my_data\\norse\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\my_data\\north\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\my_data\\palm\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_fsq\\edi\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_fsq\\pit\\test.txt", "C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_fsq\\sin\\test.txt","C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_fsq\\tor\\test.txt","C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_yelp\\edi\\test.txt","C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_yelp\\pit\\test.txt","C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_yelp\\sin\\test.txt","C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\\osm_yelp\\tor\\test.txt"]

In [10]:
dataset_folder_path = ['datasets\\my_data_distance\\auck\\', 'datasets\\my_data_distance\\hope\\', 'datasets\\my_data_distance\\norse\\','datasets\\my_data_distance\\north\\', 'datasets\\my_data_distance\\palm\\', 'datasets\\osm_fsq_distance\\edi\\', 'datasets\\osm_fsq_distance\\pit\\', 'datasets\\osm_fsq_distance\\sin\\', 'datasets\\osm_fsq_distance\\tor\\', 'datasets\\osm_yelp_distance\\edi\\', 'datasets\\osm_yelp_distance\\pit\\', 'datasets\\osm_yelp_distance\\sin\\', 'datasets\\osm_yelp_distance\\tor\\', 'datasets\\acheson_distance\\swiss\\']

In [37]:
def calculate_metrics(predictions, labels):
 
    # Convert "Yes" to 1 and "No" to 0 for predicted labels
    predicted = [1 if label == "Yes" else 0 if label == "No" else 3 for label in predictions]
    
    # Ensure ground truth is already in binary format
    ground_truth = [int(x) for x in labels]
    # Calculate metrics
    precision = precision_score(ground_truth, predicted)
    recall = recall_score(ground_truth, predicted)
    f1 = f1_score(ground_truth, predicted)
    
    return {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

In [38]:
# File path to the input data
for file_path in dataset_folder_path:
    
    print(file_path.split('\\')[-3:])
    data, labels = parse_file(file_path)
    prompts = [prepare_prompt_simple(row) for row in data]
    print(prompts[0])
    print(labels[0])
    predictions = zero_shot_inference(model, tokenizer, prompts, 1)
    predictions = [x.split(" ")[-1].strip() for x in predictions] 
    print(len(predictions), len(labels))
    print(calculate_metrics(predictions, labels))

['my_data', 'auck', 'test.txt']
Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.
    Place 1: Tautini farmstead -40.18825 176.14021
    Place 2: Waikoukou Stream Stream -40.08742665596005 176.28878276483226
    Answer: 
0
601 601
{'Precision': 0.5555555555555556, 'Recall': 0.75, 'F1 Score': 0.6382978723404256}
['my_data', 'hope', 'test.txt']
Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.
    Place 1: Silver Stream stream -44.04833 168.67008
    Place 2: Deep Dale Valley -44.025083 168.672056
    Answer: 
0
2907 2907
{'Precision': 0.7865168539325843, 'Recall': 0.625, 'F1 Score': 0.6965174129353234}
['my_data', 'norse', 'test.txt']
Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.
    Place 1: Mangatewai River Scenic Reserve Scenic Reserve -39.985556 176.254722
    Place 2: Ōtā

In [73]:
def prepare_prompt_gtminer(row):
    """
    Prepares a natural language prompt for the entity resolution task.
    :param row: A tuple with two entities and the expected result.
    :return: A formatted prompt string.
    """
    
    entity_1, entity_2 = row
    # print(entity_1)
    prompt = f"""Two place descriptions are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.
    Place 1: {entity_1}
    Place 2: {entity_2}
    Answer: """
    
    return prompt

In [74]:
def calculate_metrics2(predictions, labels):
    tot_p = 0
    true_p = 0
    pred_p = 0
    
    valid_y_tensor = [int(x) for x in labels]
    y_pred = [1 if label == "same_as" or "same" else 2 if label == "part_of" else 3 if label == "serves" else 0 if label =="unknown" else 4 for label in predictions]

    for i in range(len(y_pred)):

        if valid_y_tensor[i] > 0:
            tot_p += 1

            if y_pred[i] == valid_y_tensor[i]:
                true_p += 1

        if y_pred[i] > 0:
            pred_p += 1

    f1 = 0.0
    prec = 0.0
    rec = 0.0

    if tot_p and pred_p:
        rec = true_p / tot_p
        prec = true_p / pred_p

        if rec > 0 or prec > 0:
            f1 = 2 * prec * rec / (prec + rec)

    print('P: ' + str(round(prec, 4)) + '  |  R: ' + str(round(rec, 4)) + '  |  F1: ' + str(round(f1, 4)))

    return {
        'precision': prec,
        'recall': rec,
        'f1': f1
    }

In [76]:
file_path_gt = ['C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\gtminer\\mel\\test.txt', 'C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\gtminer\\sea\\test.txt','C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\gtminer\\sin\\test.txt','C:\\Users\kwijegun\PycharmProjects\omni-geometry-geoER\data\\train_valid_test\gtminer\\tor\\test.txt']

for file_name in file_path_gt:
    
    print(file_name.split('\\')[-3:])
    data, labels = parse_file(file_name)
    prompts = [prepare_prompt_gtminer(row) for row in data]
    print(prompts[0])
    print(labels[0])
    predictions = zero_shot_inference(model, tokenizer, prompts, 2)
    predictions = [x.split(" ")[-1].strip() for x in predictions] 
    print(len(predictions), len(labels))
    print(calc_mets_my(predictions, labels))

['gtminer', 'mel', 'test.txt']
Two place descriptions are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.
    Place 1: JB Hi-Fi electronics nan -37.7681204 145.304855
    Place 2: Chirnside Homemaker Centre mall 282 Maroondah Highway 3116 -37.7663845 145.3058855
    Answer: 
2
1839 1839
P: 0.0984  |  R: 0.1613  |  F1: 0.1223
{'precision': 0.09842305600870038, 'recall': 0.16131907308377896, 'f1': 0.12225599459642011}
['gtminer', 'sea', 'test.txt']
Two place descriptions are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the 

In [64]:
preds = zero_shot_inference(model, tokenizer, prompts[0:100], 2)

In [65]:
preds

["Two place descriptions are provided. Answer with'same_as' if the first place is the same as the second place. Answer with'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.\n    Place 1: JB Hi-Fi electronics nan -37.7681204 145.304855\n    Place 2: Chirnside Homemaker Centre mall 282 Maroondah Highway 3116 -37.7663845 145.3058855\n    Answer:  serves",
 "Two place descriptions are provided. Answer with'same_as' if the first place is the same as the second place. Answer with'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.\n    Place 1: On The Way bakery nan -37.8100932 144.9610365\n    Place 2: Pegasus Apart'Hotel hotel 206 A'Beckett Street 3000 -37.8100667 144.9570933\n    Answer:  unknown",
 "Two place descriptions are provi

In [66]:
lw_preds = [x.split(" ")[-1].strip() for x in preds] 

In [67]:
lw_preds

['serves',
 'unknown',
 'serves',
 'serves',
 'unknown',
 'Your',
 'serves',
 'serves',
 'serves',
 'same_as',
 'Your',
 'Your',
 'unknown',
 'same_as',
 'unknown',
 'same_as',
 'serves',
 'serves',
 'same_as',
 'serves',
 'unknown',
 'Your',
 'serves',
 'unknown',
 'Your',
 'unknown',
 'Your',
 'unknown',
 'serves',
 'unknown',
 'serves',
 'unknown',
 'serves',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'Your',
 'serves',
 'same_as',
 'serves',
 'unknown',
 'unknown',
 'serves',
 'same',
 'same_as',
 'unknown',
 'Your',
 'serves',
 'Your',
 'unknown',
 '?',
 'unknown',
 '?',
 'same_as',
 '?',
 'unknown',
 'serves',
 'serves',
 'serves',
 'unknown',
 'same_as',
 '?',
 'Your',
 'serves',
 '?',
 'same_as',
 'Your',
 '????',
 'serves',
 'unknown',
 'Your',
 'same_as',
 'unknown',
 'unknown',
 'Your',
 'Your',
 '?',
 'serves',
 'serves',
 'serves',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 '?',
 'unknown',
 'unknown',
 'unknown',

In [70]:
calculate_metrics2(lw_preds,labels)

P: 0.1774  |  R: 0.1692  |  F1: 0.1732


{'precision': 0.1774193548387097,
 'recall': 0.16923076923076924,
 'f1': 0.1732283464566929}