In [52]:
import os
import transformers
import pandas as pd
from textwrap import dedent
import math


In [53]:
# Hugging Face Hub identifier of the checkpoint whose tokenizer is loaded in the next cell.
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'



In [54]:
# Load the tokenizer published under MODEL_NAME.
# NOTE(review): token=True presumably tells the hub client to authenticate with the
# locally stored Hugging Face access token (the checkpoint looks gated) — confirm
# against the transformers documentation.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=True
)

In [55]:
# Reuse the end-of-sequence token as the padding token and pad on the right-hand side.
# NOTE(review): presumably required because this tokenizer ships without a dedicated
# pad token — confirm.
PAD_TOKEN = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

In [56]:

def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """
    Calculate the great-circle distance between two points on a sphere.

    Parameters:
    lat1, lon1: Latitude and longitude of the first point in decimal degrees.
    lat2, lon2: Latitude and longitude of the second point in decimal degrees.
    radius_km: Sphere radius in kilometers (defaults to 6371.0, the value
        this notebook uses for the Earth).

    Returns:
    The distance formatted as a string with two decimals and a "km" suffix,
    e.g. "12.34km".
    """
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return f"{radius_km * c:.2f}km"

In [57]:
def get_lat_long(entity):
    """
    Extract the latitude/longitude pair from a serialized entity string.

    The string is lower-cased and split on whitespace. A latitude is the token
    that directly follows the two tokens "latitude" "val"; the longitude is
    read four tokens after the latitude (presumably the layout is
    "COL latitude VAL <lat> COL longitude VAL <lon>" — confirm against the
    data files). If the pattern occurs more than once, the last occurrence wins.

    :param entity: Serialized entity description.
    :return: Tuple (latitude, longitude) of floats.
    :raises ValueError: If no latitude marker is found, if the record ends
        before the longitude position, or if a coordinate token is not numeric.
    """
    words = entity.lower().split()
    latitude = longitude = None

    # Start at 2: for smaller indices words[i - 2] / words[i - 1] would wrap
    # around to the end of the list and could match there by accident.
    for i in range(2, len(words)):
        if words[i - 2] == 'latitude' and words[i - 1] == 'val':
            if i + 4 >= len(words):
                raise ValueError(
                    f"entity ends before the longitude value: {entity!r}"
                )
            latitude = float(words[i])
            longitude = float(words[i + 4])

    if latitude is None:
        raise ValueError(f"no 'latitude VAL <number>' found in entity: {entity!r}")
    return latitude, longitude

In [58]:
def parse_file(file_path):
    """
    Parse a tab-separated pair file into marker-free entity texts, distances and labels.

    Every non-blank line must hold three tab-separated fields: entity 1,
    entity 2 and the label. Coordinates are read from the raw entities with
    get_lat_long, then the serialization markers and attribute names are
    removed from the entity texts.

    :param file_path: Path to the input text file.
    :return: Tuple (data, labels). data is a list of tuples
        (entity_1, entity_2, distance) where distance is the string returned
        by haversine_distance; labels is the list of label strings in the
        same order.
    """
    # Removed in exactly this order. These are plain substring replacements,
    # so marker text that happens to occur inside a value (e.g. "prototype ")
    # is removed as well.
    markers = (
        "COL ", "[VAL] ", "VAL ", "name ", "type ", "latitude ",
        "longitude ", "postalCode ", "address ",
    )

    def _strip_markers(raw):
        for marker in markers:
            raw = raw.replace(marker, "")
        return raw.strip()

    data = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materializing the file with readlines().
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            parts = stripped.split("\t")
            ent_1_lat, ent_1_lon = get_lat_long(parts[0])
            ent_2_lat, ent_2_lon = get_lat_long(parts[1])
            label = parts[2]
            dist = haversine_distance(ent_1_lat, ent_1_lon, ent_2_lat, ent_2_lon)
            data.append((_strip_markers(parts[0]), _strip_markers(parts[1]), dist))
            labels.append(label)
    return data, labels

In [59]:
def parse_file_att_val(file_path):
    """
    Parse a tab-separated pair file into "attribute: value" entity texts, distances and labels.

    Every non-blank line must hold three tab-separated fields: entity 1,
    entity 2 and the label. Coordinates are read from the raw entities with
    get_lat_long; the COL/VAL markers are removed and each attribute name is
    turned into "<name>: ".

    :param file_path: Path to the input text file.
    :return: Tuple (data, labels). data is a list of tuples
        (entity_1, entity_2, distance) where distance is the string returned
        by haversine_distance; labels is the list of label strings in the
        same order.
    """
    # Applied in exactly this order. These are plain substring replacements,
    # so attribute names that occur inside a value (e.g. "prototype ") are
    # rewritten as well.
    rewrites = (
        ("COL ", ""),
        ("[VAL] ", ""),
        ("VAL ", ""),
        ("name ", "name: "),
        ("type ", "type: "),
        ("latitude ", "latitude: "),
        ("longitude ", "longitude: "),
        ("postalCode ", "postalCode: "),
        ("address ", "address: "),
    )

    def _to_attribute_value(raw):
        for old, new in rewrites:
            raw = raw.replace(old, new)
        return raw.strip()

    data = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materializing the file with readlines().
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            parts = stripped.split("\t")
            ent_1_lat, ent_1_lon = get_lat_long(parts[0])
            ent_2_lat, ent_2_lon = get_lat_long(parts[1])
            label = parts[2]
            dist = haversine_distance(ent_1_lat, ent_1_lon, ent_2_lat, ent_2_lon)
            data.append(
                (_to_attribute_value(parts[0]), _to_attribute_value(parts[1]), dist)
            )
            labels.append(label)
    return data, labels

In [60]:
def parse_file_plm(file_path):
    """
    Parse a tab-separated pair file, keeping the entity texts exactly as serialized.

    Every non-blank line must hold three tab-separated fields: entity 1,
    entity 2 and the label. The entity strings are returned unmodified and no
    coordinates are extracted, because the distance is not part of this output.

    :param file_path: Path to the input text file.
    :return: Tuple (data, labels). data is a list of tuples
        (entity_1, entity_2); labels is the list of label strings in the
        same order.
    """
    data = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materializing the file with readlines().
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            parts = stripped.split("\t")
            label = parts[2]
            data.append((parts[0], parts[1]))
            labels.append(label)
    return data, labels

In [61]:
def format_example(row: dict):
    """
    Render one labelled place pair as a chat conversation for the Yes/No matching task.

    :param row: Mapping with the keys "e1", "e2" and "answer" (a DataFrame row
        when called through DataFrame.apply in this notebook).
    :return: The result of tokenizer.apply_chat_template with tokenize=False
        (presumably the rendered conversation text — confirm).
    """
    system_turn = {
        "role": "system",
        "content": "Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.",
    }

    first_place, second_place = row["e1"], row["e2"]
    # dedent removes the four-space margin of the template lines.
    user_text = dedent(
        f"""
    Place1: '{first_place}'
    Place2: '{second_place}'
    """
    )
    user_turn = {"role": "user", "content": user_text}
    assistant_turn = {"role": "assistant", "content": row["answer"]}

    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)
     

In [62]:
def format_example_distance(row: dict):
    """
    Render one labelled place pair plus its distance as a chat conversation (Yes/No task).

    :param row: Mapping with the keys "e1", "e2", "distance" and "answer"
        (a DataFrame row when called through DataFrame.apply in this notebook).
    :return: The result of tokenizer.apply_chat_template with tokenize=False
        (presumably the rendered conversation text — confirm).
    """
    system_turn = {
        "role": "system",
        "content": "Two place descriptions and the geographic distance between them is provided. Do the two place descriptions refer to the same real-world place? Answer with 'Yes' if they do and 'No' if they do not.",
    }

    first_place, second_place, gap = row["e1"], row["e2"], row["distance"]
    # dedent removes the four-space margin; the whitespace-only line after the
    # distance is kept as an empty line in the rendered prompt.
    user_text = dedent(
        f"""
    Place1: '{first_place}'
    Place2: '{second_place}'
    Distance: {gap}
    
    """
    )
    user_turn = {"role": "user", "content": user_text}
    assistant_turn = {"role": "assistant", "content": row["answer"]}

    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [63]:
def format_example_gtminer(row: dict):
    """
    Render one labelled place pair as a chat conversation for the four-way relation task.

    The system turn explains the labels same_as / part_of / serves / unknown
    and the user turn repeats the allowed answers.

    :param row: Mapping with the keys "e1", "e2" and "answer" (a DataFrame row
        when called through DataFrame.apply in this notebook).
    :return: The result of tokenizer.apply_chat_template with tokenize=False
        (presumably the rendered conversation text — confirm).
    """
    system_turn = {
        "role": "system",
        "content": "Two place descriptions are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.",
    }

    first_place, second_place = row["e1"], row["e2"]
    # dedent removes the four-space margin of the template lines.
    user_text = dedent(
        f"""
    Place 1: '{first_place}'
    Place 2: '{second_place}'
    Answer only with: same_as, part_of, serves, unknown
    """
    )
    user_turn = {"role": "user", "content": user_text}
    assistant_turn = {"role": "assistant", "content": row["answer"]}

    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [64]:
def format_example_gtminer_distance(row: dict):
    """
    Render one labelled place pair plus its distance as a chat conversation (four-way relation task).

    The system turn explains the labels same_as / part_of / serves / unknown
    and the user turn repeats the allowed answers.

    :param row: Mapping with the keys "e1", "e2", "distance" and "answer"
        (a DataFrame row when called through DataFrame.apply in this notebook).
    :return: The result of tokenizer.apply_chat_template with tokenize=False
        (presumably the rendered conversation text — confirm).
    """
    system_turn = {
        "role": "system",
        "content": "Two place descriptions and the geographic distance between them are provided. Answer with 'same_as' if the first place is the same as the second place. Answer with 'part_of' if the first place is a part of the second place and is located inside the second place. Answer with 'serves' if the first place provides a service to the second place in terms of human mobility, assistance, etc. Answer with 'unknown' if the two places show none of these relations.",
    }

    first_place, second_place, gap = row["e1"], row["e2"], row["distance"]
    # dedent removes the four-space margin of the template lines.
    user_text = dedent(
        f"""
    Place1: '{first_place}'
    Place2: '{second_place}'
    Distance: {gap}
    Answer only with: same_as, part_of, serves, unknown
    """
    )
    user_turn = {"role": "user", "content": user_text}
    assistant_turn = {"role": "assistant", "content": row["answer"]}

    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [65]:
def format_example_gtminer_simple(row: dict):
    """
    Render one labelled place pair as a chat conversation with the short relation-task instruction.

    Unlike the longer relation prompt, the system turn only lists the four
    allowed answers without explaining them.

    :param row: Mapping with the keys "e1", "e2" and "answer" (a DataFrame row
        when called through DataFrame.apply in this notebook).
    :return: The result of tokenizer.apply_chat_template with tokenize=False
        (presumably the rendered conversation text — confirm).
    """
    system_turn = {
        "role": "system",
        "content": "Two place descriptions are provided. Predict the relation between them. Answer only with ‘same_as’, ‘part_of’, ‘serves’ or ‘unknown’.",
    }

    first_place, second_place = row["e1"], row["e2"]
    # dedent removes the four-space margin of the template lines.
    user_text = dedent(
        f"""
    Place1: '{first_place}'
    Place2: '{second_place}'
    """
    )
    user_turn = {"role": "user", "content": user_text}
    assistant_turn = {"role": "assistant", "content": row["answer"]}

    conversation = [system_turn, user_turn, assistant_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [66]:
# Input folders of the binary matching benchmarks, one per (benchmark, region).
# Each entry keeps the Windows-style "data\<benchmark>\<region>\" form with a
# trailing separator, because the generation loop splits these strings on "\".
dataset_folder_paths = [
    "data\\" + benchmark + "\\" + region + "\\"
    for benchmark, regions in (
        ("NZER", ("auck", "hope", "norse", "north", "palm")),
        ("GEOD_OSM_FSQ", ("edi", "pit", "sin", "tor")),
        ("GEOD_OSM_YELP", ("edi", "pit", "sin", "tor")),
        ("SGN", ("swiss",)),
    )
    for region in regions
]

In [67]:
# Prompt styles to generate. The generation loops pick the parser and the chat
# formatter per style; each style is written to its own "<benchmark>_<style>" folder.
prompts = ['simple', 'attribute_val', 'plm', 'attribute_value_dist']

In [None]:
def _build_binary_frame(parse_fn, format_fn, file_path, with_distance):
    """
    Load one split file and turn it into a DataFrame for the Yes/No matching task.

    :param parse_fn: One of the parse_file* functions; must return (data, labels).
    :param format_fn: Row formatter producing the "text" column.
    :param file_path: Path of the split file to read.
    :param with_distance: When True, the third element of every data tuple is
        stored in a "distance" column.
    :return: DataFrame with the columns e1, e2, [distance,] answer, text.
    """
    pairs, labels = parse_fn(file_path)
    records = []
    for pair, label in zip(pairs, labels):
        record = {"e1": pair[0], "e2": pair[1]}
        if with_distance:
            record["distance"] = pair[2]
        # Label "1" marks a match; every other label is treated as a non-match.
        record["answer"] = "Yes" if label == "1" else "No"
        records.append(record)
    frame = pd.DataFrame(records)
    frame["text"] = frame.apply(format_fn, axis=1)
    return frame


# Prompt style -> (parser, chat formatter, include the distance column).
_BINARY_PROMPT_SETUP = {
    "simple": (parse_file, format_example, False),
    "attribute_val": (parse_file_att_val, format_example, False),
    "plm": (parse_file_plm, format_example, False),
    "attribute_value_dist": (parse_file_att_val, format_example_distance, True),
}

for prompt in prompts:
    # An unknown prompt style fails here with KeyError instead of silently
    # re-writing the frames left over from the previous iteration.
    parse_fn, format_fn, with_distance = _BINARY_PROMPT_SETUP[prompt]

    for dataset_folder_path in dataset_folder_paths:
        # The folder list uses Windows separators; normalise them so the same
        # notebook also runs on POSIX systems. normpath drops the trailing separator.
        dataset_folder = os.path.normpath(dataset_folder_path.replace("\\", os.sep))
        benchmark_name, region_name = dataset_folder.split(os.sep)[-2:]

        train_df, valid_df, test_df = (
            _build_binary_frame(
                parse_fn,
                format_fn,
                os.path.join(dataset_folder, split_name + ".txt"),
                with_distance,
            )
            for split_name in ("train", "valid", "test")
        )

        # All three splits share one output folder: datasets/<benchmark>_<prompt>/<region>/
        output_folder = os.path.join("datasets", benchmark_name + "_" + prompt, region_name)
        os.makedirs(output_folder, exist_ok=True)

        # orient="records" with lines=True writes one JSON object per line.
        train_df.to_json(os.path.join(output_folder, "train.json"), orient="records", lines=True)
        valid_df.to_json(os.path.join(output_folder, "valid.json"), orient="records", lines=True)
        test_df.to_json(os.path.join(output_folder, "test.json"), orient="records", lines=True)

In [43]:
# Input folders of the GTMD relation benchmark, one per region. The entries
# keep the Windows-style form with a trailing separator, because the
# generation loop splits these strings on "\".
file_path_gt = [
    "data\\GTMD\\" + region + "\\"
    for region in ("mel", "sea", "sin", "tor")
]

In [45]:
# Numeric label in the GTMD files -> relation name used as the assistant answer.
_RELATION_BY_LABEL = {
    "1": "same_as",
    "2": "part_of",
    "3": "serves",
    "0": "unknown",
}


def _build_relation_frame(parse_fn, format_fn, file_path, with_distance):
    """
    Load one split file and turn it into a DataFrame for the four-way relation task.

    :param parse_fn: One of the parse_file* functions; must return (data, labels).
    :param format_fn: Row formatter producing the "text" column.
    :param file_path: Path of the split file to read.
    :param with_distance: When True, the third element of every data tuple is
        stored in a "distance" column.
    :return: DataFrame with the columns e1, e2, [distance,] answer, text.
    :raises ValueError: If a label is not one of the keys of _RELATION_BY_LABEL.
    """
    pairs, labels = parse_fn(file_path)
    records = []
    for pair, label in zip(pairs, labels):
        if label not in _RELATION_BY_LABEL:
            # Fail loudly rather than writing a placeholder answer into the dataset.
            raise ValueError(f"{file_path}: unexpected relation label {label!r}")
        record = {"e1": pair[0], "e2": pair[1]}
        if with_distance:
            record["distance"] = pair[2]
        record["answer"] = _RELATION_BY_LABEL[label]
        records.append(record)
    frame = pd.DataFrame(records)
    frame["text"] = frame.apply(format_fn, axis=1)
    return frame


# Prompt style -> (parser, chat formatter, include the distance column).
_RELATION_PROMPT_SETUP = {
    "simple": (parse_file, format_example_gtminer_simple, False),
    "attribute_val": (parse_file_att_val, format_example_gtminer, False),
    "plm": (parse_file_plm, format_example_gtminer, False),
    "attribute_value_dist": (parse_file_att_val, format_example_gtminer_distance, True),
}

for prompt in prompts:
    # An unknown prompt style fails here with KeyError instead of silently
    # re-writing frames built for another style.
    parse_fn, format_fn, with_distance = _RELATION_PROMPT_SETUP[prompt]

    for dataset_folder_path in file_path_gt:
        # The folder list uses Windows separators; normalise them so the same
        # notebook also runs on POSIX systems. normpath drops the trailing separator.
        dataset_folder = os.path.normpath(dataset_folder_path.replace("\\", os.sep))
        benchmark_name, region_name = dataset_folder.split(os.sep)[-2:]

        # Each split is parsed exactly once, with the parser of the current style.
        train_df, valid_df, test_df = (
            _build_relation_frame(
                parse_fn,
                format_fn,
                os.path.join(dataset_folder, split_name + ".txt"),
                with_distance,
            )
            for split_name in ("train", "valid", "test")
        )

        # All three splits share one output folder: datasets/<benchmark>_<prompt>/<region>/
        output_folder = os.path.join("datasets", benchmark_name + "_" + prompt, region_name)
        os.makedirs(output_folder, exist_ok=True)

        # orient="records" with lines=True writes one JSON object per line.
        train_df.to_json(os.path.join(output_folder, "train.json"), orient="records", lines=True)
        valid_df.to_json(os.path.join(output_folder, "valid.json"), orient="records", lines=True)
        test_df.to_json(os.path.join(output_folder, "test.json"), orient="records", lines=True)