In [1]:
import json
import torch
from typing import List
import transformers
import logging
import pandas as pd
from tqdm import tqdm
import random
import csv

import warnings
warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Global variable to store the pipeline
global_pipeline = None

def load_llm():
    global global_pipeline
    model_id = "/data/models/LLaMa3/Meta-Llama-3-70B-Instruct-hf/"
    # model_id = "meta-llama/Llama-3.3-70B-Instruct"
    # model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    # model_id = "Qwen/Qwen2.5-72B-Instruct"
    # model_id = "/data/models/LLaMa3/Meta-Llama-3.1-8B-Instruct-hf/"
    global_pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    logger.info("Model loaded successfully!")

In [3]:
load_llm()

2025-03-01 15:41:45.689905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740861705.703432  994068 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740861705.707520  994068 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 15:41:45.723426: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0
INFO:__main__:Model loaded successfully!


In [4]:
def load_knowledge_base(file_path: str) -> str:
    with open(file_path, 'r') as file:
        return file.read()

def generate_queries(question: str, kb_file: str, properties_file: str) -> List[str]:
    # Load knowledge base and properties
    kb_content = load_knowledge_base(kb_file)
    properties_content = load_knowledge_base(properties_file)

    prompt = f"""
        You are an AI assistant tasked with generating queries for a first-order logic knowledge base using the AIMA Python library. Your task is to create a FOL query that can be used to answer the provided question.
    
    Question: {question}
    
    To understand how to translate and what are the available predicates, you can look into the following FOL predicates and statements.

    Knowledge Base Rules:
    {kb_content}

    Example of FOL related to object properties:
    {properties_content}

    Some more examples and explanations:
    TypeOf(x, Car)
    ColorOf(x, Red)

    Object properties take the type of vehicles and color. These can be retrieved from the question. But the color and type need
    to be capitalized, and any space in it should be removed. For example, a Police car should be PoliceCar. For PoliceCar you do not need to add ColorOf the police car in the FOL. But for other vehicles include ColorOf.

    Based on the knowledge base and object properties, generate a FOL query that would be necessary to answer the question. The queries should be in the format used by the AIMA Python library's fol_fc_ask function.

    Rules for generating queries:
    1. Each query should be a string that might be a conjunction of multiple predicates.
    2. Use 'x' as the variable name for the main object in question.
    3. The predicates should be combined to form a conjunctive query.
    4. Include predicates for type, attributes, and relevant relationships.
    5. Please do not use any predicate that is not present in the Knowledge Base Rules or Object Properties.
    6. Please use the predicates that are present in the rules. Please do not make any changes in predicates.
    7. Check the location of the object if the location is mentioned in the question.
    8. Questions that involve a single object should be responded to with a query which is a conjunction of some predicates.
    9. If the location of an object is not mentioned in the question then do not include InitialLocation or LastLocation predicates in the query.

    For example. For Location, use InitialLocation(x, position) or LastLocation(x, position) predicates. Don't use only Location(x). Then, create another list of actions between the two objects.

    Example 1:
    Question: "Is there a white car near the left?"
    Query: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearLeft)"

    10. If the question involves two objects, then first generate the FOL query for each object similar to a single object question.
    11. Use 'y' as the variable name for the second object in the question.
    12. For two objects, the response would be the conjunction of all the predicates for both objects, which contains the predicates for the first object, the predicates for the second object, and the predicates for the interaction between them (if available).
    13. Only respond with the FOL query of the input question. Do not add any other text to it
    Example 2:
    Question: "Does the white car near the left come close to a pedestrian at the front?"
    Queries: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearLeft)^Pedestrian(y)^InitialLocation(y, Front)^ComeClose(x, y)"

    Example 3:
    Question: "Can you spot a pedestrian walking near the right of the police car?"
    Queries: "Pedestrian(x)^Walk(x)^InitialLocation(x, NearRight)^TypeOf(y, PoliceCar)"

    Example 4:
    Question: "Can you spot a pedestrian walking near the right of the police car at the center?"
    Queries: "Pedestrian(x)^Walk(x)^InitialLocation(x, NearRight)^TypeOf(y, PoliceCar)^InitialLocation(x, Front)"

    Example 5:
    Question: "Is there a red car near the right?"
    Queries: "TypeOf(x, Car)^ColorOf(x, Red)^InitialLocation(x, Right)"

    Example 6:
    Question: "Is there a Black Car in the front in the initial frames that moves to the right?"
    Queries: "TypeOf(x, Car)^ColorOf(x, Black)^InitialLocation(x, Front)^Move(x)^LastLocation(x, Right)"

    Example 7:
    Question: "Can you spot a trafficsign in the front that shifts to the right in the later frames due to the camera movement?"
    Queries: "TrafficSign(x)^InitialLocation(x, Front)^LastLocation(x, Right)"

    Example 8:
    Question: "Does the White Car move from front to the left of the scene?"
    Queries: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, Front)^LastLocation(x, Left)^Move(x)"

    Example 9:
    Question: "Does the position of the White Car change from front to the left of the scene?"
    Queries: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, Front)^LastLocation(x, Left)"

    Example 10:
    Question: "Can you spot a pedestrian on the left?"
    Queries: "Pedestrian(x)^InitialLocation(x, Left)"



    In the above examples (example 3, 4), notice that both the examples mentioned about a police car. But in example 3, there was no mention of location of the police car, hence the FOL query does not include any location for the police car. But in the 4th example as there is a mention of the police car's position, we include it in the FOL query.

    Location can be any of the following: Left, NearLeft, FarLeft, Right, NearRight, FarRight, Front, NearFront, and FarFront. Please analyze the question
    Choose the proper one, and do not use any words other than the mentioned 9 for location. Do not use NearLeft, FarLeft, NearRight, FarRight, NearFront, and FarFront as the location predicate value if those are not mentioned explicitly in the question.
    Inside the location  predicates (InitialLocation, LastLocation) please don't use any other location other than these six: NearLeft, FarLeft, NearRight, FarRight, NearFront, and FarFront
    If a synonym is used in the question, try to find the closest one from the six mentioned positions.
    If you are not sure about the location/position of an object, seeing the question, don't include location predicates for that object.
    For example, the center can be replaced by NearFront.
    For location, we only have two predicates. Example: InitialLocation(x, NearLeft) and LastLocation(x, NearLeft)

    On(x, y) predicate signifies if object x is positioned on top of y. It does not signify if object x is at y side of the frmae.

    Please provide a similar list of queries for the given question. Ensure that the queries cover all aspects necessary to answer the question based on the knowledge base and object properties.
    Also, please consider that some true predicates can lead other predicates to be true. For example: 

    ((Vehicles(x) & SpeedUp(x)) ==> Accelerate(x))
    ((Vehicles(x) & SpeedDown(x)) ==> Decelerate(x))
    (((Vehicles(x) & NotAccelerate(x)) & NotDecelerate(x)) ==> ConstantSpeed(x))

    Here, you do not need to check all the predicates (SpeedUp, SpeedDown, ConstantSpeed) to understand if a vehicle is moving at a constant speed.
    You can only check ConstantSpeed(x).
    
    Your response should be a query string. For example:
    "Predicate1(x, Value)^Predicate2(x)^Predicate3(x, OtherValue)"

    Respond with only the query string, no additional text.
    """

    messages = [
        {"role": "system", "content": "You are an AI assistant specializing in generating logical queries for AIMA Python based on first-order logic knowledge bases and object properties."},
        {"role": "user", "content": prompt},
    ]

    terminators = [
        global_pipeline.tokenizer.eos_token_id,
        global_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = global_pipeline(
        messages,
        max_new_tokens=1000,
        eos_token_id=terminators,
        do_sample=True,
        # temperature=0.6,
        # top_p=0.9,
    )
    
    result = outputs[0]["generated_text"][-1]['content']
    
    return result


In [5]:
def write_to_file(file_path, question, queries):
    with open(file_path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([question, queries])

def main():
    # kb_file = "./fol_rules_kbs/out_017_181_191.txt"
    random.seed(1)
    kb_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/all_rules.txt"
    properties_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/normal_vehicle_info_frequent_0001.txt"
    out_file_train = 'question_query_kitti_llama3.csv'

    questions = pd.read_csv(
        '/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kitti_questions/all_que_fn_model_ans.csv'
    )['Questions'].fillna('')
    questions = [q for q in questions if q != '']
    # random.shuffle(questions)

    with open(out_file_train, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Question", "FOL Query"])
    
    # Process train data
    for question in tqdm(questions):
        queries = generate_queries(question, kb_file, properties_file)
        write_to_file(out_file_train, question, queries.replace('"', '').replace("'", ''))

if __name__ == "__main__":
    main()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                                                                                                                                                     | 0/101 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                                                                                                                                             | 1/101 [00:07<12:28,  7.49s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                                                                                                                                             | 2/101 [00:15<13:14,  8.02s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                              

In [2]:
df = pd.read_csv('question_query.csv')

df = df.sample(frac=1).reset_index().drop(columns=['index'])

df_train = df.iloc[:470]
df_test = df.iloc[470:].reset_index().drop(columns=['index'])


df_train.to_csv('question_query_train.csv', index=False)
df_test.to_csv('question_query_test.csv', index=False)

In [28]:
import re

def extract_bracket_content(text):
    pattern = r'\[(.*?)\]'
    match = re.search(pattern, text)
    if match:
        return match.group(1).strip('"')
    return None

In [17]:
import requests
import json

api_url = "http://127.0.0.1:8899/v1/judge"
headers = {
    "Content-Type": "application/json",
    "X-API-Key": "147728a9-29dc-4260-af88-664915c717c1"
}
data = {
    "prompt": f"Does the black SUV on the left move at a constant speed?=TypeOf(x, SUV)^ColorOf(x, Black)^ConstantSpeed(x)",
    "max_new_tokens": 512,
    "temperature": 0.7,
    "do_sample":True,
    "temperature":0.6,
    "top_p":0.9,
}

response = requests.post(api_url, headers=headers, data=json.dumps(data))
response.raise_for_status()
result = response.json()["choices"][0]["text"].strip()

result

'ROUGE-L F1: 1.00'

In [58]:
from rouge import Rouge
rouge = Rouge()

scores = rouge.get_scores(
    "TypeOf(x, SUV)^ColorOf(x, Black)^ConstantSpeed(x)^TypeOf(y, Car)^ColorOf(y, Blue)^InitialLocation(y, Right)", 
    "TypeOf(x, SUV)^ColorOf(x, Black)^ConstantSpeed(x)"
)
scores[0]

{'rouge-1': {'r': 0.6666666666666666,
  'p': 0.3333333333333333,
  'f': 0.44444444000000005},
 'rouge-2': {'r': 0.5, 'p': 0.2, 'f': 0.2857142816326531},
 'rouge-l': {'r': 0.6666666666666666,
  'p': 0.3333333333333333,
  'f': 0.44444444000000005}}

In [8]:
import numpy as np

# Create a sample 2D array
arr = np.array([
    [1, 5, 3],
    [4, 2, 6],
    [7, 8, 9],
    [10, 11, 12]
])

# Get the maximum of each row
row_maxima = np.max(arr, axis=0)

print("Original array:")
print(arr)
print("\nMaximum of each row:")
print(row_maxima, row_maxima.sum(), row_maxima.mean())
arr.shape, arr[1, 2]

Original array:
[[ 1  5  3]
 [ 4  2  6]
 [ 7  8  9]
 [10 11 12]]

Maximum of each row:
[10 11 12] 33 11.0


((4, 3), 6)