In [12]:
import json
import torch
from typing import List
import transformers
import logging
import pandas as pd
from tqdm import tqdm
import random
import csv
import os
import time

import warnings
warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Module-level handle to the text-generation pipeline; remains None until load_llm() runs.
global_pipeline = None

def load_llm():
    """Create the text-generation pipeline and store it in ``global_pipeline``.

    The checkpoint is hard-coded below. The pipeline is built with
    ``torch_dtype=torch.bfloat16`` and ``device_map="auto"``; a log line is
    emitted once construction returns. Nothing is returned: callers read the
    module-level ``global_pipeline`` instead.
    """
    global global_pipeline

    # Other checkpoints that have been tried (swap one in as `checkpoint` if needed):
    #   "/data/models/LLaMa3/Meta-Llama-3-70B-Instruct-hf/"
    #   "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    #   "Qwen/Qwen2.5-72B-Instruct"
    #   "/data/models/LLaMa3/Meta-Llama-3.1-8B-Instruct-hf/"
    checkpoint = "meta-llama/Llama-3.3-70B-Instruct"

    pipeline_options = {
        "model": checkpoint,
        "model_kwargs": {"torch_dtype": torch.bfloat16},
        "device_map": "auto",
    }
    global_pipeline = transformers.pipeline("text-generation", **pipeline_options)
    logger.info("Model loaded successfully!")

In [3]:
def load_knowledge_base(file_path: str) -> str:
    """Return the full contents of a text file as a single string.

    Despite the name, this is used for every text resource that gets spliced
    into the prompt (rules, object properties, and translation examples).

    Args:
        file_path: Path of the file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # locale of whichever machine runs the notebook.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def generate_queries(question, kb_file, properties_file, example_file):
    """Ask the loaded LLM to translate a natural-language question into a FOL query string.

    The three files are read on every call and embedded verbatim in the prompt,
    together with fixed instructions and worked examples. The prompt is sent as
    a system + user chat to ``global_pipeline`` with sampling enabled, so
    repeated calls for the same question may return different text.

    Args:
        question: The natural-language question to translate.
        kb_file: Path to the knowledge-base rules text.
        properties_file: Path to the object-properties FOL examples text.
        example_file: Path to additional question/query example text.

    Returns:
        The ``content`` of the last message in the pipeline's generated chat,
        returned as-is (no stripping or quote removal is done here).

    Raises:
        RuntimeError: If ``load_llm()`` has not populated ``global_pipeline`` yet.
        OSError: If any of the three files cannot be read.
    """
    # Fail with an actionable message instead of "'NoneType' has no attribute 'tokenizer'".
    if global_pipeline is None:
        raise RuntimeError("LLM pipeline is not loaded; call load_llm() before generate_queries().")

    # Load knowledge base and properties
    kb_content = load_knowledge_base(kb_file)
    properties_content = load_knowledge_base(properties_file)
    example_content = load_knowledge_base(example_file)

    # NOTE(review): the location guidance in the prompt first lists nine allowed
    # values and later restricts the location predicates to six of them; that
    # wording is left untouched here -- confirm the intended vocabulary.
    prompt = f"""
        You are an AI assistant tasked with generating queries for a first-order logic knowledge base using the AIMA Python library. Your task is to create a FOL query that can be used to answer the provided question.
    
    Question: {question}
    
    To understand how to translate and what are the available predicates, you can look into the following FOL predicates and statements.

    Knowledge Base Rules:
    {kb_content}

    Example of FOL related to object properties:
    {properties_content}

    Some more examples and explanations:
    TypeOf(x, Car)
    ColorOf(x, Red)

    TypeOf vehicles could be Car, Bus, Van, MiniCooper, MiniVan, Truck, Hatchback, Motorcycle, Bicycle, SUV, Jeep etc.
    ColorOf vehicles could be any color that is associated with a vehicle.

    Object properties take the type of vehicles and color. These can be retrieved from the question. But the color and type need
    to be capitalized, and any space in it should be removed. For example, a Police car should be PoliceCar. For PoliceCar you do not need to add ColorOf the police car in the FOL. But for other vehicles include ColorOf.

    Based on the knowledge base and object properties, generate a FOL query that would be necessary to answer the question. The queries should be in the format used by the AIMA Python library's fol_fc_ask function.

    Rules for generating queries:
    1. Each query should be a string that might be a conjunction of multiple predicates.
    2. Use 'x' as the variable name for the main object in question.
    3. The predicates should be combined to form a conjunctive query.
    4. Include predicates for type, attributes, and relevant relationships.
    5. Please do not use any predicate that is not present in the Knowledge Base Rules or Object Properties.
    6. Please use the predicates that are present in the rules. Please do not make any changes in predicates.
    7. Check the location of the object if the location is mentioned in the question.
    8. Questions that involve a single object should be responded to with a query which is a conjunction of some predicates.
    9. If the location of an object is not mentioned in the question then do not include InitialLocation or LastLocation predicates in the query.

    For example. For Location, use InitialLocation(x, position) or LastLocation(x, position) predicates. Don't use only Location(x). Then, create another list of actions between the two objects.

    Example 1:
    Question: "Is there a white car near the left?"
    Query: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearLeft)"

    10. If the question involves two objects, then first generate the FOL query for each object similar to a single object question.
    11. Use 'y' as the variable name for the second object in the question.
    12. For two objects, the response would be the conjunction of all the predicates for both objects, which contains the predicates for the first object, the predicates for the second object, and the predicates for the interaction between them (if available).
    13. Only respond with the FOL query of the input question. Do not add any other text to it
    Example 2:
    Question: "Does the white car near the left come close to a pedestrian at the front?"
    Queries: "TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearLeft)^Pedestrian(y)^InitialLocation(y, Front)^ComeClose(x, y)"

    Example 3:
    Question: "Can you spot a pedestrian walking near the right of the police car?"
    Queries: "Pedestrian(x)^Walk(x)^InitialLocation(x, NearRight)^TypeOf(y, PoliceCar)"

    Example 4:
    Question: "Can you spot a pedestrian walking near the right of the police car at the center?"
    Queries: "Pedestrian(x)^Walk(x)^InitialLocation(x, NearRight)^TypeOf(y, PoliceCar)^InitialLocation(y, Front)"

    In the above examples (example 3, 4), notice that both the examples mentioned about a police car. But in example 3, there was no mention of location of the police car, hence the FOL query does not include any location for the police car. But in the 4th example as there is a mention of the police car's position, we include it in the FOL query.

    Location can be any of the following: Left, NearLeft, FarLeft, Right, NearRight, FarRight, Front, NearFront, and FarFront. Please analyze the question
    Choose the proper one, and do not use any words other than the mentioned 9 for location. Do not use NearLeft, FarLeft, NearRight, FarRight, NearFront, and FarFront as the location predicate value if those are not mentioned explicitly in the question.
    Inside the location  predicates (InitialLocation, LastLocation) please don't use any other location other than these six: NearLeft, FarLeft, NearRight, FarRight, NearFront, and FarFront
    If a synonym is used in the question, try to find the closest one from the six mentioned positions.
    If you are not sure about the location/position of an object, seeing the question, don't include location predicates for that object.
    For example, the center can be replaced by NearFront.
    For location, we only have two predicates. Example: InitialLocation(x, NearLeft) and LastLocation(x, NearLeft)

    On(x, y) predicate signifies if object x is positioned on top of y. It does not signify if object x is at y side of the frmae.

    Here are more examples showing questions and their corresponding FOL Queries:
    {example_content}

    Please provide a similar list of queries for the given question. Ensure that the queries cover all aspects necessary to answer the question based on the knowledge base and object properties.
    Also, please consider that some true predicates can lead other predicates to be true. For example: 

    ((Vehicles(x) & SpeedUp(x)) ==> Accelerate(x))
    ((Vehicles(x) & SpeedDown(x)) ==> Decelerate(x))
    (((Vehicles(x) & NotAccelerate(x)) & NotDecelerate(x)) ==> ConstantSpeed(x))

    Here, you do not need to check all the predicates (SpeedUp, SpeedDown, ConstantSpeed) to understand if a vehicle is moving at a constant speed.
    You can only check ConstantSpeed(x).
    
    Your response should be a query string. For example:
    "Predicate1(x, Value)^Predicate2(x)^Predicate3(x, OtherValue)"

    Respond with only the query string, no additional text.
    """

    messages = [
        {"role": "system", "content": "You are an AI assistant specializing in generating logical queries for AIMA Python based on first-order logic knowledge bases and object properties."},
        {"role": "user", "content": prompt},
    ]

    tokenizer = global_pipeline.tokenizer
    candidate_stop_ids = (
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    )
    # A tokenizer that does not define "<|eot_id|>" can yield None here; a None
    # entry is not a usable stop id, so keep only the ids that resolved.
    terminators = [token_id for token_id in candidate_stop_ids if token_id is not None]

    outputs = global_pipeline(
        messages,
        max_new_tokens=1000,
        eos_token_id=terminators,
        do_sample=True,
        # temperature=0.6,
        # top_p=0.9,
    )

    # With chat-style input, "generated_text" is read as the message list; the
    # last entry is taken as the model's reply.
    result = outputs[0]["generated_text"][-1]['content']

    return result


In [4]:
# Populate global_pipeline once; generate_queries() reads it on every call.
load_llm()

2025-03-04 20:33:54.970274: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741138434.983484 1872476 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741138434.987503 1872476 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 20:33:55.002932: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Device set to use cuda:0
INFO:__main__:Model loaded successfully!


In [13]:
# def write_to_file(file_path, question, queries):
#     with open(file_path, 'a', newline='') as f:
#         writer = csv.writer(f)
#         writer.writerow([question, queries])

# def main():
#     # kb_file = "./fol_rules_kbs/out_017_181_191.txt"
#     random.seed(1)
#     kb_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/all_rules.txt"
#     properties_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/normal_vehicle_info_frequent_0001.txt"
#     out_file_train = 'question_query_carla_llama3.csv'

#     questions = pd.read_csv(
#         '/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/questions/all_seq_qt_2_final.csv'
#     )['Questions'].fillna('')
#     questions = [q for q in questions if q != '']
#     # random.shuffle(questions)

#     with open(out_file_train, 'w', newline='') as f:
#         writer = csv.writer(f)
#         writer.writerow(["Question", "FOL Query"])
    
#     # Process train data
#     for question in tqdm(questions):
#         queries = generate_queries(question, kb_file, properties_file)
#         write_to_file(out_file_train, question, queries.replace('"', '').replace("'", ''))

# if __name__ == "__main__":
#     main()

def write_to_file(file_path, question, queries):
    """Append a single ``[question, queries]`` row to a CSV file.

    Args:
        file_path: CSV file to append to; it is created if it does not exist.
        question: Value for the first column.
        queries: Value for the second column.

    Raises:
        OSError: If the file cannot be opened for appending.
    """
    # newline='' leaves line endings to the csv module. UTF-8 is set explicitly
    # so the rows decode under the pandas.read_csv default that main() relies on
    # when it re-reads this file, whatever the host locale is.
    with open(file_path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([question, queries])

def main():
    """Translate every pending question to a FOL query and append it to the output CSV.

    Steps:
      1. Read the ``Questions`` column of the input CSV and drop blank entries.
      2. If the output CSV already exists, collect its ``Question`` column so
         those questions are skipped (this makes the run resumable).
      3. Write the ``Question, FOL Query`` header when the output file is
         missing or was found empty.
      4. For each remaining question, call ``generate_queries`` and append the
         result with single and double quotes removed.

    A failure on one question is logged and the loop moves on; since nothing is
    written for it, that question is picked up again on the next run.
    """
    kb_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/all_rules.txt"
    properties_file = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/rules/normal_vehicle_info_frequent_0001.txt"
    out_file_train = '/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kb_inference/translated_queries/question_query_kitti_llama33.csv'
    example_file = '/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kb_inference/examples/example_translation_1.txt'

    questions = pd.read_csv('/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kitti_questions/all_que_fn.csv')['Questions'].fillna('')
    questions = [q for q in questions if q != '']

    # Check if output file exists and load already processed questions
    processed_questions = set()
    needs_header = not os.path.exists(out_file_train)
    if not needs_header:
        try:
            existing_data = pd.read_csv(out_file_train)
            processed_questions = set(existing_data['Question'].tolist())
            logger.info(f"Found {len(processed_questions)} already processed questions")
        except pd.errors.EmptyDataError:
            logger.info("Existing file is empty, processing all questions")
            # An empty file has no header row. Without one, the next run would
            # parse the first data row as the header, fail the 'Question'
            # lookup, and reprocess (and re-append) everything.
            needs_header = True
        except Exception as e:
            # The file exists but could not be interpreted; it is left in place
            # and new rows are appended after its current content.
            logger.error(f"Error reading existing file: {e}")
            logger.info("Reprocessing all questions and appending to the existing output file")
            processed_questions = set()

    # Filter out already processed questions
    remaining_questions = [q for q in questions if q not in processed_questions]
    logger.info(f"Processing {len(remaining_questions)} new questions")

    # Start the file with a header row when it is missing or empty
    if needs_header:
        with open(out_file_train, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(["Question", "FOL Query"])

    # Process remaining questions
    for question in tqdm(remaining_questions):
        try:
            queries = generate_queries(question, kb_file, properties_file, example_file)
            write_to_file(out_file_train, question, queries.replace('"', '').replace("'", ''))
        except Exception as e:
            # Best effort: log and move on so one bad question does not stop the batch.
            logger.error(f"Error processing question '{question}': {e}")

# Entry-point guard: run the batch translation when this code executes as __main__.
if __name__ == "__main__":
    main()

INFO:__main__:Found 96 already processed questions
INFO:__main__:Processing 2 new questions
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:34<00:00, 17.41s/it]
