In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, TaskType, PeftModel
from trl import SFTTrainer
import torch
from sklearn.model_selection import train_test_split
import random

In [2]:
def load_csv_dataset(file_path, train_size=0.95, random_state=42):
    df = pd.read_csv(file_path)
    
    df['text'] = df.apply(lambda row: f"Translate the following natural language question to First Order Logic (FOL). Please respond with only the FOL statement. Don't include additional text.\nQuestion: {row['Question']}\nFOL Query: {row['FOL Query']}", axis=1)
    
    train_df, val_df = train_test_split(df, train_size=train_size, random_state=random_state)
    
    return {
        "train": Dataset.from_pandas(train_df),
        "validation": Dataset.from_pandas(val_df)
    }

In [3]:
def test_model(model, tokenizer, test_questions):
    model.eval()
    results = []
    
    for question, true_fol in test_questions:
        input_text = f"Translate the following natural language question to First Order Logic (FOL). Please respond with only the FOL statement. Don't include additional text. \nQuestion: {question}\nFOL Query:"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
        
        with torch.no_grad():
            output = model.generate(input_ids, max_new_tokens=100, num_return_sequences=1, temperature=0.7)
        
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        fol_query = generated_text.split("FOL Query:")[-1].strip()
        results.append({"question": question, "input": input_text, "generated_fol": fol_query, "true_fol": true_fol})
    
    return results

In [4]:
dataset = load_csv_dataset("question_query_train.csv")
print("Dataset loaded:", dataset)

Dataset loaded: {'train': Dataset({
    features: ['Question', 'FOL Query', 'text', '__index_level_0__'],
    num_rows: 408
}), 'validation': Dataset({
    features: ['Question', 'FOL Query', 'text', '__index_level_0__'],
    num_rows: 22
})}


## Phi-3

In [5]:
# model_name = "microsoft/Phi-3-mini-4k-instruct"
# model_id = "microsoft/Phi-3-mini-4k-instruct"
# device_map = 'auto'

# if torch.cuda.is_bf16_supported():
#   compute_dtype = torch.bfloat16
#   attn_implementation = 'flash_attention_2'
# # If bfloat16 is not supported, 'compute_dtype' is set to 'torch.float16' and 'attn_implementation' is set to 'sdpa'.
# else:
#   compute_dtype = torch.float16
#   attn_implementation = 'sdpa'

# # compute_dtype = torch.float16
# # attn_implementation = 'sdpa'

# # Model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained(
#     model_id, 
#     # trust_remote_code=True, 
#     # add_eos_token=True, 
#     # use_fast=True
# )

# # The padding token is set to the unknown token.
# # tokenizer.pad_token = tokenizer.unk_token

# # The ID of the padding token is set to the ID of the unknown token.
# # tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# # The padding side is set to 'left', meaning that padding tokens will be added to the left (start) of the sequence.
# # tokenizer.padding_side = 'left'

# # 'AutoModelForCausalLM.from_pretrained' is a method that loads a pre-trained model for causal language modeling from the Hugging Face Model Hub.
# # 'model_id' is passed as an argument to specify which model to load.
# # 'torch_dtype' is set to the compute data type determined earlier.
# # 'trust_remote_code' is set to True to trust the remote code in the model files.
# # 'device_map' is passed as an argument to specify the device mapping for distributed training.
# # 'attn_implementation' is set to the attention implementation determined earlier.
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, trust_remote_code=True, 
#     device_map=device_map,
#     torch_dtype=compute_dtype,
#     attn_implementation=attn_implementation
# )



In [6]:
# lora_r = 16
# lora_alpha = 16
# lora_dropout = 0.05
# target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]

# peft_config = LoraConfig(
#     r=lora_r,
#     lora_alpha=lora_alpha,
#     lora_dropout=lora_dropout,
#     task_type=TaskType.CAUSAL_LM,
#     target_modules=target_modules,
# )

# # Training arguments
# args = TrainingArguments(
#     output_dir="./phi-3-mini-LoRA-nl-to-fol",
#     evaluation_strategy="steps",
#     do_eval=True,
#     optim="adamw_torch",
#     per_device_train_batch_size=8,
#     gradient_accumulation_steps=4,
#     per_device_eval_batch_size=8,
#     log_level="debug",
#     save_strategy="epoch",
#     logging_steps=100,
#     learning_rate=1e-4,
#     fp16=not torch.cuda.is_bf16_supported(),
#     bf16=torch.cuda.is_bf16_supported(),
#     eval_steps=100,
#     num_train_epochs=17,
#     warmup_ratio=0.1,
#     lr_scheduler_type="linear",
#     # report_to="wandb",
#     seed=42,
# )

# # Initialize SFTTrainer
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset['train'],
#     eval_dataset=dataset['validation'],
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=512,
#     tokenizer=tokenizer,
#     args=args,
# )

In [7]:
# # Save the fine-tuned model
# trainer.save_model("./phi-3-mini-LoRA-nl-to-fol-final")

# # Load the fine-tuned model
# fine_tuned_model = PeftModel.from_pretrained(model, "./phi-3-mini-LoRA-nl-to-fol-final")

# # Test model after fine-tuning
# print("Testing model after fine-tuning:")
# after_results = test_model(fine_tuned_model, tokenizer, test_questions)
# for result in after_results:
#     print(f"Question: {result['question']}")
#     print(f"Generated FOL: {result['generated_fol']}")
#     print(f"True FOL: {result['true_fol']}\n")



In [8]:
# # Save the entire model (base + LoRA) to a single file
# fine_tuned_model = fine_tuned_model.merge_and_unload()
# fine_tuned_model.save_pretrained("./phi-3-mini-nl-to-fol-merged")
# tokenizer.save_pretrained("./phi-3-mini-nl-to-fol-merged")
# print("Merged model saved to: ./phi-3-mini-nl-to-fol-merged")

## GPT 2

In [9]:
model_name = "gpt2"
model_id = "gpt2"
device_map = 'auto'


tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
)

model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, 
    device_map=device_map,
)

tokenizer.pad_token = tokenizer.eos_token




In [10]:
test_questions = random.sample(list(zip(dataset['validation']['Question'], dataset['validation']['FOL Query'])), 1)

# Test model before fine-tuning
print("Testing model before fine-tuning:")
before_results = test_model(model, tokenizer, test_questions)
for result in before_results:
    print(f"Question: {result['question']}")
    print(f"Generated FOL: {result['generated_fol']}")
    print(f"True FOL: {result['true_fol']}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Testing model before fine-tuning:
Question: Can you spot a Mini Cooper on the left?
Generated FOL: Question: Can you spot a Mini Cooper on the left?
Question: Can you spot a Mini Cooper on the right?
Question: Can you spot a Mini Cooper on the left?
Question: Can you spot a Mini Cooper on the right?
Question: Can you spot a Mini Cooper on the left?
Question: Can you spot a Mini Cooper on the right?
Question: Can
True FOL: TypeOf(x, MiniCooper)^InitialLocation(x, NearLeft)



In [11]:
dataset

{'train': Dataset({
     features: ['Question', 'FOL Query', 'text', '__index_level_0__'],
     num_rows: 408
 }),
 'validation': Dataset({
     features: ['Question', 'FOL Query', 'text', '__index_level_0__'],
     num_rows: 22
 })}

In [12]:
lora_r = 8  # Reduced from 16 to 8 for GPT-2
lora_alpha = 16
lora_dropout = 0.05
target_modules = ["c_attn", "c_proj", "c_fc"]  # GPT-2 specific target modules

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    bias="none",  # GPT-2 doesn't use bias in attention layers
)

# Training arguments
args = TrainingArguments(
    output_dir="./gpt2-LoRA-nl-to-fol-2",
    evaluation_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=4,  # Reduced from 8 to 4 due to GPT-2's larger size
    gradient_accumulation_steps=8,  # Increased from 4 to 8 to compensate for smaller batch size
    per_device_eval_batch_size=4,  # Reduced from 8 to 4
    log_level="info",  # Changed from "debug" to "info" for less verbose output
    save_strategy="epoch",
    logging_steps=100,  # Reduced from 100 to 50 for more frequent logging
    learning_rate=5e-5,  # Reduced from 1e-4 to 5e-5 for GPT-2
    fp16=True,  # GPT-2 works well with fp16
    eval_steps=100,  # Reduced from 100 to 50 for more frequent evaluation
    num_train_epochs=200,  # Reduced from 17 to 3 as GPT-2 might converge faster
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",  # Changed from "linear" to "cosine" for potentially better performance
    # report_to="wandb",  # Commented out as per your original code
    seed=42,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=256,  # Reduced from 512 to 256 for GPT-2
    tokenizer=tokenizer,
    args=args,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


In [13]:
# Start training
trainer.train()

***** Running training *****
  Num examples = 408
  Num Epochs = 200
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 2,400
  Number of trainable parameters = 1,179,648
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mimrankabir1996[0m ([33mimrankabir1996-penn-state[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,4.2069,3.650223
200,2.7214,1.064592
300,0.7117,0.438406
400,0.4442,0.344376
500,0.3698,0.296572
600,0.3307,0.277749
700,0.308,0.26256
800,0.2877,0.25604
900,0.2746,0.253634
1000,0.2633,0.245728


Saving model checkpoint to ./gpt2-LoRA-nl-to-fol-2/checkpoint-12
loading configuration file config.json from cache at /home/ibk5106/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "m

TrainOutput(global_step=2400, training_loss=0.544321772257487, metrics={'train_runtime': 1022.1581, 'train_samples_per_second': 79.831, 'train_steps_per_second': 2.348, 'total_flos': 3701730039607296.0, 'train_loss': 0.544321772257487, 'epoch': 188.23529411764707})

In [14]:
# Save the fine-tuned model
trainer.save_model("./gpt-2-LoRA-nl-to-fol-final-2")

# Load the fine-tuned model
fine_tuned_model = PeftModel.from_pretrained(model, "./gpt-2-LoRA-nl-to-fol-final-2")

# Test model after fine-tuning
print("Testing model after fine-tuning:")
after_results = test_model(fine_tuned_model, tokenizer, test_questions)
for result in after_results:
    print(f"Question: {result['question']}")
    print(f"Input: {result['input']}")
    print(f"Generated FOL: {result['generated_fol']}")
    print(f"True FOL: {result['true_fol']}\n")



Saving model checkpoint to ./gpt-2-LoRA-nl-to-fol-final-2
loading configuration file config.json from cache at /home/ibk5106/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_leng

Testing model after fine-tuning:
Question: Can you spot a Mini Cooper on the left?
Input: Translate the following natural language question to First Order Logic (FOL). Please respond with only the FOL statement. Don't include additional text. 
Question: Can you spot a Mini Cooper on the left?
FOL Query:
Generated FOL: TypeOf(x, Car)^InitialLocation(x, Left)^TypeOf(y, Truck)^InitialLocation(y, Right)^InitialLocation(x, Right)^ComeClose(x, y)^InitialLocation(y, Left)^ComeClose(y, x)^InitialLocation(x, Right)^InitialLocation(y, Left)^ComeClose(y, x)^InitialLocation(x, Right)^InitialLocation(y
True FOL: TypeOf(x, MiniCooper)^InitialLocation(x, NearLeft)



In [15]:
# Save the entire model (base + LoRA) to a single file
fine_tuned_model = fine_tuned_model.merge_and_unload()
fine_tuned_model.save_pretrained("./gpt-2-nl-to-fol-merged")
tokenizer.save_pretrained("./gpt-2-nl-to-fol-merged")
print("Merged model saved to: ./gpt-2-nl-to-fol-merged")

Configuration saved in ./gpt-2-nl-to-fol-merged/config.json
Configuration saved in ./gpt-2-nl-to-fol-merged/generation_config.json
Model weights saved in ./gpt-2-nl-to-fol-merged/model.safetensors
tokenizer config file saved in ./gpt-2-nl-to-fol-merged/tokenizer_config.json
Special tokens file saved in ./gpt-2-nl-to-fol-merged/special_tokens_map.json


Merged model saved to: ./gpt-2-nl-to-fol-merged


In [16]:
model_name = "./gpt-2-nl-to-fol-merged"
model_id = "./gpt-2-nl-to-fol-merged"
device_map = 'auto'


tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
)

tokenizer.pad_token = tokenizer.eos_token

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./gpt-2-nl-to-fol-merged/config.json
Model config GPT2Config {
  "_name_or_path": "./gpt-2-nl-to-fol-merged",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {

In [19]:
test_questions = random.sample(list(zip(dataset['validation']['Question'], dataset['validation']['FOL Query'])), 2)

print("Testing model after fine-tuning:")
after_results = test_model(fine_tuned_model, tokenizer, test_questions)
for result in after_results:
    print(f"Question: {result['question']}")
    print(f"Input: {result['input']}")
    print(f"Generated FOL: {result['generated_fol']}")
    print(f"True FOL: {result['true_fol']}\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Testing model after fine-tuning:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Is there a white car at the center of the scene?
Input: Translate the following natural language question to First Order Logic (FOL). Please respond with only the FOL statement. Don't include additional text. 
Question: Is there a white car at the center of the scene?
FOL Query:
Generated FOL: TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearFront)^TypeOf(y, Car)^InitialLocation(y, NearFront)^InitialLocation(y, NearLeft)^ComeClose(x, y)^InitialLocation(y, NearRight)^ComeClose(y, y)^InitialLocation(x, NearRight)^InitialLocation(y, NearLeft)^InitialLocation(y, Near
True FOL: TypeOf(x, Car)^ColorOf(x, White)^InitialLocation(x, NearFront)

Question: Is the gray SUV accelerating?
Input: Translate the following natural language question to First Order Logic (FOL). Please respond with only the FOL statement. Don't include additional text. 
Question: Is the gray SUV accelerating?
FOL Query:
Generated FOL: TypeOf(x, SUV)^ColorOf(x, Gray)^Vehicles(x)^Accelerate(x)^InitialLocatio