### Qwen2.5-7B-Instruct LLM run on Aristotle pipeline
### Intended to be run in a Runpod / remote Jupyter environment

In [None]:
# Refresh the apt package index before installing system packages.
!apt update

In [None]:
!apt install git-lfs

In [None]:
!pip install transformers safetensors sentencepiece huggingface-hub accelerate bitsandbytes tqdm openai backoff retrying protobuf matplotlib

In [None]:
# Register the Git LFS filters/hooks in the git configuration so the clone below
# downloads LFS-tracked files rather than leaving pointer files.
!git lfs install

In [None]:
!git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct LLM_MODELS/Qwen2.5-7B-Instruct

In [None]:
import os

# Disable tokenizer parallelism and pin OpenMP/MKL to one thread each to reduce
# thread contention.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

# Model location -- change this per platform and model.
# The Hugging Face Hub id is the value in effect. To load from the local
# git-lfs clone instead, replace it with:
#   snapshot_path = "/workspace/LLM_MODELS/Qwen2.5-7B-Instruct"
snapshot_path = "Qwen/Qwen2.5-7B-Instruct"

# Exported as environment variables so the `!python ...` subprocesses inherit them.
os.environ["LOCAL_MODEL_PATH"] = snapshot_path
os.environ["LLM_MODEL"] = snapshot_path

# "1" enables 4-bit quantized loading (requires a working bitsandbytes install);
# "0" disables quantization.
# NOTE(review): presumably read by the pipeline's backend code -- confirm.
os.environ["LLM_LOAD_IN_4BIT"] = "1"

print("LOCAL_MODEL_PATH =", os.environ["LOCAL_MODEL_PATH"])
print("LLM_MODEL =", os.environ["LLM_MODEL"])

# Python-level mirror of the env var, so `$LLM_MODEL` inside `!` commands also
# resolves from the kernel namespace.
LLM_MODEL = snapshot_path

In [None]:
# Smoke test for the max_time stopping criterion: generate with the same prompt
# and token cap under two different time budgets and print what comes back.
from llm_backends import HFBackend

hb = HFBackend(local_model_path=snapshot_path, quantize_4bit=True)
prompt = "Write a long list of words and sentences: "

for max_time in (3.0, 10.0):
    # `:g` renders 3.0 -> "3" and 10.0 -> "10", matching the original messages.
    print(f"Calling short test ({max_time:g}s max_time)...")
    res = hb.generate(prompt, max_new_tokens=1024, max_time=max_time)
    print("Result length:", len(res))
    print(res[:1000])

In [None]:
# Per-stage time limit in seconds. Adjust it for each stage (translate,
# decompose, search_resolve), since each may need a different limit.
LLM_WORKER_MAX_TIME = 300
os.environ["LLM_WORKER_MAX_TIME"] = str(LLM_WORKER_MAX_TIME)

# Command-line arguments shared by all stages.
# MAX_NEW_TOKENS only caps the number of generated tokens, whereas
# max_position_embeddings (in the model's config.json) is the context length.
# !!! input_length + MAX_NEW_TOKENS should be < context_length, otherwise the LLM breaks.
# Context lengths noted by the author:
#   - Llama 3: 8k (max_position_embeddings)
#   - SEALIONv3-LLama3-8B-IT: uses RoPE; max_position_embeddings follows the RoPE limit, 131k
#   - Qwen2.5-7B-IT: 32k
#   - SahabatAIv1-LLama3-8B-IT: 8k
# Response sizes per stage, counted from notebook output cells with an online
# token counter: translation ~400 tokens, decomposition ~500 tokens,
# search_resolve ~700 tokens.
MAX_NEW_TOKENS = 1500
BATCH_NUM = 1
os.environ["MAX_NEW_TOKENS"] = str(MAX_NEW_TOKENS)
os.environ["BATCH_NUM"] = str(BATCH_NUM)

In [None]:
# Translation with original prompts
#!python translate_to_fol.py --data_path manual_data_translated --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file translation --split dev --save_path results_translated_translation/v3/prompts_original --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Translation with modified prompts
#!python translate_to_fol.py --data_path manual_data_translated --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file translation_modified --split dev --save_path results_translated_translation/v3/prompts_modified --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Translation with refined prompts
#!python translate_to_fol.py --data_path manual_data_translated --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file translation_refine --split dev --save_path results_translated_translation/v3/prompts_refine --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Decomposition with refined prompts
#!python decompose_to_cnf.py --data_path results_translated_translation/v3/prompts_refine --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file and_or_decomposer_refine --save_path results_translated_decomposition/v2/prompts_refine --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
#!python negate.py --dataset_name ProntoQA --save_path results_translated_decomposition/v2/prompts_refine --model_name $LLM_MODEL

In [None]:
# Logic Resolver with refined prompts (run with --negation False)
#!python search_resolve.py --data_path results_translated_decomposition/v2/prompts_refine --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file logic_resolver_refine --save_path results_translated_search_resolve/prompts_refine --model_name $LLM_MODEL --batch_num $BATCH_NUM --negation False --search_round 10 --max_new_tokens $MAX_NEW_TOKENS

In [None]:
# Logic Resolver with refined prompts (run with --negation True)
#!python search_resolve.py --data_path results_translated_decomposition/v2/prompts_refine --dataset_name ProntoQA --sample_pct 100 --prompts_folder manual_prompts_translated --prompts_file logic_resolver_refine --save_path results_translated_search_resolve/prompts_refine --model_name $LLM_MODEL --batch_num $BATCH_NUM --negation True --search_round 10 --max_new_tokens $MAX_NEW_TOKENS

In [None]:
#!python evaluate.py --dataset_name ProntoQA --save_path results_translated_search_resolve/v3/prompts_refine --model_name $LLM_MODEL

In [17]:
# Naive prompting only needs a True/False answer based on the context, so the
# generation cap is lowered for these runs.
MAX_NEW_TOKENS = 600
BATCH_NUM = 1
os.environ["MAX_NEW_TOKENS"] = str(MAX_NEW_TOKENS)
os.environ["BATCH_NUM"] = str(BATCH_NUM)

In [None]:
# Solving with naive prompting with explanations reasoning after answer
#!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 0 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_after_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_after_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

In [None]:
#!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_after_answer --model_name $LLM_MODEL --evaluation_method naive_prompting

In [19]:
# Naive prompting where the explanation/reasoning is given before the answer.
# NOTE(review): in the output captured below, --sample_pct 0 together with
# --start_index 382 loaded a single example (ProntoQA_382). This looks like a
# one-example re-run; confirm the flags before using it for a full evaluation.
!python naive_prompting.py --data_path results_bahasa_translation --dataset_name ProntoQA --sample_pct 0 --start_index 382 --prompts_folder manual_prompts_translated --prompts_file naive_prompting_explanations_before_answer --split dev --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --batch_num $BATCH_NUM --max_new_tokens $MAX_NEW_TOKENS

Loading translation file:  manual_prompts_translated\ProntoQA\naive_prompting_explanations_before_answer.txt
SAMPLE PCT: 0
START INDEX: 382 type: <class 'int'>
Loaded 1 examples from dev split.
Number of batch:  1
Running example id:  ProntoQA_382

Naive prompting: Deskripsi Tugas: Anda diberikan sebuah konteks yang berisi sekumpulan premis dan pertanyaan mengenai konteks tersebut. Tugas Anda adalah menjawab pertanyaan tersebut dengan memilih salah satu nilai kebenaran True atau False berdasarkan konteks yang diberikan dengan memberikan penjelasan langkah-langkah yang Anda gunakan untuk mencapai nilai kebenaran/jawaban tersebut.

------

Contoh:

Konteks:
Impuses adalah besar. Alex adalah impus.

Pertanyaan:
Alex besar

###
**Penjelasan**:
Alex adalah impus.
Impuses adalah besar.
Alex adalah besar.

**Jawaban**: True 
###

------

Contoh:

Konteks:
Setiap numpus adalah tembus pandang. Fae adalah numpus.

Pertanyaan:
Fae tidak tembus pandang

###
**Penjelasan**:
Fae adalah numpus.
Setia


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s][A

Loading checkpoint shards:  25%|██▌       | 1/4 [00:01<00:05,  1.81s/it][A

Loading checkpoint shards:  50%|█████     | 2/4 [00:03<00:03,  1.82s/it][A

Loading checkpoint shards:  75%|███████▌  | 3/4 [00:05<00:01,  1.84s/it][A

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.72s/it][A
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

100%|██████████| 1/1 [00:39<00:00, 39.75s/it]
100%|██████████| 1/1 [00:39<00:00, 39.75s/it]


In [None]:
# Evaluate the naive-prompting results (explanations before answer) stored under
# the same save_path the naive_prompting.py run writes to.
!python evaluate.py --dataset_name ProntoQA --save_path results_translated_naive_prompting/prompt_explanations_before_answer --model_name $LLM_MODEL --evaluation_method naive_prompting