In [2]:
import os
import sys
import json
from pathlib import Path

# Set up paths
project_root = Path(os.getcwd()).parent
model_path = project_root / "pipelines" / "lm_model_classes"
data_path = project_root / "preprocessing" / "500_pairs.json"
output_path = Path("results/lexical_results.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Add model folder to sys.path
sys.path.append(str(model_path))

# Now import model classes
from GPT4oAPI import GPT4oAPI
from DeepSeekChatAPI import DeepSeekChatAPI
import json
from pathlib import Path

with open("results/lexical_results.json", "r") as f:
    lexical_data = json.load(f)


In [3]:
import time
import json
from tqdm import tqdm
from joblib import Parallel, delayed
from GPT4oAPI import GPT4oAPI
from DeepSeekChatAPI import DeepSeekChatAPI

# Instantiate models once
gpt4o = GPT4oAPI()
deepseek = DeepSeekChatAPI()

# Prepare job for a single entry
def process_one_entry(entry):
    gpt_lex = entry["gpt4o"]
    deepseek_lex = entry["deepseek"]

    def run_variant(name, model, input_text):
        return (name, model.syntactic_simplification(input_text))

    jobs = [
        ("gpt4o_gpt4o", gpt4o, gpt_lex),
        ("gpt4o_deepseek", deepseek, gpt_lex),
        ("deepseek_gpt4o", gpt4o, deepseek_lex),
        ("deepseek_deepseek", deepseek, deepseek_lex),
    ]

    results = Parallel(n_jobs=4, backend="threading")(
        delayed(run_variant)(name, model, text) for name, model, text in jobs
    )

    return {**entry, **{k: v for k, v in results}}

# Run over all entries with progress bar
start = time.time()
syntactic_results = []

start_idx = 50
end_idx = 100
syntactic_results = []

for entry in tqdm(lexical_data[start_idx:end_idx], desc="Processing entries 50–99"):
    result = process_one_entry(entry)
    syntactic_results.append(result)

print(f"\n✅ All 100 entries completed in {(time.time() - start) / 60:.2f} minutes")

# @TODO: restart process from index 50, save to different file


Processing entries 50–99:   0%|          | 0/50 [00:00<?, ?it/s]

Processing entries 50–99: 100%|██████████| 50/50 [22:22<00:00, 26.84s/it]


✅ All 100 entries completed in 22.37 minutes





In [4]:
with open("results/syntactic_results_batch_2.json", "w") as f:
    json.dump(syntactic_results, f, indent=2)

print("✅ Syntactic-layered results saved to results/syntactic_layer_results.json")


✅ Syntactic-layered results saved to results/syntactic_layer_results.json
