In [1]:
import os
import time
from text_generator import MistralGenerator
from utils import load_data, save_batch_output, log_time

In [2]:
from tqdm.auto import tqdm

In [3]:
# Run configuration: batch size plus input/output locations
BATCH_SIZE = 10  # number of rows taken per batch slice
INPUT_CSV = "../Data/VisDoM-main/spiqa/spiqa.csv"  # where the questions and captions are
OUTPUT_DIR = "../Eval_outputs/SPIQA/text_only"

# Create the output folder up front; exist_ok makes re-running this cell harmless
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Endpoint handed to MistralGenerator as base_url.
# The default is a machine-specific LAN address, so allow overriding it through
# the LM_STUDIO_BASE_URL environment variable instead of editing the notebook.
BASE_URL = os.environ.get("LM_STUDIO_BASE_URL", "http://192.168.1.56:1234/v1")

In [5]:
# Prompt text handed to MistralGenerator as `prompt_template`.
# It contains two named placeholders, {context} and {question}; presumably the
# generator fills them (e.g. via str.format) — confirm in text_generator.
# The literal is kept exactly as-is because every character, including the
# trailing blank lines after "Answer:", is part of what the model receives.
PROMPT_TEMPLATE = """You are a knowledgeable assistant. Use the following retrieved context to answer the user's question as accurately as possible.

Context:
{context}

Question:
{question}

Instructions:
- Use only the information provided in the context to answer.
- Do not make up information or assumptions.
- If the answer is not in the context, respond with "The answer is not available in the provided context."
- Provide a concise and clear answer.

Answer:

"""


In [6]:
# Key handed to MistralGenerator as api_key.
# Read it from the LM_STUDIO_API_KEY environment variable so a real credential
# never has to be committed with the notebook; fall back to the previous
# hard-coded value when the variable is not set.
API_KEY = os.environ.get("LM_STUDIO_API_KEY", "lm-studio")

In [7]:
# Read the evaluation table from INPUT_CSV (later cells slice it with .iloc and
# walk it with .iterrows) and build the generator from the settings defined above.
data = load_data(INPUT_CSV)
generator = MistralGenerator(
    prompt_template=PROMPT_TEMPLATE,
    api_key=API_KEY,
    base_url=BASE_URL,
)

In [21]:
# One ad-hoc request to check that the generator responds before starting the batch run
generator.generate_answer(
    "who are you",
    "you are a star",
)

"The answer is not available in the provided context as it does not specify who 'you' refers to."

In [None]:
# # Batch processing
# for i in tqdm(range(0, len(data), BATCH_SIZE)):
#     batch = data.iloc[i:i+BATCH_SIZE]
#     results = []
#     start = time.time()

#     for idx, row in batch.iterrows():
#         question = row["old_question"]
#         response = generator.generate_answer(question)
#         results.append({"index": idx, "question": question, "response": response})

#     duration = time.time() - start
#     save_batch_output(results, OUTPUT_DIR, i)
#     log_time(OUTPUT_DIR, i, duration)

In [19]:
# Detect existing batches.
# Saved files are named like "batch_<start>_<timestamp>.csv"; collect every
# <start> index already present in OUTPUT_DIR so those batches can be skipped.
# Files that match the prefix/suffix but carry a non-numeric second token
# (e.g. "batch_summary.csv") are ignored instead of crashing int().
existing_batches = {
    int(name_parts[1])  # start index encoded in the file name
    for name_parts in (
        f.split("_")
        for f in os.listdir(OUTPUT_DIR)
        if f.startswith("batch_") and f.endswith(".csv")
    )
    if name_parts[1].isdecimal()
}
existing_batches

{0, 10, 20, 30, 40, 50, 60, 70}

In [None]:
# # Detect existing batches
# existing_batches = {
#     int(f.split("_")[1])  # extract start index from file name
#     for f in os.listdir(OUTPUT_DIR)
#     if f.startswith("batch_") and f.endswith(".csv")
# }

# # Batch processing
# for i in tqdm(range(0, len(data), BATCH_SIZE)):
#     if i in existing_batches:
#         print(f"Skipping batch {i} (already processed)")
#         continue

#     batch = data.iloc[i:i+BATCH_SIZE]
#     results = []
#     start = time.time()

#     for idx, row in tqdm(batch.iterrows()):
#         question = row["question"]
#         response = generator.generate_answer(question)
#         results.append({"index": idx, "question": question, "response": response})

#     duration = time.time() - start
#     save_batch_output(results, OUTPUT_DIR, i)
#     log_time(OUTPUT_DIR, i, duration,prefix="text_only")

  0%|          | 0/59 [00:00<?, ?it/s]

Skipping batch 0 (already processed)
Skipping batch 10 (already processed)
Skipping batch 20 (already processed)
Skipping batch 30 (already processed)
Skipping batch 40 (already processed)
Skipping batch 50 (already processed)
Skipping batch 60 (already processed)
Skipping batch 70 (already processed)


0it [00:00, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/text_only/batch_80_20250420_201653.csv


0it [00:00, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/text_only/batch_90_20250420_202241.csv


0it [00:00, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/text_only/batch_100_20250420_202723.csv


0it [00:00, ?it/s]

APITimeoutError: Request timed out.

In [None]:
# Resume-safe batch run: skip batches whose output file already exists, then
# generate, save and time each remaining batch.

# A single failed request (e.g. a timeout) should not throw away the whole run,
# so each request gets a few attempts before the error is re-raised.
MAX_ATTEMPTS = 3
RETRY_WAIT_SECONDS = 5

# Detect existing batches.
# Saved files are named like "batch_<start>_<timestamp>.csv"; files whose second
# token is not a plain number are ignored instead of crashing int().
existing_batches = {
    int(name_parts[1])  # start index encoded in the file name
    for name_parts in (
        f.split("_")
        for f in os.listdir(OUTPUT_DIR)
        if f.startswith("batch_") and f.endswith(".csv")
    )
    if name_parts[1].isdecimal()
}

# Batch processing
for i in tqdm(range(0, len(data), BATCH_SIZE)):
    if i in existing_batches:
        print(f"Skipping batch {i} (already processed)")
        continue

    batch = data.iloc[i:i+BATCH_SIZE]
    results = []
    start = time.time()

    # iterrows() is a generator with no length, so pass total explicitly;
    # otherwise the inner bar only ever shows "0it".
    for idx, row in tqdm(batch.iterrows(), total=len(batch)):
        question = row["question"]

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response = generator.generate_answer(question)
                break
            except Exception:
                # Out of attempts: surface the original error unchanged.
                if attempt == MAX_ATTEMPTS:
                    raise
                # Linear back-off; note the wait is included in `duration` below.
                time.sleep(RETRY_WAIT_SECONDS * attempt)

        results.append({"index": idx, "question": question, "response": response})

    duration = time.time() - start
    save_batch_output(results, OUTPUT_DIR, i)
    log_time(OUTPUT_DIR, i, duration,prefix="text_only")

  0%|          | 0/59 [00:00<?, ?it/s]

Skipping batch 0 (already processed)
Skipping batch 10 (already processed)
Skipping batch 20 (already processed)
Skipping batch 30 (already processed)
Skipping batch 40 (already processed)
Skipping batch 50 (already processed)
Skipping batch 60 (already processed)
Skipping batch 70 (already processed)
Skipping batch 80 (already processed)
Skipping batch 90 (already processed)
Skipping batch 100 (already processed)


0it [00:00, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/text_only/batch_110_20250420_210546.csv


0it [00:00, ?it/s]

File saved to:  ../Eval_outputs/SPIQA/text_only/batch_120_20250420_210948.csv


0it [00:00, ?it/s]

In [15]:
# Manual call to log_time with placeholder values ("cc", 19999.99) — looks like
# a one-off check of the helper rather than part of the run; confirm before keeping.
log_time(
    output_dir=OUTPUT_DIR,
    batch_index="cc",
    duration=19999.99,
    prefix="text",
)