In [None]:
# Install the Weights & Biases client. %pip (rather than !pip) installs into the
# environment of the running kernel instead of whichever pip is first on PATH.
%pip install wandb

In [None]:
# Log in to Weights & Biases through its command-line client (shell escape).
# NOTE(review): this presumably asks for an API key when none is stored yet —
# confirm it does not block a non-interactive "Run All".
!wandb login

In [None]:
import wandb

# Start the W&B run that the results table at the end of the notebook is logged to.
# The call is kept as the cell's last expression so its return value is still displayed.
wandb.init(
    name="flickr-Pixtral12B - 2",
    project="Prompting-Experiments",
    group="prompting-experiments",
    notes="FT",
    tags=["pixtral", "prompting", "flickr"],
    config=dict(model="Pixtral12B", prompting="FT"),
)

In [None]:
import itertools
from inf_util2 import evaluate_batch
import pandas as pd
from datasets import load_dataset
import torch
import time

# Empty PyTorch's CUDA cache so the baseline below is taken from a clean state.
torch.cuda.empty_cache()
print("✅ Cleared GPU cache")

# Baseline VRAM snapshot; torch reports bytes, so divide by 1024**3 for the GB figures.
bytes_per_gb = 1024**3
initial_reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
initial_allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
print("📊 Initial VRAM Usage:")
print(f"  Reserved: {initial_reserved_gb:.2f} GB")
print(f"  Allocated: {initial_allocated_gb:.2f} GB")

# Load the Flickr30k test split.
flickr30k_test = load_dataset("nlphuji/flickr30k", split="test")

# Fixed evaluation subset: the 11 consecutive rows 2499..2509 of the test split.
test_indices = list(range(2499, 2510))
test_dataset = flickr30k_test.select(test_indices)

# Read only the "filename" column. Indexing whole rows (test_dataset[i]) builds every
# column of the row — presumably including the decoded image — just to get one string.
filenames = list(test_dataset["filename"])
# (position in test_dataset, filename) pairs, kept under the original name.
image_details = list(enumerate(filenames))

# Format each line once, pairing the original Flickr30k index with its filename,
# then reuse the same lines for the console and for the log file.
selected_lines = [
    f"Index: {original_idx}, Filename: {filename}"
    for original_idx, filename in zip(test_indices, filenames)
]
for line in selected_lines:
    print(line)
# Explicit encoding so the file is written the same way on every platform.
with open("selected_images.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in selected_lines)
print("✅ Saved selected images to selected_images.txt")

# Prompt configuration: one instruction, reused for every selected image.
prompt_list = ["Describe this image"]
# One entry per image; every entry refers to the same prompt_list object.
prompts_total_list = [prompt_list for _ in test_indices]
# Flickr30k has multiple reference captions
multiple_refs = True

# Run inference one sample at a time so each result carries its own sample_index.
print("\n🔄 Running Inference...")
torch.cuda.reset_peak_memory_stats()  # so the peak reported after the loop covers this run only
start_time = time.time()
all_results = []

num_samples = len(test_indices)
# Columns the reporting code reads later; they are expected to be filled in by
# inf_util2.evaluate_batch, so their presence is verified as soon as a result arrives.
required_columns = ("vram_reserved_gb", "vram_allocated_gb")

# `index` is the 0-based position inside test_dataset, not the original Flickr30k index.
for index, prompts in enumerate(prompts_total_list):
    print(f"\n📦 Evaluating sample {index + 1}/{num_samples} at index {index}...")
    # evaluate_batch takes batches, so wrap the single sample and unwrap the first result.
    results = evaluate_batch([prompts], test_dataset, [index], multiple_refs)[0]
    # Debug: Print results type and content
    print(f"Results type: {type(results)}")
    print(f"Results content: {results}")
    # Fail fast with a clear message instead of the former no-op self-assignments.
    for column in required_columns:
        if column not in results:
            raise KeyError(f"evaluate_batch result is missing '{column}'")
    results['sample_index'] = index  # relative index; map through test_indices for the original
    all_results.append(results)

# Stop the wall clock before any post-processing so it measures the evaluation loop only.
end_time = time.time()
wall_clock_time = end_time - start_time

# Combine results into a single DataFrame (kept in a one-element list, as the
# reporting code iterates over `dfs`).
print("\n🔄 Combining results...")
dfs = [pd.concat(all_results, ignore_index=True)]

# Per-sample generation times come from the "inference_time" column of the results.
total_inference_time = dfs[0]["inference_time"].sum()
avg_inference_time = dfs[0]["inference_time"].mean()

# Log inference time
print("\n⏱️ Inference Time (Generation Only):")
print(f"  Total: {total_inference_time:.2f} seconds")
print(f"  Average per image: {avg_inference_time:.2f} seconds")
# Previously start_time/end_time were captured but never reported.
print(f"  Wall-clock for the whole evaluation loop: {wall_clock_time:.2f} seconds")

# Log final VRAM usage
print("\n📊 Final VRAM Usage:")
print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
print(f"  Peak Allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")

# Print the run summary: column averages first, then one line per evaluated sample.
print(f"\nResults for Run (Prompt: 'Describe this image'):")
averaged_columns = [
    "CIDEr",
    "METEOR",
    "semantic_similarity",
    "SPICE",
    "inference_time",
    "vram_reserved_gb",
    "vram_allocated_gb",
]
for run_df in dfs:
    # A caption stored as a list is reduced to its first element; anything else is kept.
    # This writes back into the frame, so later consumers of dfs[0] see plain values.
    run_df['generated'] = run_df['generated'].apply(
        lambda caption: caption[0] if isinstance(caption, list) else caption
    )

    # Compute every average before printing, so a missing column fails before any output.
    means = {column: run_df[column].mean() for column in averaged_columns}
    print(f"  Average CIDEr: {means['CIDEr']:.4f}")
    print(f"  Average METEOR: {means['METEOR']:.4f}")
    print(f"  Average Semantic Similarity: {means['semantic_similarity']:.4f}")
    print(f"  Average SPICE: {means['SPICE']:.4f}")
    print(f"  Average Inference Time: {means['inference_time']:.2f} seconds")
    print(f"  Average VRAM Reserved: {means['vram_reserved_gb']:.2f} GB")
    print(f"  Average VRAM Allocated: {means['vram_allocated_gb']:.2f} GB")

    # Per-sample detail for output verification.
    print("  Sample Captions and Metrics:")
    for _, sample in run_df.iterrows():
        # sample_index is the position in test_dataset; translate it to the original index.
        position = int(sample['sample_index'])
        original_idx = test_indices[position]
        details = [
            f"Generated: '{sample['generated']}'",
            f"Reference: '{sample['reference_captions']}'",
            f"Inference Time: {sample['inference_time']:.2f} s",
            f"VRAM Reserved: {sample['vram_reserved_gb']:.2f} GB",
            f"VRAM Allocated: {sample['vram_allocated_gb']:.2f} GB",
        ]
        print(f"    Index {original_idx}: " + ", ".join(details))

# Persist the combined per-sample results (without the DataFrame index) as CSV.
results_csv_path = "evaluation_results_run1.csv"
dfs[0].to_csv(results_csv_path, index=False)
print(f"✅ Saved evaluation results to {results_csv_path}")


# Build a W&B table with one row per evaluated sample.
table_columns = ["sample_index", "reference", "generated", "semantic_similarity", "spice", "cider"]
table = wandb.Table(columns=table_columns)

for _, sample in dfs[0].iterrows():
    table.add_data(
        str(int(sample['sample_index'])),  # position within the selected subset, as text
        sample['reference_captions'],
        sample['generated'],
        # Metric columns fall back to 0.0 when the column is absent from the results.
        sample.get('semantic_similarity', 0.0),
        sample.get('SPICE', 0.0),
        sample.get('CIDEr', 0.0),
    )

# Upload the table under the "Results" key, then close the run.
wandb.log({"Results": table})
wandb.finish()