3.1 Automatic Story Generation

In [2]:
import sys
!{sys.executable} -m pip install ollama

Collecting ollama
  Using cached ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting httpx>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic>=2.9 (from ollama)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting anyio (from httpx>=0.27->ollama)
  Using cached anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting certifi (from httpx>=0.27->ollama)
  Downloading certifi-2025.7.14-py3-none-any.whl.metadata (2.4 kB)
Collecting httpcore==1.* (from httpx>=0.27->ollama)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting idna (from httpx>=0.27->ollama)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx>=0.27->ollama)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.9->ollama)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-

In [None]:
import ollama
import random
import os
import pandas as pd
import numpy as np
from llama_cpp import Llama
import gc

# Dictionary for GGUF model paths
model_paths = {
    "llama3.2:3b": "G:/amozeshi/Arshad/NLP/HW4/models/llama-3.2-3b-instruct-q8_0.gguf",
    "qwen2.5vl:3b": "G:/amozeshi/Arshad/NLP/HW4/models/qwen2.5-3b-instruct-q4_k_m.gguf"
}

def calculate_perplexity_with_llama_cpp(text: str, model: Llama) -> float | None:
    """
    Calculates perplexity using the logprobs approach, filtering None values.
    """
    try:
        outputs = model.create_completion(
            prompt=text,
            max_tokens=0,
            logprobs=True,
            echo=True
        )
        
        # Safely get the list of log probabilities
        logprobs_data = outputs.get('choices', [{}])[0].get('logprobs')
        if not logprobs_data:
            return None
        
        token_logprobs = logprobs_data.get('token_logprobs')
        if not token_logprobs:
            return None

        # --- FIX: Filter out None values to prevent TypeError ---
        filtered_logprobs = [lp for lp in token_logprobs if lp is not None]

        if not filtered_logprobs:
            return None

        avg_log_likelihood = np.mean(filtered_logprobs)
        perplexity = np.exp(-avg_log_likelihood)
        return perplexity

    except Exception as e:
        print(f" Could not calculate perplexity with llama-cpp: {e}")
        return None

# --- 1. Configuration ---
random.seed(42)
models_to_test = ['llama3.2:3b', 'qwen2.5vl:3b']
genres = [
    'science fiction', 'fantasy', 'horror', 'comedy', 'mystery',
    'romance', 'thriller', 'historical fiction', 'western', 'cyberpunk'
]
output_dir = 'generated_stories'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

evaluation_results = []

# Main Generation Loop 
for model_name in models_to_test:
    print(f"\n{'='*60}")
    print(f" Starting tests for model: {model_name} ")
    print(f"{'='*60}\n")

    # Load the specific GGUF model for perplexity calculation
    llm_for_perplexity = None
    model_path = model_paths.get(model_name)
    if not model_path or not os.path.exists(model_path):
        print(f" WARNING: GGUF model path for '{model_name}' not found. Perplexity will NOT be calculated.")
    else:
        print(f"  Loading perplexity model from: {model_path}...")
        llm_for_perplexity = Llama(
            model_path=model_path,
            n_ctx=2048,
            verbose=True,
            logits_all=True
        )
        print("  Perplexity model loaded successfully.")

    for i, genre in enumerate(genres):
        prompt = f"""Write a complete short story embodying the main elements of the {genre} genre. The story should feature:
        A compelling and attention-grabbing opening that sets the tone.
        Well-developed characters whose motivations and personalities suit the genre.
        A plot with a clear beginning, middle, and end, enriched with twists suitable for the genre.
        Attention to genre-specific themes or settings, utilizing common tropes creatively.
        A satisfying conclusion that ties together critical themes and character arcs.
        Incorporate imaginative descriptions and dialogue where appropriate, enhancing engagement and depth. The story should be approximately 100 words to allow for detailed scenario building and resolution."""
        options = {'temperature': round(random.uniform(0.7, 1.2), 2), 'seed': 42}
        
        print(f"\n Generating story {i+1}/{len(genres)} in genre: '{genre}'...")
        try:
            response = ollama.generate(model=model_name, prompt=prompt, options=options, stream=False)
            generated_story = response['response'].strip()

            if not generated_story:
                print("WARNING: Generated story is empty. Skipping.")
                continue

            perplexity_score = None
            if llm_for_perplexity:
                text_for_perplexity = generated_story
                if "qwen" in model_name:
                    text_for_perplexity = "<|endoftext|>" + generated_story
                    print(" Prepended <|endoftext|> token for Qwen model.")

                perplexity_score = calculate_perplexity_with_llama_cpp(text_for_perplexity, llm_for_perplexity)
                
                if perplexity_score is not None:
                    print(f" Perplexity: {perplexity_score:.4f}")

            # Store results for final report
            evaluation_results.append({
                "model": model_name,
                "genre": genre,
                "status": "success",
                "inference_time_ns": response.get('total_duration', 0),
                "perplexity": perplexity_score
            })
            
            # Save the generated story to a file
            safe_model_name = model_name.replace(':', '_').replace('/', '_')
            file_name = os.path.join(output_dir, f"story_{i+1}_{safe_model_name}.txt")
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(f"MODEL: {model_name}\n")
                f.write(f"GENRE: {genre}\n")
                f.write(f"TEMPERATURE: {options['temperature']}\n")
                perplexity_str = f"{perplexity_score:.4f}" if perplexity_score is not None else "N/A"
                f.write(f"PERPLEXITY: {perplexity_str}\n")
                f.write("-" * 20 + "\n\n")
                f.write(generated_story)
            print(f"     Story saved to {file_name}")


        except Exception as e:
            evaluation_results.append({
                "model": model_name, "genre": genre, "status": "failure",
                "inference_time_ns": 0, "perplexity": None
            })
            print(f"ERROR generating story for {model_name}: {e}")

    # Free up memory
    if llm_for_perplexity:
        del llm_for_perplexity
        gc.collect()
        print(f"\n  Unloaded perplexity model for '{model_name}' to free up memory.")

# Calculate and Print Final Metrics 
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    
    print("\n" + "="*60)
    print("FINAL EVALUATION METRICS")
    print("="*60)

    for model_name in models_to_test:
        model_df = results_df[results_df['model'] == model_name].copy()
        
        if not model_df.empty:
            model_df['perplexity'] = pd.to_numeric(model_df['perplexity'], errors='coerce')
            successes = model_df[model_df['status'] == 'success'].shape[0]
            total_items = len(model_df)
            failures = total_items - successes
            
            avg_time_s = (model_df['inference_time_ns'].sum() / successes) / 1e9 if successes > 0 else 0
            reject_rate = (failures / total_items) * 100 if total_items > 0 else 0
            avg_perplexity = model_df['perplexity'].mean()

            print(f"\nMetrics for model: {model_name}")
            print("-" * 35)
            print(f"  Data Items Count: {total_items}")
            print(f"  Average Inference Time: {avg_time_s:.4f} seconds/item")
            print(f"  Reject Rate (RR): {reject_rate:.2f}%")
            print(f"  Average Perplexity: {avg_perplexity:.4f}")


--- Starting tests for model: llama3.2:3b ---

  Loading perplexity model from: G:/amozeshi/Arshad/NLP/HW4/models/llama-3.2-3b-instruct-q8_0.gguf...


llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from G:/amozeshi/Arshad/NLP/HW4/models/llama-3.2-3b-instruct-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytor

  Perplexity model loaded successfully.

  -> Generating story 1/10 in genre: 'science fiction'...


llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  117356.68 ms /   177 tokens (  663.03 ms per token,     1.51 tokens per second)
llama_perf_context_print:        eval time =  354653.80 ms /   463 runs   (  765.99 ms per token,     1.31 tokens per second)
llama_perf_context_print:       total time =  476339.94 ms /   640 tokens


  --> Perplexity: 2.2046
     Story saved to generated_stories\story_1_llama3.2_3b.txt

  -> Generating story 2/10 in genre: 'fantasy'...


Llama.generate: 1 prefix-match hit, remaining 251 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  129734.67 ms /   251 tokens (  516.87 ms per token,     1.93 tokens per second)
llama_perf_context_print:        eval time =  220049.23 ms /    43 runs   ( 5117.42 ms per token,     0.20 tokens per second)
llama_perf_context_print:       total time =  350943.98 ms /   294 tokens


  --> Perplexity: 3.2263
     Story saved to generated_stories\story_2_llama3.2_3b.txt

  -> Generating story 3/10 in genre: 'horror'...


Llama.generate: 2 prefix-match hit, remaining 227 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  149402.69 ms /   227 tokens (  658.16 ms per token,     1.52 tokens per second)
llama_perf_context_print:        eval time =  200895.64 ms /    80 runs   ( 2511.20 ms per token,     0.40 tokens per second)
llama_perf_context_print:       total time =  352262.21 ms /   307 tokens


  --> Perplexity: 2.5918
     Story saved to generated_stories\story_3_llama3.2_3b.txt

  -> Generating story 4/10 in genre: 'comedy'...


Llama.generate: 2 prefix-match hit, remaining 261 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  122611.98 ms /   261 tokens (  469.78 ms per token,     2.13 tokens per second)
llama_perf_context_print:        eval time =  329263.71 ms /   113 runs   ( 2913.84 ms per token,     0.34 tokens per second)
llama_perf_context_print:       total time =  455490.29 ms /   374 tokens


  --> Perplexity: 3.2143
     Story saved to generated_stories\story_4_llama3.2_3b.txt

  -> Generating story 5/10 in genre: 'mystery'...


Llama.generate: 1 prefix-match hit, remaining 259 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  125471.32 ms /   259 tokens (  484.45 ms per token,     2.06 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =  126522.49 ms /   260 tokens


  --> Perplexity: 4.0969
     Story saved to generated_stories\story_5_llama3.2_3b.txt

  -> Generating story 6/10 in genre: 'romance'...


Llama.generate: 1 prefix-match hit, remaining 141 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  126007.88 ms /   141 tokens (  893.67 ms per token,     1.12 tokens per second)
llama_perf_context_print:        eval time =  301024.78 ms /   163 runs   ( 1846.78 ms per token,     0.54 tokens per second)
llama_perf_context_print:       total time =  429565.65 ms /   304 tokens


  --> Perplexity: 3.6244
     Story saved to generated_stories\story_6_llama3.2_3b.txt

  -> Generating story 7/10 in genre: 'thriller'...


Llama.generate: 1 prefix-match hit, remaining 223 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  129428.17 ms /   223 tokens (  580.40 ms per token,     1.72 tokens per second)
llama_perf_context_print:        eval time =  335910.20 ms /   346 runs   (  970.84 ms per token,     1.03 tokens per second)
llama_perf_context_print:       total time =  470638.50 ms /   569 tokens


  --> Perplexity: 2.9874
     Story saved to generated_stories\story_7_llama3.2_3b.txt

  -> Generating story 8/10 in genre: 'historical fiction'...


Llama.generate: 1 prefix-match hit, remaining 266 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  160657.75 ms /   266 tokens (  603.98 ms per token,     1.66 tokens per second)
llama_perf_context_print:        eval time =  218009.11 ms /   138 runs   ( 1579.78 ms per token,     0.63 tokens per second)
llama_perf_context_print:       total time =  381587.77 ms /   404 tokens


  --> Perplexity: 2.5139
     Story saved to generated_stories\story_8_llama3.2_3b.txt

  -> Generating story 9/10 in genre: 'western'...


Llama.generate: 1 prefix-match hit, remaining 244 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  142745.20 ms /   244 tokens (  585.02 ms per token,     1.71 tokens per second)
llama_perf_context_print:        eval time =  297836.94 ms /   364 runs   (  818.23 ms per token,     1.22 tokens per second)
llama_perf_context_print:       total time =  445762.91 ms /   608 tokens


  --> Perplexity: 2.2983
     Story saved to generated_stories\story_9_llama3.2_3b.txt

  -> Generating story 10/10 in genre: 'cyberpunk'...


Llama.generate: 1 prefix-match hit, remaining 391 prompt tokens to eval
llama_perf_context_print:        load time =  117362.90 ms
llama_perf_context_print: prompt eval time =  153163.71 ms /   391 tokens (  391.72 ms per token,     2.55 tokens per second)
llama_perf_context_print:        eval time =  186282.07 ms /    36 runs   ( 5174.50 ms per token,     0.19 tokens per second)
llama_perf_context_print:       total time =  344493.95 ms /   427 tokens


  --> Perplexity: 2.9597
     Story saved to generated_stories\story_10_llama3.2_3b.txt


llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from G:/amozeshi/Arshad/NLP/HW4/models/qwen2.5-3b-instruct-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = other
llama_model_loader: - kv 


  Unloaded perplexity model for 'llama3.2:3b' to free up memory.

--- Starting tests for model: qwen2.5vl:3b ---

  Loading perplexity model from: G:/amozeshi/Arshad/NLP/HW4/models/qwen2.5-3b-instruct-q4_k_m.gguf...


llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  34:               general.quantization_version u32   

  Perplexity model loaded successfully.

  -> Generating story 1/10 in genre: 'science fiction'...
  --> Prepended <|endoftext|> token for Qwen model.


llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   39111.30 ms /   288 tokens (  135.80 ms per token,     7.36 tokens per second)
llama_perf_context_print:        eval time =  331666.35 ms /  1759 runs   (  188.55 ms per token,     5.30 tokens per second)
llama_perf_context_print:       total time =  388529.44 ms /  2047 tokens


  --> Perplexity: 2.4822
     Story saved to generated_stories\story_1_qwen2.5vl_3b.txt

  -> Generating story 2/10 in genre: 'fantasy'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 3 prefix-match hit, remaining 145 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   37936.53 ms /   145 tokens (  261.63 ms per token,     3.82 tokens per second)
llama_perf_context_print:        eval time =  360528.85 ms /  1899 runs   (  189.85 ms per token,     5.27 tokens per second)
llama_perf_context_print:       total time =  458753.70 ms /  2044 tokens


  --> Perplexity: 1.8394
     Story saved to generated_stories\story_2_qwen2.5vl_3b.txt

  -> Generating story 3/10 in genre: 'horror'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 3 prefix-match hit, remaining 227 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   42479.36 ms /   227 tokens (  187.13 ms per token,     5.34 tokens per second)
llama_perf_context_print:        eval time =   51063.61 ms /   161 runs   (  317.17 ms per token,     3.15 tokens per second)
llama_perf_context_print:       total time =  104823.84 ms /   388 tokens


  --> Perplexity: 3.5822
     Story saved to generated_stories\story_3_qwen2.5vl_3b.txt

  -> Generating story 4/10 in genre: 'comedy'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 3 prefix-match hit, remaining 323 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   22684.00 ms /   323 tokens (   70.23 ms per token,    14.24 tokens per second)
llama_perf_context_print:        eval time =  116276.10 ms /   610 runs   (  190.62 ms per token,     5.25 tokens per second)
llama_perf_context_print:       total time =  145918.72 ms /   933 tokens


  --> Perplexity: 2.8012
     Story saved to generated_stories\story_4_qwen2.5vl_3b.txt

  -> Generating story 5/10 in genre: 'mystery'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 1 prefix-match hit, remaining 328 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   39094.56 ms /   328 tokens (  119.19 ms per token,     8.39 tokens per second)
llama_perf_context_print:        eval time =  237913.35 ms /  1136 runs   (  209.43 ms per token,     4.77 tokens per second)
llama_perf_context_print:       total time =  300320.20 ms /  1464 tokens


  --> Perplexity: 3.0731
     Story saved to generated_stories\story_5_qwen2.5vl_3b.txt

  -> Generating story 6/10 in genre: 'romance'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 1 prefix-match hit, remaining 261 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   43283.42 ms /   261 tokens (  165.84 ms per token,     6.03 tokens per second)
llama_perf_context_print:        eval time =  471929.92 ms /  1785 runs   (  264.39 ms per token,     3.78 tokens per second)
llama_perf_context_print:       total time =  562548.21 ms /  2046 tokens


  --> Perplexity: 2.0023
     Story saved to generated_stories\story_6_qwen2.5vl_3b.txt

  -> Generating story 7/10 in genre: 'thriller'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 1 prefix-match hit, remaining 225 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   32564.90 ms /   225 tokens (  144.73 ms per token,     6.91 tokens per second)
llama_perf_context_print:        eval time =  311777.83 ms /  1821 runs   (  171.21 ms per token,     5.84 tokens per second)
llama_perf_context_print:       total time =  395671.50 ms /  2046 tokens


  --> Perplexity: 2.2238
     Story saved to generated_stories\story_7_qwen2.5vl_3b.txt

  -> Generating story 8/10 in genre: 'historical fiction'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 3 prefix-match hit, remaining 228 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   38110.02 ms /   228 tokens (  167.15 ms per token,     5.98 tokens per second)
llama_perf_context_print:        eval time =  375350.02 ms /  1816 runs   (  206.69 ms per token,     4.84 tokens per second)
llama_perf_context_print:       total time =  453047.04 ms /  2044 tokens


  --> Perplexity: 2.1693
     Story saved to generated_stories\story_8_qwen2.5vl_3b.txt

  -> Generating story 9/10 in genre: 'western'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 3 prefix-match hit, remaining 360 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   34412.08 ms /   360 tokens (   95.59 ms per token,    10.46 tokens per second)
llama_perf_context_print:        eval time =  172842.50 ms /   772 runs   (  223.89 ms per token,     4.47 tokens per second)
llama_perf_context_print:       total time =  220864.24 ms /  1132 tokens


  --> Perplexity: 3.3687
     Story saved to generated_stories\story_9_qwen2.5vl_3b.txt

  -> Generating story 10/10 in genre: 'cyberpunk'...
  --> Prepended <|endoftext|> token for Qwen model.


Llama.generate: 2 prefix-match hit, remaining 149 prompt tokens to eval
llama_perf_context_print:        load time =   39131.21 ms
llama_perf_context_print: prompt eval time =   31450.32 ms /   149 tokens (  211.08 ms per token,     4.74 tokens per second)
llama_perf_context_print:        eval time =  134789.42 ms /   741 runs   (  181.90 ms per token,     5.50 tokens per second)
llama_perf_context_print:       total time =  186614.54 ms /   890 tokens


  --> Perplexity: 2.6773
     Story saved to generated_stories\story_10_qwen2.5vl_3b.txt

  Unloaded perplexity model for 'qwen2.5vl:3b' to free up memory.

FINAL EVALUATION METRICS

Metrics for model: llama3.2:3b
-----------------------------------
  Data Items Count: 10
  Average Inference Time: 66.1869 seconds/item
  Reject Rate (RR): 0.00%
  Average Perplexity: 2.9718 (lower is better)

Metrics for model: qwen2.5vl:3b
-----------------------------------
  Data Items Count: 10
  Average Inference Time: 72.4379 seconds/item
  Reject Rate (RR): 0.00%
  Average Perplexity: 2.6220 (lower is better)


3.2 Abstractive Text Summarization

In [None]:
import ollama
import os
import pandas as pd

#Configuration
stories_dir = 'generated_stories'
summaries_dir = 'generated_summaries'

if not os.path.exists(summaries_dir):
    os.makedirs(summaries_dir)

evaluation_results = []

#Main Summarization Loop
try:
    story_files = [f for f in os.listdir(stories_dir) if f.endswith('.txt')]
    if not story_files:
        print(f"Error: No story files found in the '{stories_dir}' directory.")
    else:
        print(f"Found {len(story_files)} stories to summarize.\n")
except FileNotFoundError:
    print(f"Error: The directory '{stories_dir}' does not exist. Please run the story generation script first.")
    story_files = []

for filename in story_files:
    # Read the original story and its metadata ---
    file_path = os.path.join(stories_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        model_name_from_file = lines[0].replace('MODEL: ', '').strip()
        story_text = ''.join(lines[5:])
    
    #Create the summarization prompt ---
    prompt = f"""
    Summarize the provided story into a single, concise paragraph that encapsulates the central plot, key characters, and main theme.

    To ensure the summary achieves a high ROUGE-1 F1 score, follow these steps:

    Identify the main storyline and pivotal scenes that define the beginning, middle, and conclusion.
    Pinpoint character development milestones and their impact on the storyline.
    Highlight the core message or theme underscored through the narrative.
    Where possible, integrate key phrases or important details from the original text to enhance summary quality.
    Use clear and straightforward sentences for effective communication of the summary’s ideas, ensuring the paragraph remains comprehensive yet succinct.
    IMPORTANT: Your response MUST be ONLY the summary paragraph itself.
    Do NOT output any other text, such as "Here is the summary..." or any explanation.

    The story text is provided below:
    “”" {story_text} “”"

    Your summary should fit under the section labeled “Concise Summary”.
    """

    print(f"Summarizing '{filename}' using model '{model_name_from_file}' ")

    # Generate the summary
    try:
        response = ollama.generate(
            model=model_name_from_file,
            prompt=prompt,
            options={'seed': 42, 'temperature': 0.2},
            stream=False
        )
        
        #  Store success result
        evaluation_results.append({
            "model": model_name_from_file,
            "task": "Summarization",
            "status": "success",
            "inference_time_ns": response.get('total_duration', 0)
        })

        summary_text = response['response']

        summary_filename = os.path.join(summaries_dir, f"summary_of_{filename}")
        with open(summary_filename, 'w', encoding='utf-8') as f:
            f.write(summary_text)

        print(f" Summary saved to {summary_filename}")

    except Exception as e:
        evaluation_results.append({
            "model": model_name_from_file,
            "task": "Summarization",
            "status": "failure",
            "inference_time_ns": 0
        })
        print(f"ERROR summarizing {filename}: {e}")

#Calculate and Print Final Metrics
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    models_in_results = results_df['model'].unique()
    
    print("\n" + "="*50)
    print("--- FINAL EVALUATION METRICS (for your report) ---")
    print("="*50)

    for model_name in models_in_results:
        model_df = results_df[results_df['model'] == model_name]
        if not model_df.empty:
            total_items = len(model_df)
            failures = len(model_df[model_df['status'] == 'failure'])
            successes = total_items - failures
            
            reject_rate = (failures / total_items) * 100 if total_items > 0 else 0
            avg_time_s = (model_df['inference_time_ns'].sum() / successes) / 1e9 if successes > 0 else 0
            
            print(f"\nMetrics for model: {model_name}")
            print("-" * 30)
            print(f"Data Items Count: {total_items}")
            print(f"Average Inference Time: {avg_time_s:.4f} seconds/item")
            print(f"Reject Rate (RR): {reject_rate:.2f}%")

Found 20 stories to summarize.

-> Summarizing 'story_10_llama3.2_3b.txt' using model 'llama3.2:3b'...
 Summary saved to generated_summaries\summary_of_story_10_llama3.2_3b.txt
-> Summarizing 'story_10_qwen2.5vl_3b.txt' using model 'qwen2.5vl:3b'...
 Summary saved to generated_summaries\summary_of_story_10_qwen2.5vl_3b.txt
-> Summarizing 'story_1_llama3.2_3b.txt' using model 'llama3.2:3b'...
 Summary saved to generated_summaries\summary_of_story_1_llama3.2_3b.txt
-> Summarizing 'story_1_qwen2.5vl_3b.txt' using model 'qwen2.5vl:3b'...
 Summary saved to generated_summaries\summary_of_story_1_qwen2.5vl_3b.txt
-> Summarizing 'story_2_llama3.2_3b.txt' using model 'llama3.2:3b'...
 Summary saved to generated_summaries\summary_of_story_2_llama3.2_3b.txt
-> Summarizing 'story_2_qwen2.5vl_3b.txt' using model 'qwen2.5vl:3b'...
 Summary saved to generated_summaries\summary_of_story_2_qwen2.5vl_3b.txt
-> Summarizing 'story_3_llama3.2_3b.txt' using model 'llama3.2:3b'...
 Summary saved to generated

3.3 Natural Language Inference

In [None]:
import ollama
import pandas as pd
import os


nli_dataset_path = './nli/nli.csv' 
models_to_test = ['llama3.2:3b', 'qwen2.5vl:3b']


evaluation_results = []

prompt_template = """
Analyze and classify the relationship between the Premise and the Hypothesis provided. Use one of the following labels to signify the relationship:

entails - when the Hypothesis logically follows from the Premise.
contradicts - when the Hypothesis is logically refuted by the Premise.
neutral - when the Hypothesis neither contradicts nor is entailed by the Premise.
IMPORTANT: Your answer must include exactly one of these words: entails, contradicts, or neutral. Do NOT include other text or explanations.

Please study the examples provided below:
Example
Premise: “A man is driving a car on a scenic road.”
Hypothesis 1: “A man is in a vehicle.”
Answer 1: entails

Hypothesis 2: “Nobody is driving any vehicle.”
Answer 2: contradicts

Hypothesis 3: “The road has many sharp turns.”Answer 3: neutral
Now classify the following pair:
“”"
Premise: {premise}
Hypothesis: {hypothesis}
Answer:
"""

try:
    df = pd.read_csv(nli_dataset_path)
    print(f"Successfully loaded {len(df)} items from the NLI dataset.")
    # df = df.head(10) # For faster testing
except FileNotFoundError:
    print(f"ERROR: Dataset not found at '{nli_dataset_path}'.")
    df = None

# Main NLI Processing Loop
if df is not None:
    for index, row in df.iterrows():
        premise = row['premise']
        hypothesis = row['hypothesis']
        gold_label = row['label'] 

        print(f"\nProcessing item {index + 1}/{len(df)}...")
        
        for model_name in models_to_test:
            prompt = prompt_template.format(premise=premise, hypothesis=hypothesis)
            
            try:
                response = ollama.generate(
                    model=model_name,
                    prompt=prompt,
                    options={'seed': 42, 'temperature': 0.1},
                    stream=False
                )
                
                predicted_label = response['response'].strip().lower().split()[0]
                evaluation_results.append({
                    "model": model_name,
                    "status": "success",
                    "inference_time_ns": response.get('total_duration', 0),
                    "is_correct": 1 if predicted_label == gold_label else 0
                })
                print(f"  - Model: {model_name:<20} | Prediction: {predicted_label:<15} | Actual: {gold_label}")

            except Exception as e:
            
                evaluation_results.append({
                    "model": model_name,
                    "status": "failure",
                    "inference_time_ns": 0,
                    "is_correct": 0
                })
                print(f" ERROR with model {model_name}: {e}")

# Calculate and Print Final Metrics 
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    
    print("\n" + "="*50)
    print("FINAL EVALUATION METRICS")
    print("="*50)

    for model_name in models_to_test:
        model_df = results_df[results_df['model'] == model_name]
        
        if not model_df.empty:
            total_items = len(model_df)
            failures = len(model_df[model_df['status'] == 'failure'])
            successes = total_items - failures

            reject_rate = (failures / total_items) * 100 if total_items > 0 else 0
            avg_time_s = (model_df['inference_time_ns'].sum() / successes) / 1e9 if successes > 0 else 0
            accuracy = (model_df['is_correct'].sum() / successes) * 100 if successes > 0 else 0

            print(f"\nMetrics for model: {model_name}")
            print("-" * 30)
            print(f"Data Items Count: {total_items}")
            print(f"Average Inference Time: {avg_time_s:.4f} seconds/item")
            print(f"Reject Rate (RR): {reject_rate:.2f}%")
            print(f"Classification Accuracy: {accuracy:.2f}%")

Successfully loaded 100 items from the NLI dataset.

Processing item 1/100...
  - Model: llama3.2:3b          | Prediction: entails         | Actual: contradicts
 ERROR with model qwen2.5vl:3b: llama runner process has terminated: exit status 2 (status code: 500)

Processing item 2/100...
  - Model: llama3.2:3b          | Prediction: entails         | Actual: neutral
  - Model: qwen2.5vl:3b         | Prediction: neutral         | Actual: neutral

Processing item 3/100...
  - Model: llama3.2:3b          | Prediction: entails         | Actual: entails
  - Model: qwen2.5vl:3b         | Prediction: entails         | Actual: entails

Processing item 4/100...
  - Model: llama3.2:3b          | Prediction: contradicts     | Actual: contradicts
  - Model: qwen2.5vl:3b         | Prediction: neutral         | Actual: contradicts

Processing item 5/100...
  - Model: llama3.2:3b          | Prediction: entails         | Actual: neutral
  - Model: qwen2.5vl:3b         | Prediction: neutral         | 

3.4 Image Captioning

In [None]:
import ollama
import os
import pandas as pd

vlm_model_name = 'qwen2.5vl:3b'  

image_dataset_path = './ic/images'

output_dir = 'generated_captions'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

evaluation_results = []

# Get the list of image files
try:
    image_files = [f for f in os.listdir(image_dataset_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    if not image_files:
        print(f"Error: No image files found in '{image_dataset_path}'. Please check the path.")
    else:
        print(f"Found {len(image_files)} images to caption.")
except FileNotFoundError:
    print(f"ERROR: Image directory not found at '{image_dataset_path}'.")
    image_files = []

#Main Image Captioning Loop 
for i, image_filename in enumerate(image_files):
    image_path = os.path.join(image_dataset_path, image_filename)
    print(f"\nProcessing image {i + 1}/{len(image_files)}: {image_filename}")
    
    prompt = """Generate a high-quality, single-sentence caption for this image. The caption should be factual and clearly describe the main subject, action, and setting."""
    try:
    
        response = ollama.generate(
            model=vlm_model_name,
            prompt=prompt,
            images=[image_path],  
            stream=False
        )
        
        # Record success for evaluation
        evaluation_results.append({
            "model": vlm_model_name, "task": "Image Captioning", "status": "success",
            "inference_time_ns": response.get('total_duration', 0)
        })
        
        # Save the generated caption
        caption_text = response['response']
        caption_filename = os.path.join(output_dir, f"{os.path.splitext(image_filename)[0]}_caption.txt")
        with open(caption_filename, 'w', encoding='utf-8') as f:
            f.write(caption_text)
        
        print(f" Caption generated and saved.")

    except Exception as e:
        #Record failure for evaluation
        evaluation_results.append({
            "model": vlm_model_name, "task": "Image Captioning", "status": "failure",
            "inference_time_ns": 0
        })
        print(f"ERROR processing {image_filename}: {e}")

# Calculate and Print Final Metrics 
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    print("\n" + "="*50 + "\n EVALUATION FOR IMAGE CAPTIONING\n" + "="*50)
    
    model_df = results_df[results_df['model'] == vlm_model_name]
    if not model_df.empty:
        total_items = len(model_df)
        failures = len(model_df[model_df['status'] == 'failure'])
        successes = total_items - failures
        
        reject_rate = (failures / total_items) * 100 if total_items > 0 else 0
        avg_time_s = (model_df['inference_time_ns'].sum() / successes) / 1e9 if successes > 0 else 0
        
        print(f"\nMetrics for model: {vlm_model_name}")
        print("-" * 30)
        print(f"Data Items Count: {total_items}")
        print(f"Average Inference Time: {avg_time_s:.4f} seconds/item")
        print(f"Reject Rate (RR): {reject_rate:.2f}%")
        

Found 100 images to caption.

Processing image 1/100: ic-001.jpg
 Caption generated and saved.

Processing image 2/100: ic-002.jpg
 Caption generated and saved.

Processing image 3/100: ic-003.jpg
 Caption generated and saved.

Processing image 4/100: ic-004.jpg
 Caption generated and saved.

Processing image 5/100: ic-005.jpg
 Caption generated and saved.

Processing image 6/100: ic-006.jpg
 Caption generated and saved.

Processing image 7/100: ic-007.jpg
 Caption generated and saved.

Processing image 8/100: ic-008.jpg
 Caption generated and saved.

Processing image 9/100: ic-009.jpg
 Caption generated and saved.

Processing image 10/100: ic-010.jpg
 Caption generated and saved.

Processing image 11/100: ic-011.jpg
 Caption generated and saved.

Processing image 12/100: ic-012.jpg
 Caption generated and saved.

Processing image 13/100: ic-013.jpg
 Caption generated and saved.

Processing image 14/100: ic-014.jpg
 Caption generated and saved.

Processing image 15/100: ic-015.jpg
 Capt

3.5 Visual Question Answering

In [None]:
import ollama
import os
import pandas as pd

vlm_model_name = 'qwen2.5vl:3b'  

vqa_dataset_path = './vqa/vqa.csv' 
image_dir = './vqa/images'  
output_dir = 'vqa_results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

evaluation_results = []

try:
    df = pd.read_csv(vqa_dataset_path)
    print(f"Successfully loaded {len(df)} items from the VQA dataset.")
    # df = df.head(5) # For faster testing
except FileNotFoundError:
    print(f"ERROR: Dataset metadata file not found at '{vqa_dataset_path}'.")
    df = None

#Main VQA Loop
if df is not None:
    for index, row in df.iterrows(): 
        image_filename = row['image']
        
        question = row['question']
        options_str = row['options']
        correct_answer = row['answer'].strip().upper()
        
        image_path = os.path.join(image_dir, image_filename)
        
        print(f"\nProcessing item {index + 1}/{len(df)}: {image_filename}")

        prompt = f"""
        Answer the following multiple-choice question based on the image.
        Respond with ONLY the capital letter of the correct option (A, B, C, D, or E). Do not provide any other text or explanation.

        Question: {question}
        Options: {options_str}

        Correct Answer Letter:
        """

        # Generate the answer 
        try:
            response = ollama.generate(
                model=vlm_model_name,
                prompt=prompt,
                images=[image_path],
                options={'seed': 42, 'temperature': 0.0},
                stream=False
            )
            
            #Parse and validate the prediction
            prediction = response['response'].strip().upper()
            if len(prediction) > 0 and 'A' <= prediction[0] <= 'E':
                predicted_letter = prediction[0]
                status = "success"
            else:
                predicted_letter = "invalid_format"
                status = "failure" 

            #Record results 
            evaluation_results.append({
                "model": vlm_model_name, "task": "VQA", "status": status,
                "inference_time_ns": response.get('total_duration', 0),
                "is_correct": 1 if predicted_letter == correct_answer else 0
            })
            print(f"  - Model Prediction: {predicted_letter} | Correct Answer: {correct_answer}")

        except Exception as e:
            evaluation_results.append({
                "model": vlm_model_name, "task": "VQA", "status": "failure",
                "inference_time_ns": 0, "is_correct": 0
            })
            print(f"ERROR processing {image_filename}: {e}")

#  Calculate and Print Final Metrics 
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    print("\n" + "="*50 + "\n EVALUATION FOR VISUAL QUESTION ANSWERING \n" + "="*50)
    
    total_items = len(results_df)
    failures = len(results_df[results_df['status'] == 'failure'])
    successes = total_items - failures
    
    reject_rate = (failures / total_items) * 100 if total_items > 0 else 0
    avg_time_s = (results_df['inference_time_ns'].sum() / successes) / 1e9 if successes > 0 else 0
    em_accuracy = (results_df['is_correct'].sum() / successes) * 100 if successes > 0 else 0

    print(f"\nMetrics for model: {vlm_model_name}")
    print("-" * 30)
    print(f"Data Items Count: {total_items}")
    print(f"Average Inference Time: {avg_time_s:.4f} seconds/item")
    print(f"Reject Rate (RR): {reject_rate:.2f}%")
    print(f"Exact Match (EM) Accuracy: {em_accuracy:.2f}%")

Successfully loaded 100 items from the VQA dataset.

Processing item 1/100: vqa-001.jpg
  - Model Prediction: A | Correct Answer: C

Processing item 2/100: vqa-002.jpg
  - Model Prediction: B | Correct Answer: C

Processing item 3/100: vqa-003.jpg
  - Model Prediction: A | Correct Answer: E

Processing item 4/100: vqa-004.jpg
  - Model Prediction: D | Correct Answer: D

Processing item 5/100: vqa-005.jpg
  - Model Prediction: A | Correct Answer: B

Processing item 6/100: vqa-006.jpg
  - Model Prediction: A | Correct Answer: B

Processing item 7/100: vqa-007.jpg
  - Model Prediction: E | Correct Answer: C

Processing item 8/100: vqa-008.jpg
  - Model Prediction: B | Correct Answer: B

Processing item 9/100: vqa-009.jpg
  - Model Prediction: E | Correct Answer: D

Processing item 10/100: vqa-010.jpg
  - Model Prediction: E | Correct Answer: A

Processing item 11/100: vqa-011.jpg
  - Model Prediction: A | Correct Answer: A

Processing item 12/100: vqa-012.jpg
  - Model Prediction: B | Cor

4 Evaluation

 ROUGE-1 F1

In [4]:
import sys
!{sys.executable} -m pip install rouge-score

Collecting rouge-score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk (from rouge-score)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk->rouge-score)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk->rouge-score)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk->rouge-score)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk->rouge-score)
  Using cached tqdm-4.67.1

In [23]:
from rouge_score import rouge_scorer
import os
import pandas as pd

stories_dir = 'generated_stories'
summaries_dir = 'generated_summaries'
models = ['llama3.2_3b', 'qwen2.5vl_3b'] 

print("Calculating ROUGE-1 F1 Scores for Summarization ")

for model_name in models:
    scores = []
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    
    #Find all summaries for the current model
    summary_files = [f for f in os.listdir(summaries_dir) if model_name in f]
    
    for summary_filename in summary_files:
        original_story_filename = summary_filename.replace('summary_of_', '')
        
        story_path = os.path.join(stories_dir, original_story_filename)
        summary_path = os.path.join(summaries_dir, summary_filename)
        
        try:
            with open(story_path, 'r', encoding='utf-8') as f:
                story_text = ''.join(f.readlines()[5:]) # Skip header

            with open(summary_path, 'r', encoding='utf-8') as f:
                summary_text = f.read()

            # Calculate score
            rouge_scores = scorer.score(story_text, summary_text)
            scores.append(rouge_scores['rouge1'].fmeasure)
        
        except FileNotFoundError:
            print(f"Warning: Could not find matching files for {summary_filename}")

    # Calculate and print the average score for the model
    if scores:
        average_f1 = sum(scores) / len(scores)
        print(f"\nAverage ROUGE-1 F1 Score for {model_name.replace('_', ':')}: {average_f1:.4f}")

Calculating ROUGE-1 F1 Scores for Summarization 

Average ROUGE-1 F1 Score for llama3.2:3b: 0.4667

Average ROUGE-1 F1 Score for qwen2.5vl:3b: 0.5630


CIDEr (Consensus-based Image Description Eval
uation)

In [9]:
import sys
!{sys.executable} -m pip install pycocotools pycocoevalcap

Collecting pycocotools
  Using cached pycocotools-2.0.10-cp312-abi3-win_amd64.whl.metadata (1.3 kB)
Collecting pycocoevalcap
  Using cached pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Using cached pycocotools-2.0.10-cp312-abi3-win_amd64.whl (76 kB)
Using cached pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
Installing collected packages: pycocotools, pycocoevalcap

   -------------------- ------------------- 1/2 [pycocoevalcap]
   -------------------- ------------------- 1/2 [pycocoevalcap]
   -------------------- ------------------- 1/2 [pycocoevalcap]
   ---------------------------------------- 2/2 [pycocoevalcap]

Successfully installed pycocoevalcap-1.2 pycocotools-2.0.10


In [24]:
from pycocoevalcap.cider.cider import Cider
import os
import pandas as pd
import ast 

captions_dir = 'generated_captions'

ground_truth_csv_path = './ic/ic.csv'

try:
    ground_truth_df = pd.read_csv(ground_truth_csv_path)
    print("Successfully loaded ground truth captions from CSV.")
except FileNotFoundError:
    print(f"ERROR: Ground truth file not found at '{ground_truth_csv_path}'")
    ground_truth_df = None

if ground_truth_df is not None:
    
    gts = {}  
    res = {}  

    for index, row in ground_truth_df.iterrows():
        image_filename = row['image']
        human_captions_str = row['human_captions']
        
        image_id = os.path.splitext(image_filename)[0]
        gts[image_id] = ast.literal_eval(human_captions_str)
        
        generated_caption_path = os.path.join(captions_dir, f"{image_id}_caption.txt")
        try:
            with open(generated_caption_path, 'r', encoding='utf-8') as f:
                res[image_id] = [f.read().strip()]
        except FileNotFoundError:
            res[image_id] = [""]

    # Calculate CIDEr Score 
    print("\n Calculating CIDEr Score for Image Captioning ")
    cider_scorer = Cider()

    (score, scores) = cider_scorer.compute_score(gts, res)

    print(f"\nOverall CIDEr Score: {score:.4f}")

Successfully loaded ground truth captions from CSV.

 Calculating CIDEr Score for Image Captioning 

Overall CIDEr Score: 0.1721
