In [None]:
# --- Google Colab Setup (Cell 0) ---
# This cell installs Ollama, starts the server, and pulls the specific model.

print("Installing necessary Python packages...")
# Remove pandas/numpy/matplotlib/requests/scikit-learn from !pip as they are usually pre-installed
# Adding explicitly to ensure if not present (though they generally are)
!pip install pandas numpy scikit-learn requests

print("Installing Ollama...")
# Download and execute the Ollama installation script
!curl -fsSL https://ollama.com/install.sh | sh

import subprocess
import time
import os
import socket
from pathlib import Path

# Set environment variable for Ollama to listen on all interfaces within the Colab VM
# This allows other Colab cells to connect to the Ollama server.
os.environ['OLLAMA_HOST'] = '0.0.0.0'
OLLAMA_PORT = 11434 # Ollama's default API port

print("Ensuring no previous Ollama processes are running...")
# Kill any existing Ollama processes to ensure a clean start
!pkill ollama || true

print(f"Starting Ollama server on port {OLLAMA_PORT} in background...")
# Start the Ollama server as a background process
ollama_process = subprocess.Popen(["ollama", "serve"],
                                  stdout=subprocess.DEVNULL, # Redirect stdout to /dev/null
                                  stderr=subprocess.DEVNULL, # Redirect stderr to /dev/null
                                  preexec_fn=os.setsid) # Detach from the current process group
print(f"Ollama server process started with PID: {ollama_process.pid}")

print(f"Waiting for Ollama server to become available on port {OLLAMA_PORT}...")
max_wait_time = 90 # seconds (increased for model loading)
start_time = time.time()
server_ready = False

# Loop to check if the Ollama server is responsive
while time.time() - start_time < max_wait_time:
    try:
        # Attempt to create a socket connection to the Ollama server
        with socket.create_connection(('127.0.0.1', OLLAMA_PORT), timeout=1):
            server_ready = True
            break
    except (ConnectionRefusedError, socket.timeout):
        # If connection is refused or times out, check if the Ollama process is still running
        if ollama_process.poll() is not None:
            print(f"Ollama server process died unexpectedly (exit code: {ollama_process.poll()}).")
            print("Please check previous logs for errors or try restarting the runtime.")
            server_ready = False
            break
        print(f"  Still waiting for Ollama... ({int(time.time() - start_time)}s elapsed)")
        time.sleep(5) # Wait 5 seconds before retrying
    except Exception as e:
        print(f"An unexpected error occurred during Ollama startup check: {e}")
        server_ready = False
        break

if not server_ready:
    print(f"Ollama server did not become available on port {OLLAMA_PORT} within {max_wait_time} seconds. Aborting setup.")
    if ollama_process.poll() is None: # If the process is still running, try to terminate it
        ollama_process.terminate()
        print("Attempted to terminate non-responsive Ollama process.")
    raise RuntimeError("Ollama server failed to start or become ready.")
else:
    print(f"Ollama server is now available on port {OLLAMA_PORT}.")

# --- Pull the specific model for this notebook ---
MODEL_NAME = "qwen:7b" # <--- MODEL TO PULL FOR THIS NOTEBOOK
print(f"\nPulling {MODEL_NAME} model...")
# Run the ollama pull command
pull_process = subprocess.run(["ollama", "pull", MODEL_NAME], capture_output=True, text=True)
print(pull_process.stdout)
if pull_process.stderr:
    print("Ollama pull stderr:", pull_process.stderr)
    if "Error" in pull_process.stderr or "failed" in pull_process.stderr.lower():
        raise RuntimeError(f"Failed to pull Ollama model: {pull_process.stderr}")
print(f"{MODEL_NAME} pull complete.")

# Check if model is available (list all downloaded models)
list_process = subprocess.run(["ollama", "list"], capture_output=True, text=True)
print(f"\nAvailable Ollama models after pulling {MODEL_NAME}:")
print(list_process.stdout)
if list_process.stderr:
    print("Ollama list stderr:", list_process.stderr)

# Verify GPU availability with nvidia-smi
print("\nChecking GPU availability with nvidia-smi...")
!nvidia-smi

# Set the base URL and model name as environment variables for use in subsequent cells
OLLAMA_API_BASE_URL = f"http://127.0.0.1:{OLLAMA_PORT}"
print(f"Set Ollama API base URL for direct requests to: {OLLAMA_API_BASE_URL}")
%env OLLAMA_MODEL_NAME=$MODEL_NAME
%env OLLAMA_API_BASE_URL=$OLLAMA_API_BASE_URL

Installing necessary Python packages...
Installing Ollama...
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Ensuring no previous Ollama processes are running...
Starting Ollama server on port 11434 in background...
Ollama server process started with PID: 6866
Waiting for Ollama server to become available on port 11434...
  Still waiting for Ollama... (0s elapsed)
Ollama server is now available on port 11434.

Pulling qwen:7b model...

Ollama pull stderr: [?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [

In [None]:
# Cell 1: Initialize Client & Confirm Server is Running
import os
import json
import pandas as pd
import numpy as np # Used later for classification_report
import matplotlib.pyplot as plt # Used later for plotting
import requests
import time
import socket
import re

# Use the API base URL from the environment variable set in Cell 0
base_url = os.getenv('OLLAMA_API_BASE_URL')
api_url_generate = f"{base_url}/api/generate" # Direct generate endpoint
api_url_show = f"{base_url}/api/show" # To check model details

print(f"Using Ollama API at: {base_url}")
print(f"Model to use: {os.getenv('OLLAMA_MODEL_NAME')}")


# Confirm server is running and model is accessible (more robust check)
def check_ollama_and_model(base_url, model_name):
    try:
        # Check if Ollama server is generally responsive
        response = requests.get(f"{base_url}/api/tags", timeout=10)
        response.raise_for_status() # Raise an exception for bad status codes
        print("Ollama server is responsive.")

        # Check if the specific model is loaded
        payload = {"name": model_name}
        response = requests.post(api_url_show, json=payload, timeout=10)
        response.raise_for_status()
        print(f"Model '{model_name}' is loaded and accessible.")
        return True
    except requests.exceptions.ConnectionError as e:
        print(f"Error: Could not connect to Ollama server at {base_url}. Please ensure it's running.")
        print(f"Details: {e}")
        return False
    except requests.exceptions.Timeout:
        print(f"Error: Request to Ollama server timed out.")
        return False
    except requests.exceptions.RequestException as e:
        print(f"Error checking Ollama or model: {e}")
        if response:
            print(f"Response status: {response.status_code}")
            print(f"Response body: {response.text}")
        return False

if not check_ollama_and_model(base_url, os.getenv('OLLAMA_MODEL_NAME')):
    raise RuntimeError("Ollama server or model not ready. Please check Cell 0 execution.")
else:
    print("Ollama server and model are ready for use.")

# Note: The OpenAI client (from `openai` library) is not directly used here
# because we're interacting with Ollama's native API via `requests` as set up.
# If you explicitly need `openai` client for `ollama` (e.g., using `ollama run --port 8000`),
# then ensure `base_url` points to that port and the `client` object is initialized.

Using Ollama API at: http://127.0.0.1:11434
Model to use: qwen:7b
Ollama server is responsive.
Model 'qwen:7b' is loaded and accessible.
Ollama server and model are ready for use.


In [None]:
# Cell 2: Load Full Prepared Dataset (2400 Samples) and create stratified splits
import pandas as pd
from sklearn.model_selection import train_test_split # We can use this or manual group by

# Load the already-prepared dataset. Adjust path if your file is in Google Drive.
full_df = pd.read_csv('/content/cwe_top5_sampled_with_juliet_none.csv')
total_samples = len(full_df)
print(f"Total samples in full dataset: {total_samples}")

# Validate dataset
print("Validating full dataset...")
expected_cwes = {'CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191', 'none'}
assert set(full_df['cwe'].unique()) == expected_cwes, "Unexpected CWE values found!"
assert full_df['code'].notna().all(), "Missing code values found!"

print("\nFull dataset CWE distribution:")
print(full_df['cwe'].value_counts())

# --- Create Stratified Training and Testing Splits ---
# We need 320 samples per class for training (1920 total)
# and 80 samples per class for testing (480 total).

train_split_df = pd.DataFrame()
test_split_df = pd.DataFrame()

# Group by 'cwe' and split each group
for cwe_type in expected_cwes:
    cwe_subset = full_df[full_df['cwe'] == cwe_type].reset_index(drop=True)

    # Ensure there are enough samples for the split
    if len(cwe_subset) < (320 + 80):
        print(f"Warning: Not enough samples for CWE type '{cwe_type}'. Expected at least 400, found {len(cwe_subset)}. This might cause issues.")
        # Handle cases where there might not be 400 samples per class if necessary
        # For now, we'll proceed, but it might result in fewer than 320/80 if data is short.

    # Take the first 320 for training, and next 80 for testing for each CWE type
    train_split_df = pd.concat([train_split_df, cwe_subset.iloc[:320]])
    test_split_df = pd.concat([test_split_df, cwe_subset.iloc[320:320+80]]) # Ensure it's 320+80 if some subsets are smaller

# Shuffle the splits to randomize the order within them, important for training/batching
train_split_df = train_split_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_split_df = test_split_df.sample(frac=1, random_state=42).reset_index(drop=True)


print(f"\nTraining split size: {len(train_split_df)} samples")
print("Training split CWE distribution:")
print(train_split_df['cwe'].value_counts())

print(f"\nTesting split size: {len(test_split_df)} samples")
print("Testing split CWE distribution:")
print(test_split_df['cwe'].value_counts())

print("\nSample code from training split:", train_split_df['code'].iloc[0][:100]) # Truncate for display
print("Sample code from testing split:", test_split_df['code'].iloc[0][:100])

# Display head of splits (optional)
print("\nTrain Split Head:")
print(train_split_df.head())
print("\nTest Split Head:")
print(test_split_df.head())

Total samples in full dataset: 2400
Validating full dataset...

Full dataset CWE distribution:
cwe
CWE121    400
CWE78     400
CWE190    400
CWE191    400
CWE122    400
none      400
Name: count, dtype: int64

Training split size: 1920 samples
Training split CWE distribution:
cwe
CWE122    320
none      320
CWE190    320
CWE191    320
CWE78     320
CWE121    320
Name: count, dtype: int64

Testing split size: 480 samples
Testing split CWE distribution:
cwe
CWE78     80
none      80
CWE191    80
CWE122    80
CWE121    80
CWE190    80
Name: count, dtype: int64

Sample code from training split: #include "std_testcase.h"
#ifdef _WIN32
#include <winsock2.h>
#include <windows.h>
#include <direct.
Sample code from testing split: #include "std_testcase.h"
#include <wchar.h>
#ifdef _WIN32
#define COMMAND_INT_PATH "%WINDIR%\\syste

Train Split Head:
                                                file     cwe label  \
0  CWE122_Heap_Based_Buffer_Overflow__c_CWE129_co...  CWE122  good   
1        

In [None]:
# Cell 3: Define Prompt (Exact from Old Code)
# The prompt is critical for guiding the model's output to your desired JSON format.
prompt = (
    "You are a security analysis tool. Analyze the C source code for vulnerabilities and respond **only** with a JSON object containing 'reason' and 'answer' fields. Do not include any additional text, explanations, or markdown outside the JSON. Use this exact format:\n"
    "{ \"reason\": \"Explanation of vulnerability\", \"answer\": \"CWE121\" }\n"
    "The 'answer' field must be one of: 'CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191', or 'none'. Match the 'answer' to the vulnerability type described in the 'reason'. Examples:\n"
    "- Code with `memcpy` without bounds check: { \"reason\": \"Stack-based buffer overflow due to unchecked memcpy\", \"answer\": \"CWE121\" }\n"
    "- Code with `system` using user input: { \"reason\": \"OS command injection via unsanitized input to system\", \"answer\": \"CWE78\" }\n"
    "- Code with `malloc` and overflow: { \"reason\": \"Heap-based buffer overflow due to improper malloc usage\", \"answer\": \"CWE122\" }\n"
    "- Code with large integer multiplication: { \"reason\": \"Integer overflow from unchecked multiplication\", \"answer\": \"CWE190\" }\n"
    "- Code with negative integer decrement: { \"reason\": \"Integer underflow from unchecked decrement\", \"answer\": \"CWE191\" }\n"
    "- Code with no obvious vulnerability: { \"reason\": \"No vulnerability detected\", \"answer\": \"none\" }\n"
    "Here is the code:\n\n"
)

# Apply the prompt to both training and testing splits
train_split_df['full_prompt'] = prompt + "\n\n" + train_split_df['code']
test_split_df['full_prompt'] = prompt + "\n\n" + test_split_df['code']

print("Prompt sample from training split (truncated):", train_split_df['full_prompt'].iloc[0][:500])
print("\nPrompt sample from testing split (truncated):", test_split_df['full_prompt'].iloc[0][:500])

Prompt sample from training split (truncated): You are a security analysis tool. Analyze the C source code for vulnerabilities and respond **only** with a JSON object containing 'reason' and 'answer' fields. Do not include any additional text, explanations, or markdown outside the JSON. Use this exact format:
{ "reason": "Explanation of vulnerability", "answer": "CWE121" }
The 'answer' field must be one of: 'CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191', or 'none'. Match the 'answer' to the vulnerability type described in the 'reason'. Exam

Prompt sample from testing split (truncated): You are a security analysis tool. Analyze the C source code for vulnerabilities and respond **only** with a JSON object containing 'reason' and 'answer' fields. Do not include any additional text, explanations, or markdown outside the JSON. Use this exact format:
{ "reason": "Explanation of vulnerability", "answer": "CWE121" }
The 'answer' field must be one of: 'CWE121', 'CWE78', 'CWE122', 'CWE190', 'C

In [None]:
# Cell 4: Create Backups directory and initialize response lists
from pathlib import Path
import json
import pandas as pd # Ensure pandas is imported

# Define paths for Colab environment
BACKUP_DIR = Path("/content/CodeSentinel/Backups")
RESULTS_DIR = Path("/content/CodeSentinel/Results/_2")

BACKUP_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Define backup filenames for each split
MODEL_NAME = os.getenv('OLLAMA_MODEL_NAME').replace(':', '_').replace('/', '-') # Sanitize model name for filename
TRAIN_BACKUP_PATH = BACKUP_DIR / f"Result-{MODEL_NAME}-CWE-train.csv"
TEST_BACKUP_PATH = BACKUP_DIR / f"Result-{MODEL_NAME}-CWE-test.csv"

def save_responses(responses_list, path):
    """Saves a list of raw string responses to a CSV file."""
    print(f"Debug: Responses before saving to {path}:")
    cleaned_responses = []
    for idx, response in enumerate(responses_list):
        if pd.isna(response) or response is None:
            print(f"Response {idx} is invalid (NaN/None), replacing with fallback for saving.")
            response = json.dumps({"reason": "Invalid response", "answer": "none"})
        # Ensure it's a string representation of JSON
        if not isinstance(response, str):
            response = json.dumps(response) # Convert dicts back to string JSON if needed
        print(f"Response {idx}: {response[:100]}... (type: {type(response)})") # Truncate for print
        cleaned_responses.append(response)
    df = pd.DataFrame(cleaned_responses, columns=['raw_response_json_string'])
    df.to_csv(path, index=False)
    print(f"Responses saved to: {path}")

# Start fresh by resetting responses for both splits
training_responses = []
testing_responses = []
print("Response lists reset to empty for fresh start.")

Response lists reset to empty for fresh start.


In [None]:
# Cell 5: Run Inference Loops for Training and Testing Splits
import requests
import time
import json
import os # Make sure os is imported
from tqdm.notebook import tqdm # For nice progress bars in Colab
from sklearn.metrics import confusion_matrix, classification_report # For evaluation later

# Retrieve API details from environment variables
api_url_generate = f"{os.getenv('OLLAMA_API_BASE_URL')}/api/generate"
MODEL_NAME_FULL = os.getenv('OLLAMA_MODEL_NAME') # The full model name with colon

max_retries = 3
retry_delay = 5  # seconds
debug_mode = False # Set to True for very verbose logging per request (can slow down)

def process_dataframe_for_inference(df_to_process, response_list_target):
    """
    Processes a DataFrame, sends prompts to Ollama, and appends raw responses to a list.
    Handles retries and errors.
    """
    err_logs_local = []
    print(f"\nStarting inference for {len(df_to_process)} samples using model '{MODEL_NAME_FULL}'...")

    for index, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing Samples"):
        attempts = 0
        success = False
        source_code = row['code'] # Use original code from row for prompt construction
        full_prompt_text = row['full_prompt'] # Use the pre-constructed full_prompt

        if debug_mode:
            print(f"\nASKED (Index {index}):")
            print(f"Source (truncated): {source_code[:200]}...")
            print(f"Full Prompt (truncated): {full_prompt_text[:300]}...")

        while attempts < max_retries and not success:
            try:
                if debug_mode:
                    print(f"Attempt {attempts + 1}: Sending request to {api_url_generate}...")
                start_time = time.time()
                payload = {
                    "model": MODEL_NAME_FULL,
                    "prompt": full_prompt_text,
                    "stream": False # We want the full response at once
                }

                if debug_mode:
                    print(f"Payload length: {len(payload['prompt'])} chars")

                response = requests.post(api_url_generate, json=payload, timeout=300) # Increased timeout to 300s (5 min) for large models/prompts
                response.raise_for_status() # Raise an error for bad status codes (4xx or 5xx)

                end_time = time.time()
                full_ollama_response = response.json()
                raw_content = full_ollama_response.get('response', '') # The actual text response from Ollama

                if debug_mode:
                    print(f"Request took {end_time - start_time:.2f} seconds")
                    print(f"Full Ollama JSON response: {json.dumps(full_ollama_response, indent=2)}")
                    print(f"Raw content from 'response' field: {raw_content[:200]}...")

                # Append the raw response string (or full JSON dict if preferred for later parsing)
                # Storing the full JSON dict ensures all metadata is preserved
                response_list_target.append(full_ollama_response)
                success = True

            except requests.RequestException as e:
                attempts += 1
                end_time = time.time()
                log_entry = {
                    "index": index,
                    "error_type": type(e).__name__,
                    "message": str(e),
                    "attempt": attempts,
                    "elapsed_time": end_time - start_time,
                    "response_status_code": getattr(response, 'status_code', 'N/A'),
                    "response_text_preview": getattr(response, 'text', 'No response')[:500]
                }
                err_logs_local.append(log_entry)
                print(f"ERROR at index {index}, attempt {attempts}/{max_retries} : {e}")
                print(f"Request failed after {end_time - start_time:.2f} seconds.")

                if attempts < max_retries:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    fallback_response_dict = {
                        "response": json.dumps({"reason": "Failed after max attempts", "answer": "none"}),
                        "status": "error",
                        "error_message": f"Max retries reached: {log_entry['message']}"
                    }
                    response_list_target.append(fallback_response_dict)
                    print(f"Max retries reached for index {index}, stored fallback response.")
            except Exception as e: # Catch any other unexpected errors
                attempts += 1
                end_time = time.time()
                log_entry = {
                    "index": index,
                    "error_type": type(e).__name__,
                    "message": str(e),
                    "attempt": attempts,
                    "elapsed_time": end_time - start_time,
                    "response_status_code": 'N/A', # No response object here
                    "response_text_preview": 'No response due to non-request error'
                }
                err_logs_local.append(log_entry)
                print(f"UNEXPECTED ERROR at index {index}, attempt {attempts}/{max_retries} : {e}")
                if attempts < max_retries:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    fallback_response_dict = {
                        "response": json.dumps({"reason": "Failed due to unexpected error", "answer": "none"}),
                        "status": "error",
                        "error_message": f"Unexpected error after max retries: {log_entry['message']}"
                    }
                    response_list_target.append(fallback_response_dict)
                    print(f"Max retries reached for index {index} due to unexpected error, stored fallback.")

    print(f"Inference loop finished. Total errors logged for this run: {len(err_logs_local)}")
    return err_logs_local

# Global error log list
all_err_logs = []

# --- Run inference for Training Split ---
print("--- Running Inference for Training Split ---")
train_err_logs = process_dataframe_for_inference(train_split_df, training_responses)
all_err_logs.extend(train_err_logs)
save_responses(training_responses, TRAIN_BACKUP_PATH)

# --- Run inference for Testing Split ---
print("\n--- Running Inference for Testing Split ---")
test_err_logs = process_dataframe_for_inference(test_split_df, testing_responses)
all_err_logs.extend(test_err_logs)
save_responses(testing_responses, TEST_BACKUP_PATH)

print("\nAll inference tasks complete.")

--- Running Inference for Training Split ---

Starting inference for 1920 samples using model 'qwen:7b'...


Processing Samples:   0%|          | 0/1920 [00:00<?, ?it/s]

Inference loop finished. Total errors logged for this run: 0
Debug: Responses before saving to /content/CodeSentinel/Backups/Result-qwen_7b-CWE-train.csv:
Response 0: {"model": "qwen:7b", "created_at": "2025-06-12T10:16:57.368369609Z", "response": "It seems you have ... (type: <class 'str'>)
Response 1: {"model": "qwen:7b", "created_at": "2025-06-12T10:16:58.404620714Z", "response": "{ \"reason\": \"St... (type: <class 'str'>)
Response 2: {"model": "qwen:7b", "created_at": "2025-06-12T10:16:59.825546238Z", "response": "{  \"reason\":  \"... (type: <class 'str'>)
Response 3: {"model": "qwen:7b", "created_at": "2025-06-12T10:17:01.151430784Z", "response": "{ \"reason\": \"St... (type: <class 'str'>)
Response 4: {"model": "qwen:7b", "created_at": "2025-06-12T10:17:02.768796916Z", "response": "{ \"reason\": \"St... (type: <class 'str'>)
Response 5: {"model": "qwen:7b", "created_at": "2025-06-12T10:17:03.963393878Z", "response": "{ \"reason\": \"Po... (type: <class 'str'>)
Response 6: {"mod

Processing Samples:   0%|          | 0/480 [00:00<?, ?it/s]

Inference loop finished. Total errors logged for this run: 0
Debug: Responses before saving to /content/CodeSentinel/Backups/Result-qwen_7b-CWE-test.csv:
Response 0: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:47.429910515Z", "response": "{ \"reason\": \"Co... (type: <class 'str'>)
Response 1: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:48.639557984Z", "response": "{  \"reason\":  \"... (type: <class 'str'>)
Response 2: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:50.078090392Z", "response": "{ \"reason\": \"In... (type: <class 'str'>)
Response 3: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:51.550854473Z", "response": "{ \"reason\": \"St... (type: <class 'str'>)
Response 4: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:54.884682151Z", "response": "{ \"reason\": \"Co... (type: <class 'str'>)
Response 5: {"model": "qwen:7b", "created_at": "2025-06-12T11:18:56.090443491Z", "response": "{  \"reason\":  \"... (type: <class 'str'>)
Response 6: {"mode

In [23]:
# Cell 6: Parse and Evaluate Results (Fixed Multi-Class for Both Splits & Standardized Labels)
import re
import json
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np # For formatting confusion matrix

# Function to extract JSON from Ollama's full response object (KEEP THIS AS IS from last fix)
def extract_json_from_ollama_response(ollama_response_dict):
    """
    Extracts the JSON object from Ollama's 'response' field, handling potential malformed JSON.
    Expected input: a dictionary from response.json()
    Expected output: a dictionary {"reason": ..., "answer": ...} or a fallback.
    """
    raw_response_text = ollama_response_dict.get('response', '')
    if not raw_response_text:
        return {"reason": "Empty or missing 'response' field from Ollama", "answer": "none"}

    try:
        # Find the first and last curly braces to extract the JSON string
        start = raw_response_text.find('{')
        end = raw_response_text.rfind('}') + 1

        parsed_json = None
        if start != -1 and end != -1 and start < end:
            json_string = raw_response_text[start:end]
            try:
                parsed_json = json.loads(json_string)
            except json.JSONDecodeError:
                # If substring parse fails, try parsing the whole raw_response_text
                pass # Fall through to next attempt

        if parsed_json is None: # If substring didn't work, or was empty
            try:
                parsed_json = json.loads(raw_response_text)
            except json.JSONDecodeError:
                # If full text parse fails, try fixing quotes
                fixed_text = raw_response_text.replace("'", '"')
                try:
                    parsed_json = json.loads(fixed_text)
                except json.JSONDecodeError:
                    return {"reason": f"Failed to parse JSON even after quote fix: '{raw_response_text}'", "answer": "none"}

        # --- IMPORTANT NEW LOGIC (as in previous fix) ---
        # Handle cases where the model might output JSON wrapped in an array, e.g., '[{"reason":..., "answer":...}]'
        if isinstance(parsed_json, list) and len(parsed_json) > 0 and isinstance(parsed_json[0], dict):
            return parsed_json[0] # Take the first dictionary from the list
        elif isinstance(parsed_json, dict):
            return parsed_json
        else:
            # If it's not a dict and not a list containing a dict, it's an unexpected format
            return {"reason": f"Unexpected final JSON structure after parsing: {type(parsed_json).__name__} | Raw: '{raw_response_text}'", "answer": "none"}

    except Exception as e:
        # Catch any other unexpected errors during the process
        return {"reason": f"Critical error during JSON extraction: {e} | Raw: '{raw_response_text}'", "answer": "none"}

def evaluate_results(df_original, responses_list, data_split_name):
    """
    Processes responses, calculates metrics, and prints evaluation report for a given data split.
    """
    print(f"\n--- Evaluating Results for {data_split_name} Split ({len(responses_list)} samples) ---")

    if not responses_list:
        print(f"No responses to evaluate for {data_split_name} split.")
        return

    # Create a DataFrame for model answers
    model_answers_dicts = [extract_json_from_ollama_response(res_dict) for res_dict in responses_list]
    model_answers_df = pd.DataFrame(model_answers_dicts)

    # Ensure model_answer and model_reason columns exist, providing defaults
    model_answers_df['model_answer_raw'] = model_answers_df['answer'].fillna('none').astype(str)
    model_answers_df['model_reason'] = model_answers_df['reason'].fillna('No reason provided').astype(str)

    # --- NEW LOGIC: Standardize Model Answers ---
    # Define our target labels explicitly
    target_labels = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191', 'none']

    def standardize_label(raw_label):
        # Convert to string and uppercase for easier matching
        label = str(raw_label).upper()

        # Check for exact matches
        if label in target_labels:
            return label

        # Check for 'none' variations
        if 'NO VULNERABILITY' in label or 'NONE' == label.strip():
            return 'none'

        # Check for specific CWE patterns
        if 'CWE121' in label:
            return 'CWE121'
        if 'CWE78' in label:
            return 'CWE78'
        if 'CWE122' in label:
            return 'CWE122'
        if 'CWE190' in label:
            return 'CWE190'
        if 'CWE191' in label:
            return 'CWE191'

        # If none of the above, default to 'none' or log it as an issue
        return 'none'

    # Apply the standardization
    model_answers_df['model_answer'] = model_answers_df['model_answer_raw'].apply(standardize_label)

    # Join with the original DataFrame subset to get true labels and other info
    combined_df = df_original.reset_index(drop=True).copy()
    combined_df['model_answer'] = model_answers_df['model_answer']
    combined_df['model_reason'] = model_answers_df['model_reason']
    combined_df['model_answer_raw'] = model_answers_df['model_answer_raw'] # Keep raw for debugging

    # Compute verdict
    combined_df['verdict'] = combined_df['cwe'] == combined_df['model_answer']

    # Compute multi-class metrics
    true_labels = combined_df['cwe']
    pred_labels = combined_df['model_answer']

    # Define all possible labels explicitly for consistent report and matrix
    all_labels = sorted(list(set(target_labels))) # Use target_labels directly for consistency

    # Accuracy (overall accuracy)
    accuracy = combined_df['verdict'].mean()
    print(f"Accuracy ({data_split_name}): {accuracy:.3f}")

    # Confusion Matrix
    cm = confusion_matrix(true_labels, pred_labels, labels=all_labels)
    print(f"\nConfusion Matrix ({data_split_name}):")
    print("Labels order:", all_labels)
    # --- CUSTOM CONFUSION MATRIX PRINTING ---
    cm_list = cm.tolist()
    # Determine maximum width for each number to align columns
    # We add 1 for potential leading space if a number is negative (though unlikely for CM counts)
    max_widths = [max(len(str(item)) for item in col) for col in zip(*cm_list)]

    for row in cm_list:
        formatted_row = []
        for i, item in enumerate(row):
            formatted_row.append(f"{item:>{max_widths[i]}}") # Right-align each number within its max width
        print(f"[{' '.join(formatted_row)}]") # Join with space, wrap in brackets


    # Classification Report
    report = classification_report(true_labels, pred_labels, labels=all_labels, zero_division=0)
    print(f"\nClassification Report ({data_split_name}):")
    print(report)

    # Save filtered results
    output_filename_base = f"Result-{MODEL_NAME}-{data_split_name}-CWE.csv"
    output_path = RESULTS_DIR / output_filename_base
    filtered_df_output = combined_df[['cwe', 'model_answer', 'model_answer_raw', 'verdict', 'model_reason', 'file', 'code']]
    filtered_df_output.to_csv(output_path, index=False)
    print(f"Filtered results saved to '{output_path}'")

# --- Perform evaluation for each split ---
evaluate_results(train_split_df, training_responses, "Training")
evaluate_results(test_split_df, testing_responses, "Testing")

if all_err_logs:
    print("\n--- Consolidated Error Log ---")
    for log_entry in all_err_logs:
        print(f"Index: {log_entry['index']}, Error Type: {log_entry['error_type']}, Message: {log_entry['message']}")
        if log_entry.get('response_status_code') != 'N/A':
            print(f"  Status: {log_entry['response_status_code']}, Preview: {log_entry['response_text_preview']}")
else:
    print("\nNo errors logged during inference.")


--- Evaluating Results for Training Split (1920 samples) ---
Accuracy (Training): 0.385

Confusion Matrix (Training):
Labels order: ['CWE121', 'CWE122', 'CWE190', 'CWE191', 'CWE78', 'none']
[263 16  6  0  14  21]
[204 59  3  0  18  36]
[190 17 66  1  11  35]
[164 13 65 14  16  48]
[ 73  0  1  0 215  31]
[196  0  0  0   1 123]

Classification Report (Training):
              precision    recall  f1-score   support

      CWE121       0.24      0.82      0.37       320
      CWE122       0.56      0.18      0.28       320
      CWE190       0.47      0.21      0.29       320
      CWE191       0.93      0.04      0.08       320
       CWE78       0.78      0.67      0.72       320
        none       0.42      0.38      0.40       320

    accuracy                           0.39      1920
   macro avg       0.57      0.39      0.36      1920
weighted avg       0.57      0.39      0.36      1920

Filtered results saved to '/content/CodeSentinel/Results/_2/Result-qwen_7b-Training-CWE.csv'
