# Experiment 1: Zero-Shot Prompt Engineering

**Objective:** Evaluate LLaMA-3.1-8B-Instruct on Solana smart contract vulnerability detection without fine-tuning.

**Method:** Role-based prompting with a security-analyst persona.

---

## 1. Environment Setup

In [None]:
import torch
import os

# Verify GPU: report the first CUDA device, or tell the user how to enable one
banner = "=" * 50
print(banner)
print("ENVIRONMENT CHECK")
print(banner)

if not torch.cuda.is_available():
    print("ERROR: GPU not detected!")
    print("Go to: Settings -> Accelerator -> GPU T4 x2")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # Scaled by 1e9 so the value can be shown in GB
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_memory:.1f} GB")
    print("Status: Ready")

In [None]:
# Install required packages
# bitsandbytes is needed for the 4-bit quantized model load further down;
# accelerate is needed for device_map="auto". -q keeps pip's output quiet.
!pip install -q bitsandbytes accelerate

# NOTE: printed unconditionally; pip's exit status is not checked here.
print("Packages installed successfully.")

## 2. Authentication & Imports

In [None]:
import json
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
from huggingface_hub import login

# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# HuggingFace Authentication
# The access token is read from a Kaggle secret named "HF_TOKEN", so it is
# never written into the notebook itself.
from kaggle_secrets import UserSecretsClient

secrets = UserSecretsClient()
HF_TOKEN = secrets.get_secret("HF_TOKEN")

# add_to_git_credential=False: log in without storing the token in git's
# credential helper.
login(token=HF_TOKEN, add_to_git_credential=False)
print("HuggingFace authentication successful.")

## 3. Load Dataset

**Instructions:** Upload `solana_140s_final.json` to Kaggle:
1. Click "+ Add Data" (right panel)
2. Select "Upload" → "New Dataset"
3. Upload the JSON file
4. Name it `solana-dataset`

In [None]:
# Load dataset
# Candidate locations, checked in order; the first path that exists is used.
_DATASET_CANDIDATES = [
    # Option 1: From Kaggle dataset
    "/kaggle/input/solana-dataset/solana_140s_final.json",
    # Option 2: If uploaded directly to notebook
    "/kaggle/input/solana_140s_final.json",
    # Option 3: Working directory
    "/kaggle/working/solana_140s_final.json",
]

DATASET_PATH = next(
    (path for path in _DATASET_CANDIDATES if os.path.exists(path)),
    None,
)

# Fail with an actionable message only when the file is really missing.
# Previously the upload hint was printed whenever the first two locations
# were absent, even if the working-directory copy existed.
if DATASET_PATH is None:
    raise FileNotFoundError(
        "Please upload solana_140s_final.json to the notebook. Looked in:\n  "
        + "\n  ".join(_DATASET_CANDIDATES)
    )

# JSON is UTF-8 by specification; do not depend on the platform default.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"Dataset loaded: {len(dataset)} samples")
print("\nDistribution by vulnerability type:")
for vtype, count in sorted(Counter(s['vulnerability_type'] for s in dataset).items()):
    print(f"  {vtype}: {count}")

## 4. Data Preparation

In [None]:
# Stratified split by vulnerability type
# Samples are grouped per vulnerability type, and every group is split
# separately with label stratification, so each split keeps the per-type
# label mix.
by_vuln_type = defaultdict(list)
for sample in dataset:
    by_vuln_type[sample['vulnerability_type']].append(sample)

train_data, val_data, test_data = [], [], []


def _labels_of(samples):
    """Return the 'label' field of every sample, in order."""
    return [s['label'] for s in samples]


for vtype, samples in by_vuln_type.items():
    # First cut: 80% train, 20% held out
    train_samples, temp_samples = train_test_split(
        samples,
        test_size=0.2,
        stratify=_labels_of(samples),
        random_state=42,
    )
    # Second cut: the held-out part is halved into validation and test
    val_samples, test_samples = train_test_split(
        temp_samples,
        test_size=0.5,
        stratify=_labels_of(temp_samples),
        random_state=42,
    )
    for bucket, part in (
        (train_data, train_samples),
        (val_data, val_samples),
        (test_data, test_samples),
    ):
        bucket.extend(part)

print("Dataset Split:")
print(f"  Train: {len(train_data)} samples")
print(f"  Val:   {len(val_data)} samples")
print(f"  Test:  {len(test_data)} samples")
print("\nTest set distribution:")
test_type_counts = Counter(s['vulnerability_type'] for s in test_data)
for vtype in sorted(test_type_counts):
    print(f"  {vtype}: {test_type_counts[vtype]}")

## 5. Load Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub repository id of the checkpoint evaluated in this experiment
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit quantization for memory efficiency
# Weights are stored as NF4 with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Reuse EOS as the padding token (presumably the tokenizer ships without a
# dedicated pad token; confirm).
tokenizer.pad_token = tokenizer.eos_token

print("Loading model (this takes 3-5 minutes)...")
# device_map="auto" leaves the placement of the weights on the available
# device(s) to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
    token=HF_TOKEN
)

print(f"Model loaded successfully.")
print(f"Parameters: {model.num_parameters():,}")

## 6. Define Prediction Function

In [None]:
# System prompt for security analysis
# Sets the analyst persona and restricts the reply to a single word.
# predict() looks for the literal word "VULNERABLE" in the reply, so the
# wording of the two allowed answers must stay in sync with that check.
SYSTEM_PROMPT = """You are a smart contract security analyzer.
You analyze Solana smart contracts written in Rust and identify vulnerabilities.
Classify the code as either VULNERABLE or SAFE.
Respond with only one word: VULNERABLE or SAFE."""

def extract_code(sample):
    """Extract the user-turn content from a chat-formatted sample.

    The sample's ``'text'`` field is expected to contain a Llama-3 style
    transcript. The text between the first user header and the next
    end-of-turn/assistant header is returned, stripped of surrounding
    whitespace.

    Args:
        sample: Mapping with a ``'text'`` string.

    Returns:
        The stripped user-turn content. If the user header is missing, or no
        assistant header follows it, the first 1000 characters of the raw
        text are returned instead.
    """
    text = sample['text']
    start_marker = '<|start_header_id|>user<|end_header_id|>'
    end_marker = '<|eot_id|><|start_header_id|>assistant'

    start_idx = text.find(start_marker)
    if start_idx != -1:
        body_start = start_idx + len(start_marker)
        # Search for the end marker only after the user header. Searching
        # from position 0 can hit an earlier assistant header, which gives
        # an inverted slice and therefore an empty string.
        end_idx = text.find(end_marker, body_start)
        if end_idx != -1:
            return text[body_start:end_idx].strip()
    return text[:1000]

def _build_prompt(code):
    """Render the Llama-3 chat prompt (system + user turn) for one contract."""
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        "Analyze this Solana smart contract:\n\n"
        f"{code}\n\n"
        "Is this code VULNERABLE or SAFE?<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def predict(sample, max_input_tokens=2048):
    """Make prediction for a single sample.

    Args:
        sample: Dataset entry; its code is obtained with ``extract_code``.
        max_input_tokens: Upper bound on the number of prompt tokens fed to
            the model. When the prompt is longer, only the contract code is
            shortened.

    Returns:
        ``'VULNERABLE'`` if that word appears in the generated reply,
        otherwise ``'SAFE'``. Any reply without the word, including an
        unparseable one, therefore counts as ``'SAFE'``.
    """
    code = extract_code(sample)
    prompt = _build_prompt(code)

    # The prompt already starts with <|begin_of_text|>. With the default
    # add_special_tokens=True the tokenizer would add its own BOS as well,
    # so special-token insertion is disabled.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    overflow = inputs['input_ids'].shape[1] - max_input_tokens
    if overflow > 0:
        # Right-truncating the whole prompt would cut off the question and
        # the assistant header, so the model would continue the code instead
        # of answering. Shorten the code and keep the prompt scaffold intact.
        code_ids = tokenizer(code, add_special_tokens=False)['input_ids']
        # A few spare tokens absorb re-tokenization drift at the cut point.
        keep = max(len(code_ids) - overflow - 8, 0)
        shortened = tokenizer.decode(
            code_ids[:keep],
            clean_up_tokenization_spaces=False,
        )
        inputs = tokenizer(
            _build_prompt(shortened),
            return_tensors="pt",
            add_special_tokens=False,
            truncation=True,  # last-resort guard if the scaffold alone is too long
            max_length=max_input_tokens,
        )

    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,  # greedy decoding keeps the evaluation deterministic
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_len = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True
    ).strip().upper()

    return 'VULNERABLE' if 'VULNERABLE' in response else 'SAFE'

print("Prediction function defined.")

## 7. Run Evaluation

In [None]:
# Run the model over every test sample and record one result row per sample
rule = "=" * 50
print(rule)
print("RUNNING EVALUATION")
print(rule)

results = []

for sample in tqdm(test_data, desc="Evaluating"):
    prediction = predict(sample)
    vuln_type = sample['vulnerability_type']
    truth = sample['label']
    row = {
        'vulnerability_type': vuln_type,
        'ground_truth': truth,
        'prediction': prediction,
        # True when the predicted class equals the dataset label
        'correct': truth == prediction,
    }
    results.append(row)

print("\nEvaluation complete.")

## 8. Calculate Metrics

In [None]:
# Calculate metrics per vulnerability type (VULNERABLE is the positive class)
metrics_by_type = {}
vuln_types = sorted({r['vulnerability_type'] for r in results})

# Options shared by the three positive-class metrics
positive_opts = dict(pos_label='VULNERABLE', zero_division=0)

for vtype in vuln_types:
    gt, pred = [], []
    for r in results:
        if r['vulnerability_type'] == vtype:
            gt.append(r['ground_truth'])
            pred.append(r['prediction'])

    metrics_by_type[vtype] = {
        'Accuracy': round(accuracy_score(gt, pred), 2),
        'Precision': round(precision_score(gt, pred, **positive_opts), 2),
        'Recall': round(recall_score(gt, pred, **positive_opts), 2),
        'F1-score': round(f1_score(gt, pred, **positive_opts), 2),
    }

# Calculate averages: unweighted mean of the (already rounded) per-type values
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1-score')
avg_metrics = {
    name: round(
        sum(m[name] for m in metrics_by_type.values()) / len(metrics_by_type),
        2,
    )
    for name in metric_names
}


def _table_row(label, m):
    """Format one left-aligned table row for the given metrics dict."""
    return f"{label:<20} {m['Accuracy']:<12} {m['Precision']:<12} {m['Recall']:<12} {m['F1-score']:<12}"


# Display results
heavy_rule = "=" * 70
light_rule = "-" * 70

print(heavy_rule)
print("RESULTS: Zero-Shot Prompt Engineering")
print(heavy_rule)
print(f"{'Vulnerability':<20} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-score':<12}")
print(light_rule)

for vtype in vuln_types:
    print(_table_row(vtype, metrics_by_type[vtype]))

print(light_rule)
print(_table_row('Average', avg_metrics))
print(heavy_rule)

## 9. Confusion Matrix

In [None]:
# Generate confusion matrix over the whole test set
all_gt, all_pred = [], []
for r in results:
    all_gt.append(r['ground_truth'])
    all_pred.append(r['prediction'])

# Fixed label order: row/column 0 is VULNERABLE, row/column 1 is SAFE
class_labels = ['VULNERABLE', 'SAFE']
cm = confusion_matrix(all_gt, all_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
heatmap_opts = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
sns.heatmap(cm, **heatmap_opts)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Experiment 1: Zero-Shot PE - Confusion Matrix')
plt.tight_layout()
plt.savefig('/kaggle/working/cm_zero_shot.png', dpi=150)
plt.show()

# Rows are actual classes and columns are predicted classes, so the
# flattened 2x2 matrix reads TP, FN, FP, TN.
true_pos, false_neg, false_pos, true_neg = cm.ravel()

print("\nConfusion Matrix:")
print(f"  True Positives:  {true_pos}")
print(f"  False Negatives: {false_neg}")
print(f"  False Positives: {false_pos}")
print(f"  True Negatives:  {true_neg}")

## 10. Save Results

In [None]:
# Save detailed results: one CSV row per evaluated test sample
results_df = pd.DataFrame(results)
results_df.to_csv('/kaggle/working/results_zero_shot.csv', index=False)

# Save summary
split_sizes = {
    'total': len(dataset),
    'train': len(train_data),
    'val': len(val_data),
    'test': len(test_data),
}

# Cast to plain ints so the counts can be written as JSON
cm_counts = {
    'TP': int(cm[0, 0]),
    'FN': int(cm[0, 1]),
    'FP': int(cm[1, 0]),
    'TN': int(cm[1, 1]),
}

summary = {
    'experiment': 'Zero-Shot Prompt Engineering',
    'model': 'Llama-3.1-8B-Instruct',
    'quantization': '4-bit NF4',
    'method': 'Role-based prompting without examples',
    'dataset': split_sizes,
    'overall_accuracy': round(accuracy_score(all_gt, all_pred), 4),
    'per_vulnerability': metrics_by_type,
    'average': avg_metrics,
    'confusion_matrix': cm_counts,
}

with open('/kaggle/working/summary_zero_shot.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("Results saved:")
for saved_line in (
    "  - /kaggle/working/results_zero_shot.csv",
    "  - /kaggle/working/summary_zero_shot.json",
    "  - /kaggle/working/cm_zero_shot.png",
):
    print(saved_line)

In [None]:
# Display final summary: headline numbers and where to find the output files
rule = "=" * 50
overall_accuracy = summary['overall_accuracy']
average_f1 = avg_metrics['F1-score']

print(f"\n{rule}")
print("EXPERIMENT 1 COMPLETE")
print(rule)
print(f"Overall Accuracy: {overall_accuracy:.2%}")
print(f"Average F1-Score: {average_f1}")
print("\nFiles are in /kaggle/working/")
print("Download from the Output tab.")