In [None]:
import random
import json
import os
from ollama import chat
from ollama import ChatResponse

In [10]:
# Ollama model name; passed as `model=` to every `chat(...)` call in the
# generation loops of this notebook.
model_name = "qwen2.5-coder:32b"

#### For Critical Vulnerabilities

In [3]:
# Target programming languages; the generation loop iterates over these and
# writes one JSON file per entry.
languages = [
    "C",
    "JAVA",
    "PYTHON",
    "SQL",
    "C#",
    "JAVASCRIPT",
    "RUBY",
    "TYPESCRIPT",
    "C++",
    "KOTLIN",
    "RUST",
    "GO",
    "PHP",
    "SCALA",
]

# How hard the planted vulnerability should be to spot
difficulties = [
    "easy",
    "medium",
    "hard",
    "very hard",
]

# Attack classes sampled for the critical-vulnerability examples
attacks = [
    "SQL Injection",
    "Cross-Site Scripting (XSS)",
    "Command Injection",
    "Buffer Overflow",
    "Broken Authentication",
    "Insecure Deserialization",
    "Cross-Site Request Forgery (CSRF)",
    "Path Traversal",
    "Remote Code Execution (RCE)",
    "Privilege Escalation",
]

# Shape of the code artefact the model is asked to produce
code_types = [
    "complete code file",
    "code snippet",
    "function",
    "class",
]

In [4]:
# Prompt template for the per-example user message.
#
# Filled with `str.format`, so it expects the placeholders `attack_type`,
# `language`, `code_type` and `difficulty`; literal JSON braces are therefore
# doubled (`{{` / `}}`). The `\\n` and `\\\"` sequences render as `\n` and `\"`
# in the final prompt, which keeps the embedded example valid JSON.
#
# The "VERY IMPORTANT" bullets spell out the exact shape of "output"
# (an object with "score" and "type") so they agree with the example below
# instead of describing "output" as both a float and a string.
prompt_template = """
Given the following inputs:
- Attack Type: {attack_type}
- Programming Language: {language}
- Code Type: {code_type}
- Vulnerability Detection Difficulty: {difficulty}

Your task is to:
1. Generate a {code_type} in the given language that contains a vulnerability matching the described attack type and identification difficulty level.
2. Evaluate the vulnerability and provide a realistic CVSS score as a float (e.g., 7.5).

VERY IMPORTANT:
- Output ONLY a valid JSON object.
- Do NOT add any explanation, markdown, comments, or additional text.
- Escape all double quotes correctly.
- The code must go inside the "input" key as a string.
- The "output" key must be a JSON object with exactly two fields: "score" and "type".
- "score" is the CVSS score and must be a float number only.
- "type" is the attack type and must be a string only.

JSON FORMAT EXAMPLE:
{{
    "input": "# Python example\\ndef unsafe():\\n    eval(input(\\\"Enter command: \\\"))",
    "output": {{
        "score": 8.2,
        "type": "{attack_type}"
        }}
}}

Now generate a similar JSON for:
- Attack Type: {attack_type}
- Programming Language: {language}
- Difficulty Level: {difficulty}
- Code Type: {code_type}
"""

In [5]:
# Output directory: create "Phase-2-Data" if it does not exist yet
# (exist_ok=True means an already-existing directory is not an error).
os.makedirs("Phase-2-Data", exist_ok=True)

In [6]:
# Retry logic: number of attempts per example. The generation loop iterates
# `range(MAX_RETRIES)`, so this counts the first try as well (2 = one retry).
MAX_RETRIES = 2

In [7]:
# System + few-shot prompts.
#
# Both messages describe the same reply schema that `prompt_template` asks for:
# {"input": <code string>, "output": {"score": <float>, "type": <string>}}.
# Previously they showed "output" as a bare float, which contradicted the
# user prompt sent in the same conversation.
system_prompt = (
    "You are a cybersecurity code generator.\n"
    "You must always output ONLY one valid JSON object.\n"
    "No markdown, no extra text, no code blocks, no preamble.\n"
    "Just return clean JSON like: "
    "{\"input\": \"<code>\", \"output\": {\"score\": <cvss>, \"type\": \"<attack type>\"}}"
)
# Sent as a 'user' message ahead of the real request; the content after
# "Example:\n" is itself a valid JSON document.
few_shot = {
    'role': 'user',
    'content': (
        "Example:\n"
        "{\n"
        "  \"input\": \"# Python example\\ndef unsafe():\\n    eval(input(\\\"Enter command: \\\"))\",\n"
        "  \"output\": {\n"
        "    \"score\": 8.2,\n"
        "    \"type\": \"Remote Code Execution (RCE)\"\n"
        "  }\n"
        "}"
    )
}

In [8]:
# Main loop: for every language, request 25 critical-vulnerability examples
# (keys "<language>_0" .. "<language>_24") and write them to
# Phase-2-Data/<language>.json. Examples that still fail after MAX_RETRIES
# attempts are reported and left out of the file.
for language in languages:
    print(f"\nGenerating for: {language}")
    results = {}

    for i in range(25):
        # One random scenario per example; the same prompt is reused on retry.
        attack = random.choice(attacks)
        difficulty = random.choice(difficulties)
        code_type = random.choice(code_types)
        user_prompt = prompt_template.format(attack_type=attack, language=language, difficulty=difficulty, code_type=code_type)

        for attempt in range(MAX_RETRIES):
            try:
                response: ChatResponse = chat(model=model_name, messages=[
                    {'role': 'system', 'content': system_prompt},
                    few_shot,
                    {'role': 'user', 'content': user_prompt}
                ])

                reply = response['message']['content'].strip()

                # Tolerate a reply wrapped in a markdown fence (``` or ```json):
                # drop the opening fence line and the closing fence before parsing.
                if reply.startswith("```"):
                    reply = reply.split("\n", 1)[1] if "\n" in reply else ""
                    reply = reply.rsplit("```", 1)[0].strip()

                parsed_json = json.loads(reply)

                # json.loads accepts any JSON value; only an object carrying both
                # expected keys counts as a usable example. Raising here sends the
                # attempt through the normal retry path below.
                if not isinstance(parsed_json, dict) or "input" not in parsed_json or "output" not in parsed_json:
                    raise ValueError("reply is not a JSON object with 'input' and 'output' keys")

                results[f"{language.lower()}_{i}"] = parsed_json
                print(f"[{language}] Example {i+1}/25 - Success")
                break

            except Exception as e:
                # Best-effort generation: report the problem and move on rather
                # than aborting the whole run on one bad reply.
                if attempt == MAX_RETRIES - 1:
                    print(f"[{language}] Example {i+1}/25 - Failed: {e}")
                else:
                    print(f"Retrying [{language}] Example {i+1} due to: {e}")

    # Save results (overwrites any previous file for this language)
    with open(f"Phase-2-Data/{language.lower()}.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
    print(f"Saved: Phase-2-Data/{language.lower()}.json")


Generating for: C
[C] Example 1/25 - Success
[C] Example 2/25 - Success
[C] Example 3/25 - Success
[C] Example 4/25 - Success
[C] Example 5/25 - Success
[C] Example 6/25 - Success
[C] Example 7/25 - Success
[C] Example 8/25 - Success
[C] Example 9/25 - Success
[C] Example 10/25 - Success
[C] Example 11/25 - Success
[C] Example 12/25 - Success
[C] Example 13/25 - Success
[C] Example 14/25 - Success
[C] Example 15/25 - Success
[C] Example 16/25 - Success
[C] Example 17/25 - Success
[C] Example 18/25 - Success
[C] Example 19/25 - Success
[C] Example 20/25 - Success
[C] Example 21/25 - Success
[C] Example 22/25 - Success
[C] Example 23/25 - Success
[C] Example 24/25 - Success
[C] Example 25/25 - Success
Saved: Phase-2-Data/c.json

Generating for: JAVA
[JAVA] Example 1/25 - Success
[JAVA] Example 2/25 - Success
[JAVA] Example 3/25 - Success
[JAVA] Example 4/25 - Success
[JAVA] Example 5/25 - Success
[JAVA] Example 6/25 - Success
[JAVA] Example 7/25 - Success
[JAVA] Example 8/25 - Success
[

#### For Mild Vulnerabilities

In [11]:
# Target programming languages; the generation loop iterates over these and
# merges its results into one JSON file per entry.
languages = [
    "C",
    "JAVA",
    "PYTHON",
    "SQL",
    "C#",
    "JAVASCRIPT",
    "RUBY",
    "TYPESCRIPT",
    "C++",
    "KOTLIN",
    "RUST",
    "GO",
    "PHP",
    "SCALA",
]

# How hard the planted vulnerability should be to spot
difficulties = [
    "easy",
    "medium",
    "hard",
    "very hard",
]

# Attack classes sampled for the mild-vulnerability examples
attacks = [
    "Information Disclosure (Low Severity)",
    "Denial-of-Service (DoS) - Low Impact",
    "Clickjacking (Low Severity)",
    "Server-Side Request Forgery (SSRF) - Limited Scope",
    "Session Fixation (Low Severity)",
    "Local File Inclusion (LFI) - Limited Scope",
    "Race Condition (Difficult to Exploit)",
    "Open Redirect",
    "HTTP Response Splitting",
    "Missing Security Headers",
]

# Shape of the code artefact the model is asked to produce
code_types = [
    "complete code file",
    "code snippet",
    "function",
    "class",
]

In [12]:
# Prompt template for the per-example user message (mild vulnerabilities).
#
# Filled with `str.format`, so it expects the placeholders `attack_type`,
# `language`, `code_type` and `difficulty`; literal JSON braces are therefore
# doubled (`{{` / `}}`). The `\\n` and `\\\"` sequences render as `\n` and `\"`
# in the final prompt, which keeps the embedded example valid JSON.
#
# The "VERY IMPORTANT" bullets spell out the exact shape of "output"
# (an object with "score" and "type") so they agree with the example below
# instead of describing "output" as both a float and a string.
prompt_template = """
Given the following inputs:
- Attack Type: {attack_type}
- Programming Language: {language}
- Code Type: {code_type}
- Vulnerability Detection Difficulty: {difficulty}

Your task is to:
1. Generate a {code_type} in the given language that contains a vulnerability matching the described attack type and identification difficulty level.
2. Evaluate the vulnerability and provide a realistic CVSS score as a float (e.g., 7.5).

VERY IMPORTANT:
- Output ONLY a valid JSON object.
- Do NOT add any explanation, markdown, comments, or additional text.
- Escape all double quotes correctly.
- The code must go inside the "input" key as a string.
- The "output" key must be a JSON object with exactly two fields: "score" and "type".
- "score" is the CVSS score and must be a float number only.
- "type" is the attack type and must be a string only.

JSON FORMAT EXAMPLE:
{{
    "input": "# Python example\\ndef unsafe():\\n    eval(input(\\\"Enter command: \\\"))",
    "output": {{
        "score": 4.2,
        "type": "{attack_type}"
        }}
}}

Now generate a similar JSON for:
- Attack Type: {attack_type}
- Programming Language: {language}
- Difficulty Level: {difficulty}
- Code Type: {code_type}
"""

In [13]:
# Output directory: create "Phase-2-Data" if it does not exist yet
# (exist_ok=True means an already-existing directory is not an error).
os.makedirs("Phase-2-Data", exist_ok=True)

In [14]:
# Retry logic: number of attempts per example. The generation loop iterates
# `range(MAX_RETRIES)`, so this counts the first try as well (2 = one retry).
MAX_RETRIES = 2

In [15]:
# System + few-shot prompts.
#
# Both messages describe the same reply schema that `prompt_template` asks for:
# {"input": <code string>, "output": {"score": <float>, "type": <string>}}.
# Previously they showed "output" as a bare float, which contradicted the
# user prompt sent in the same conversation.
system_prompt = (
    "You are a cybersecurity code generator.\n"
    "You must always output ONLY one valid JSON object.\n"
    "No markdown, no extra text, no code blocks, no preamble.\n"
    "Just return clean JSON like: "
    "{\"input\": \"<code>\", \"output\": {\"score\": <cvss>, \"type\": \"<attack type>\"}}"
)
# Sent as a 'user' message ahead of the real request; the content after
# "Example:\n" is itself a valid JSON document.
# NOTE(review): the snippet is the same eval() example used for the critical
# section and only illustrates the JSON shape; consider swapping in a snippet
# that actually shows a low-severity issue.
few_shot = {
    'role': 'user',
    'content': (
        "Example:\n"
        "{\n"
        "  \"input\": \"# Python example\\ndef unsafe():\\n    eval(input(\\\"Enter command: \\\"))\",\n"
        "  \"output\": {\n"
        "    \"score\": 4.2,\n"
        "    \"type\": \"Information Disclosure (Low Severity)\"\n"
        "  }\n"
        "}"
    )
}

In [17]:
# Main loop: for every language, request 25 mild-vulnerability examples
# (keys "<language>_25" .. "<language>_49") and merge them into the existing
# Phase-2-Data/<language>.json. Examples that still fail after MAX_RETRIES
# attempts are reported and left out.
for language in languages:
    print(f"\nGenerating for: {language}")
    results = {}

    for i in range(25, 50):
        # One random scenario per example; the same prompt is reused on retry.
        attack = random.choice(attacks)
        difficulty = random.choice(difficulties)
        code_type = random.choice(code_types)
        user_prompt = prompt_template.format(attack_type=attack, language=language, difficulty=difficulty, code_type=code_type)

        for attempt in range(MAX_RETRIES):
            try:
                response: ChatResponse = chat(model=model_name, messages=[
                    {'role': 'system', 'content': system_prompt},
                    few_shot,
                    {'role': 'user', 'content': user_prompt}
                ])

                reply = response['message']['content'].strip()

                # Tolerate a reply wrapped in a markdown fence (``` or ```json):
                # drop the opening fence line and the closing fence before parsing.
                if reply.startswith("```"):
                    reply = reply.split("\n", 1)[1] if "\n" in reply else ""
                    reply = reply.rsplit("```", 1)[0].strip()

                parsed_json = json.loads(reply)

                # json.loads accepts any JSON value; only an object carrying both
                # expected keys counts as a usable example. Raising here sends the
                # attempt through the normal retry path below.
                if not isinstance(parsed_json, dict) or "input" not in parsed_json or "output" not in parsed_json:
                    raise ValueError("reply is not a JSON object with 'input' and 'output' keys")

                results[f"{language.lower()}_{i}"] = parsed_json
                print(f"[{language}] Example {i+1}/50 - Success")
                break

            except Exception as e:
                # Best-effort generation: report the problem and move on rather
                # than aborting the whole run on one bad reply.
                if attempt == MAX_RETRIES - 1:
                    print(f"[{language}] Example {i+1}/50 - Failed: {e}")
                else:
                    print(f"Retrying [{language}] Example {i+1} due to: {e}")

    # Save results: load what is already on disk, overlay the new examples,
    # and write the combined mapping back.
    file_path = f"Phase-2-Data/{language.lower()}.json"

    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                existing_data = json.load(f)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from {file_path}. Starting with empty data.")
                existing_data = {}
    else:
        existing_data = {}

    existing_data.update(results)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, indent=4)

    print(f"Saved: {file_path}")


Generating for: C
[C] Example 26/50 - Success
[C] Example 27/50 - Success
[C] Example 28/50 - Success
[C] Example 29/50 - Success
[C] Example 30/50 - Success
[C] Example 31/50 - Success
[C] Example 32/50 - Success
[C] Example 33/50 - Success
[C] Example 34/50 - Success
[C] Example 35/50 - Success
[C] Example 36/50 - Success
[C] Example 37/50 - Success
[C] Example 38/50 - Success
[C] Example 39/50 - Success
[C] Example 40/50 - Success
[C] Example 41/50 - Success
[C] Example 42/50 - Success
[C] Example 43/50 - Success
[C] Example 44/50 - Success
[C] Example 45/50 - Success
[C] Example 46/50 - Success
[C] Example 47/50 - Success
[C] Example 48/50 - Success
[C] Example 49/50 - Success
[C] Example 50/50 - Success
Saved: Phase-2-Data/c.json

Generating for: JAVA
[JAVA] Example 26/50 - Success
[JAVA] Example 27/50 - Success
[JAVA] Example 28/50 - Success
[JAVA] Example 29/50 - Success
[JAVA] Example 30/50 - Success
[JAVA] Example 31/50 - Success
[JAVA] Example 32/50 - Success
[JAVA] Example 

#### For Safe Code

In [18]:
# Target programming languages; the generation loop iterates over these and
# merges its results into one JSON file per entry.
languages = [
    "C",
    "JAVA",
    "PYTHON",
    "SQL",
    "C#",
    "JAVASCRIPT",
    "RUBY",
    "TYPESCRIPT",
    "C++",
    "KOTLIN",
    "RUST",
    "GO",
    "PHP",
    "SCALA",
]

# Shape of the code artefact the model is asked to produce
code_types = [
    "complete code file",
    "code snippet",
    "function",
    "class",
]

In [19]:
# Prompt template for the per-example user message (secure code).
#
# Filled with `str.format`, so it expects the placeholders `language` and
# `code_type`; literal JSON braces are therefore doubled (`{{` / `}}`). The
# `\\n` and `\\\"` sequences render as `\n` and `\"` in the final prompt, which
# keeps the embedded example valid JSON.
#
# Task line 1 no longer appends the word "snippet" after {code_type}: the
# placeholder already names the artefact ("code snippet", "function", ...),
# so the old wording produced "code snippet snippet" / "complete code file
# snippet".
prompt_template = """
Your task is to:
1. Generate a secure, production-grade {code_type} in {language} that demonstrates best practices in secure coding.
2. The code must not contain any known security vulnerabilities or bad practices.
3. Use appropriate input validation, safe APIs, error handling, and other relevant security techniques depending on the language and context.

REQUIREMENTS:
- Output ONLY a valid JSON object.
- Do NOT add any explanation, markdown, comments, or additional text.
- Escape all double quotes correctly.
- The secure code must go inside the "input" key as a string.
- The "output" key must include a string "Secure" as the value of the "type" field, and the CVSS score must be 0.0 (indicating no vulnerability).

JSON FORMAT EXAMPLE:
{{
    "input": "# Python example\\ndef safe():\\n    user_input = input(\\\"Enter number: \\\")\\n    if user_input.isdigit():\\n        print(int(user_input))",
    "output": {{
        "score": 0.0,
        "type": "Secure"
    }}
}}

Now generate a similar JSON for:
- Programming Language: {language}
- Code Type: {code_type}
"""

In [20]:
# Output directory: create "Phase-2-Data" if it does not exist yet
# (exist_ok=True means an already-existing directory is not an error).
os.makedirs("Phase-2-Data", exist_ok=True)

In [21]:
# Retry logic: number of attempts per example. The generation loop iterates
# `range(MAX_RETRIES)`, so this counts the first try as well (2 = one retry).
MAX_RETRIES = 2

In [22]:
# System message and one worked example for the secure-code generation run.
# Each prompt is assembled from its individual lines and joined with newlines.
system_prompt = "\n".join([
    "You are a secure code generator.",
    "You must always output ONLY one valid JSON object.",
    "No markdown, no extra text, no code blocks, no preamble.",
    "The JSON must include an 'input' field with secure code as a string,",
    "and an 'output' field with 'score': 0.0 and 'type': 'Secure'.",
])

# The example is sent as a 'user' message ahead of the real request; the text
# after "Example:" is a complete JSON document in the expected shape.
_few_shot_lines = [
    'Example:',
    '{',
    '  "input": "# Python example\\ndef safe():\\n    user_input = input(\\"Enter number: \\")\\n    if user_input.isdigit():\\n        print(int(user_input))",',
    '  "output": {',
    '    "score": 0.0,',
    '    "type": "Secure"',
    '  }',
    '}',
]
few_shot = {
    'role': 'user',
    'content': "\n".join(_few_shot_lines),
}

In [23]:
# Main loop: for every language, request 40 secure-code examples
# (keys "<language>_50" .. "<language>_89") and merge them into the existing
# Phase-2-Data/<language>.json. Examples that still fail after MAX_RETRIES
# attempts are reported and left out.
for language in languages:
    print(f"\nGenerating for: {language}")
    results = {}

    for i in range(50, 90):
        # Only the code type varies here; the same prompt is reused on retry.
        code_type = random.choice(code_types)
        user_prompt = prompt_template.format(language=language, code_type=code_type)

        for attempt in range(MAX_RETRIES):
            try:
                response: ChatResponse = chat(model=model_name, messages=[
                    {'role': 'system', 'content': system_prompt},
                    few_shot,
                    {'role': 'user', 'content': user_prompt}
                ])

                reply = response['message']['content'].strip()

                # Tolerate a reply wrapped in a markdown fence (``` or ```json):
                # drop the opening fence line and the closing fence before parsing.
                if reply.startswith("```"):
                    reply = reply.split("\n", 1)[1] if "\n" in reply else ""
                    reply = reply.rsplit("```", 1)[0].strip()

                parsed_json = json.loads(reply)

                # json.loads accepts any JSON value; only an object carrying both
                # expected keys counts as a usable example. Raising here sends the
                # attempt through the normal retry path below.
                if not isinstance(parsed_json, dict) or "input" not in parsed_json or "output" not in parsed_json:
                    raise ValueError("reply is not a JSON object with 'input' and 'output' keys")

                results[f"{language.lower()}_{i}"] = parsed_json
                print(f"[{language}] Example {i+1}/90 - Success")
                break

            except Exception as e:
                # Best-effort generation: report the problem and move on rather
                # than aborting the whole run on one bad reply.
                if attempt == MAX_RETRIES - 1:
                    print(f"[{language}] Example {i+1}/90 - Failed: {e}")
                else:
                    print(f"Retrying [{language}] Example {i+1} due to: {e}")

    # Save results: load what is already on disk, overlay the new examples,
    # and write the combined mapping back.
    file_path = f"Phase-2-Data/{language.lower()}.json"

    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                existing_data = json.load(f)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from {file_path}. Starting with empty data.")
                existing_data = {}
    else:
        existing_data = {}

    existing_data.update(results)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, indent=4)

    print(f"Saved: {file_path}")


Generating for: C
[C] Example 51/90 - Success
[C] Example 52/90 - Success
[C] Example 53/90 - Success
[C] Example 54/90 - Success
[C] Example 55/90 - Success
[C] Example 56/90 - Success
[C] Example 57/90 - Success
[C] Example 58/90 - Success
[C] Example 59/90 - Success
[C] Example 60/90 - Success
[C] Example 61/90 - Success
[C] Example 62/90 - Success
[C] Example 63/90 - Success
[C] Example 64/90 - Success
[C] Example 65/90 - Success
[C] Example 66/90 - Success
[C] Example 67/90 - Success
[C] Example 68/90 - Success
[C] Example 69/90 - Success
[C] Example 70/90 - Success
[C] Example 71/90 - Success
[C] Example 72/90 - Success
[C] Example 73/90 - Success
[C] Example 74/90 - Success
[C] Example 75/90 - Success
[C] Example 76/90 - Success
[C] Example 77/90 - Success
[C] Example 78/90 - Success
[C] Example 79/90 - Success
[C] Example 80/90 - Success
[C] Example 81/90 - Success
[C] Example 82/90 - Success
[C] Example 83/90 - Success
[C] Example 84/90 - Success
[C] Example 85/90 - Success
[