In [1]:
import json
import torch
from mlx_lm.utils import load
from mlx_lm.generate import generate
from Phase_1_Scripts_and_Notebooks.Phase_1_few_shots import few_shots
from Phase_2_Scripts_and_Notebooks.Phase_2_few_shots import few_shots as few_shots_2

In [2]:
def parse_and_print_json(response):
    """Parse a model response as JSON, print a readable summary, and return it.

    Args:
        response: JSON text to decode (anything ``json.loads`` accepts).

    Returns:
        The decoded JSON value on success. If the text is not valid JSON, the
        caught ``json.JSONDecodeError`` instance is *returned* (not raised), so
        callers should check the result's type before indexing into it.
    """
    try:
        json_response = json.loads(response)
    except json.JSONDecodeError as e:
        print("JSON Decode Error:", e)
        return e

    print("Parsed JSON Successfully!")

    if not isinstance(json_response, dict):
        # Valid JSON may have an array or scalar at the top level; those have
        # no .items(), so print them directly instead of crashing.
        items = json_response if isinstance(json_response, list) else [json_response]
        for item in items:
            print(f" - {item}")
        return json_response

    for key, value in json_response.items():
        if isinstance(value, list):
            # Lists get one bullet per element under the key.
            print(f"{key}:")
            for item in value:
                print(f" - {item}")
        else:
            print(f"{key}:\n - {value}")
    return json_response

In [3]:
# Load the Phase 1 fine-tuned model and its tokenizer (used below with the
# code-analyzer system prompt to extract top-level components).
model, tokenizer = load(path_or_hf_repo="Phase_1_MLX_Fine_Tuned")

In [4]:
# Phase 1 instructions: ask the model to emit a single JSON object describing
# the top-level components of the submitted code. The text is sent verbatim
# (after .strip()) as the first chat message.
system_prompt = """
You are an expert code analyzer that outputs ONLY valid and parsable JSON. Strictly follow these rules:
1. Analyze the provided code completely, but ONLY for top-level components
2. Output MUST be ONLY the JSON object with no extra text, formatting, or characters
3. Use this exact structure (including only components that exist in the code):
{
    "programming_language": "<C|JAVA|PYTHON|SQL|C#|JAVASCRIPT|RUBY|TYPESCRIPT|C++|KOTLIN|RUST|GO|PHP|SCALA>",
    "components": [
      {
        "component_type": "<IMPORT_STATEMENT|INCLUDE_STATEMENT|USING_STATEMENT|FUNCTION_DEFINITION|CLASS_DEFINITION|METHOD_DEFINITION|INTERFACE_DEFINITION|STRUCT_DEFINITION|ENUM_DEFINITION|GLOBAL_CODE|MODULE_DEFINITION|PACKAGE_DEFINITION|SCHEMA_DEFINITION|PROCEDURE_DEFINITION|TRIGGER_DEFINITION|MACRO_DEFINITION>",
        "component_name": "<name>",
        "component_code": "<complete_top_level_code_segment>",
        "component_description": "<technical_description>"
      }
    ],
    "overall_description": "<complete_code_summary>"
}

Requirements:
- ONLY analyze top-level components (do NOT break down internal implementation details)
- For functions/classes/methods: include the ENTIRE definition (including all nested code)
- Never split components into internal blocks (loops, conditionals, variable declarations)
- Only include the component types listed above
- Skip all other component types completely
- Never include null or empty fields
- Escape all special characters in JSON values properly
- Include complete top-level code segments exactly as they appear
- Provide detailed technical descriptions for each extracted component
- Describe overall functionality and architecture concisely
- Ensure the output is valid JSON that can be parsed directly
- Never add comments, notes, or text outside the JSON structure
- Never use markdown formatting or code blocks
- Maintain consistent JSON structure with proper escaping
- Only output the raw JSON object with no additional text
"""

In [5]:
# Read the source file to analyze. Decode explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open("Test_Code_Files/SimpleNotesApp.py", "r", encoding="utf-8") as f:
    code = f.read()

In [6]:
# Turn every Phase 1 few-shot example into a user/assistant message pair and
# keep a running total of the characters contributed by the examples.
few_shots_prompt = []
count = 0

for shot in few_shots.values():
    user_text = shot[0].strip()
    assistant_text = shot[1].strip()
    few_shots_prompt.extend(
        (
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        )
    )
    count += len(user_text) + len(assistant_text)

In [7]:
# Conversation sent to the Phase 1 model: instructions, then the few-shot
# exchanges, then the code under analysis as the final user turn.
# NOTE(review): the instructions are sent with role "assistant" rather than
# "system" — confirm this matches the format the model was fine-tuned on.
messages = [
    {"role": "assistant", "content": system_prompt.strip()},
    *few_shots_prompt,
    {"role": "user", "content": code.strip()},
]

In [8]:
# Render the conversation through the tokenizer's chat template, requesting a
# trailing generation prompt.
# NOTE(review): `tokenize` is left at its default, so `prompt` is presumably
# token ids rather than text — confirm `generate` accepts that form.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

In [9]:
# Assistant response: run the Phase 1 model on the templated prompt.
# Generation is capped at 3072 new tokens.
response = generate(
    model,
    tokenizer,
    prompt=prompt,
    max_tokens=3072
)

In [10]:
# Show the raw model output before attempting to parse it.
print(response)

{
    "programming_language": "PYTHON",
    "components": [
        {
            "component_type": "IMPORT_STATEMENT",
            "component_name": "flask",
            "component_code": "from flask import Flask, request, session, redirect, url_for, render_template_string, send_from_directory",
            "component_description": "Imports Flask, request, session, redirect, url_for, render_template_string, and send_from_directory from the Flask library."
        },
        {
            "component_type": "IMPORT_STATEMENT",
            "component_name": "os",
            "component_code": "import os",
            "component_description": "Imports the os module for file system operations."
        },
        {
            "component_type": "IMPORT_STATEMENT",
            "component_name": "uuid",
            "component_code": "import uuid",
            "component_description": "Imports the uuid module for generating unique identifiers."
        },
        {
            "component_type

In [11]:
# Parse the Phase 1 output. On malformed JSON the helper returns the
# JSONDecodeError object instead of raising, so `json_response` may be an
# exception object rather than parsed data.
json_response = parse_and_print_json(response)

Parsed JSON Successfully!
programming_language:
 - PYTHON
components:
 - {'component_type': 'IMPORT_STATEMENT', 'component_name': 'flask', 'component_code': 'from flask import Flask, request, session, redirect, url_for, render_template_string, send_from_directory', 'component_description': 'Imports Flask, request, session, redirect, url_for, render_template_string, and send_from_directory from the Flask library.'}
 - {'component_type': 'IMPORT_STATEMENT', 'component_name': 'os', 'component_code': 'import os', 'component_description': 'Imports the os module for file system operations.'}
 - {'component_type': 'IMPORT_STATEMENT', 'component_name': 'uuid', 'component_code': 'import uuid', 'component_description': 'Imports the uuid module for generating unique identifiers.'}
 - {'component_type': 'CLASS_DEFINITION', 'component_name': 'FlaskApp', 'component_code': 'class FlaskApp:\n    def __init__(self):\n        self.app = Flask(__name__)\n        self.app.secret_key = \'supersecretkey\'  

In [12]:
# Load the Phase 2 fine-tuned model and tokenizer. This rebinds `model` and
# `tokenizer`, so the Phase 1 model is no longer reachable through these names.
model, tokenizer = load(path_or_hf_repo="Phase_2_MLX_Fine_Tuned")

In [13]:
# Phase 2 instructions: ask the model to audit one component and emit a JSON
# object listing its vulnerabilities. This rebinds `system_prompt`, replacing
# the Phase 1 instructions defined earlier.
system_prompt = """
You are a highly skilled AI Security Code Auditor designed to deeply analyze source code components.

You must:
1. Identify any security vulnerabilities in each code component.
2. For each vulnerability found, provide:
    - The exact vulnerability type (e.g., "Insecure Deserialization", "Broken Authentication").
    - A CVE-like severity score on a scale of 0.0 to 10.0.
    - 3-4 realistic, step-by-step exploitation methods, showing how an attacker can practically exploit this vulnerability.
      (Each method must be described clearly, including prerequisites and the sequence of actions.)
    - Specific, actionable developer fixes to eliminate or mitigate the vulnerability.
      (Avoid vague advice like "use better security"; be exact.)

Additional Requirements:
- If no vulnerabilities are found, return an empty list for vulnerabilities.
- Maintain a consistent output format in strict JSON:
  
  {
      "component_type": "<COMPONENT_TYPE>",
      "component_name": "<COMPONENT_NAME>",
      "component_code": "<COMPONENT_CODE>",
      "component_description": "<COMPONENT_DESCRIPTION>",
      "vulnerabilities": [
          {
              "vul_type": "<VULNERABILITY_TYPE>",
              "cve_score": <FLOAT_SCORE>,
              "exploitation_methods": [
                  "<Step-by-step exploitation method 1>",
                  "<Step-by-step exploitation method 2>",
                  "<Step-by-step exploitation method 3>",
                  "<Optional: Step-by-step exploitation method 4>"
              ],
              "developer_fixes": [
                  "<Specific fix 1>",
                  "<Specific fix 2>",
                  "<Optional: fix 3>"
              ]
          },
          ...
      ]
  }

Rules:
- Be highly realistic. Only suggest exploitation methods that are practical in real-world attack scenarios.
- Be very strict in vulnerability identification — do not miss major vulnerabilities.
- Be concise but complete. No unnecessary repetition.
- Do not hallucinate vulnerabilities if the code is safe.
- Output only the structured JSON. No explanations outside JSON.

Important:
- You must think like a real security researcher.
- If a component includes multiple vulnerabilities, list each separately under the "vulnerabilities" field.

Output must be deterministic, clean, and immediately parsable by downstream systems.
"""

In [14]:
# Turn every Phase 2 few-shot example into a user/assistant message pair and
# keep a running total of the characters contributed by the examples.
few_shots_prompt = []
count = 0

for shot in few_shots_2.values():
    user_text = shot[0].strip()
    assistant_text = shot[1].strip()
    few_shots_prompt.extend(
        (
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        )
    )
    count += len(user_text) + len(assistant_text)

In [15]:
# Phase 2: audit each Phase 1 component separately and collect the raw model
# outputs (one string per component) in `results`.
results = []

if not isinstance(json_response, dict):
    # parse_and_print_json returns the JSONDecodeError object when Phase 1
    # output is malformed; fail with an explicit message instead of an opaque
    # "object is not subscriptable" TypeError.
    raise TypeError(
        "Phase 1 output was not parsed into a JSON object "
        f"(got {type(json_response).__name__}); cannot audit components."
    )

# The instructions and few-shot examples are the same for every component, so
# build that message prefix once instead of on every iteration.
# NOTE(review): the instructions are sent with role "assistant" rather than
# "system" — confirm this matches the format the model was fine-tuned on.
prompt_prefix = [
    {"role": "assistant", "content": system_prompt.strip()},
    *few_shots_prompt,
]

if json_response["components"]:
    for component in json_response["components"]:
        # NOTE(review): str(component) yields a Python dict repr (single
        # quotes), not JSON — confirm this matches the few-shot user inputs.
        messages = prompt_prefix + [{"role": "user", "content": str(component)}]

        prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

        # Assistant response, capped at 3072 new tokens.
        response = generate(
            model,
            tokenizer,
            prompt=prompt,
            max_tokens=3072
        )

        results.append(response)

In [16]:
# Decode each raw Phase 2 output; keep the ones that are valid JSON.
json_result = []
for index, result in enumerate(results or []):
    try:
        # Use a dedicated name: rebinding `json_response` here would clobber
        # the Phase 1 parse result that the audit cell reads on a re-run.
        parsed_result = json.loads(result)
    except json.JSONDecodeError as e:
        # Report the failure instead of dropping the component silently.
        print(f"Result {index}: JSON Decode Error: {e}")
        continue
    json_result.append(parsed_result)
    print("Parsed JSON Successfully!")

Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!
Parsed JSON Successfully!


In [17]:
# Pretty-print every parsed audit result, each followed by a divider line.
divider = "-" * 25
for audited_component in json_result:
    print(json.dumps(audited_component, indent=4), divider, sep="\n")

{
    "component_type": "IMPORT_STATEMENT",
    "component_name": "flask",
    "component_code": "from flask import Flask, request, session, redirect, url_for, render_template_string, send_from_directory",
    "component_description": "Imports Flask, request, session, redirect, url_for, render_template_string, and send_from_directory from the Flask library.",
    "vulnerabilities": []
}
-------------------------
{
    "component_type": "IMPORT_STATEMENT",
    "component_name": "os",
    "component_code": "import os",
    "component_description": "Imports the os module for file system operations.",
    "vulnerabilities": []
}
-------------------------
{
    "component_type": "IMPORT_STATEMENT",
    "component_name": "uuid",
    "component_code": "import uuid",
    "component_description": "Imports the uuid module for generating unique identifiers."
}
-------------------------
{
    "component_type": "METHOD_DEFINITION",
    "component_name": "FlaskApp.login",
    "component_description"