In [8]:
import json
import os
from itertools import islice

import torch
from ollama import chat
from ollama import ChatResponse

In [9]:
def load_json(filepath):
    """Read a JSON document from ``filepath`` and return the parsed object.

    The file is opened explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON (a message is printed in both cases).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {filepath}")
        return None

In [1]:
def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    Any exception raised while opening or writing the file is caught and
    reported on stdout instead of being propagated; a confirmation line is
    printed when the write succeeds. Nothing is returned either way.
    """
    try:
        with open(filepath, 'w') as sink:
            json.dump(data, sink, indent=4)
    except Exception as err:
        print(f"Error saving data to {filepath}: {err}")
    else:
        print(f"Data saved successfully to {filepath}")

In [11]:
# Registry of system prompts, filled in by the cells that follow.
system_prompt_dict = dict()

In [12]:
# System prompt for Python inputs: asks the model to answer with a single JSON
# object that lists the components of the snippet plus an overall description.
# The text below is sent to the model verbatim, so edits change model behaviour.
system_prompt_dict["Python"] = """
You are an expert Python code analyst. Your task is to dissect a given Python code snippet and provide a structured JSON response detailing all of its components and overall functionality. You must strictly adhere to the following JSON format:
{
  "output": {
    "programming_language": "Python",
    "components": [
      {
        "component_type": "<TYPE_OF_PYTHON_COMPONENT>",
        "component_name": "<NAME_OF_COMPONENT_IF_APPLICABLE>",
        "component_code": "<THE_ACTUAL_PYTHON_CODE_OF_THE_COMPONENT>",
        "component_description": "<DETAILED_DESCRIPTION_OF_COMPONENT_FUNCTIONALITY>"
      },
      { /* ... more components ... */ }
    ],
    "overall_description": "<DETAILED_SUMMARY/DESCRIPTION_OF_THE_ENTIRE_PYTHON_CODE_FUNCTIONALITY>"
  }
}

**Instructions:**
1. Identify the Programming Language: The script is written in Python.
2. Component Types: Use appropriate component types, for example, IMPORT_STATEMENT, FUNCTION_DEFINITION, CLASS_DEFINITION, METHOD_DEFINITION, VARIABLE_DECLARATION, IF_STATEMENT, LOOP, TRY_EXCEPT_BLOCK, DECORATOR, GENERATOR, COMPREHENSION, MODULE_DEFINITION.
3. Component Names: Provide the correct identifier for each component (function name, class name, variable name, etc.), or NULL if not applicable.
4. Component Code: Include the complete, unmodified Python code for each component.
5. Component Descriptions: Provide a detailed, technical explanation of what each component does, including handling exceptions, iterators, list comprehensions, and object-oriented features.
6. Overall Description: Provide a detailed summary of the entire Python code, explaining its purpose, architecture, and how components interact.
7. Strict JSON Output: Your ENTIRE response must be ONLY the valid JSON object. Do not include any explanations, introductions, or additional text outside the JSON structure.

Analyze the following Python code properly and return ONLY the JSON response with no additional text:
"""

In [13]:
def string_to_json(input_string):
    """Parse ``input_string`` as JSON without raising on malformed input.

    Returns:
        A ``(value, error)`` pair: ``(parsed_value, None)`` when decoding
        succeeds, or ``(None, message)`` where ``message`` is the text of the
        ``json.JSONDecodeError`` when it fails.
    """
    try:
        parsed = json.loads(input_string)
    except json.JSONDecodeError as err:
        return None, str(err)
    else:
        return parsed, None

In [14]:
def make_data(data, system_prompt, model_name="qwen2.5-coder:32b"):
    """Query the chat model once per entry of ``data`` and collect the replies.

    Args:
        data: Mapping of sample key -> text sent as the user message
            (presumably source-code strings; confirm against the dataset).
        system_prompt: Text sent as the system message of every request.
        model_name: Name of the model passed to ``chat``. Defaults to the
            model this notebook was originally run with.

    Returns:
        A dict with the same keys as ``data``; each value is
        ``{"input": <original text>, "output": <model reply content>}``.
    """
    output_data = {}

    for key, value in data.items():
        response: ChatResponse = chat(model=model_name, messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': value,
            }
        ])

        output_data[key] = {
            "input": value,
            "output": response['message']['content']
        }

        # Progress marker so long runs show which sample just finished.
        print("Processed:", key)

    return output_data

In [15]:
# Number of leading samples from the training file to send to the model.
num_keys = 25

data = load_json("python_train.json")

# load_json returns None on a missing or malformed file, which is rejected here.
if not isinstance(data, dict):
    raise TypeError("Input must be a dictionary.")

if not isinstance(num_keys, int) or num_keys < 0:
    raise ValueError("num_keys must be a non-negative integer.")

# Keep only the first num_keys items, in the dict's insertion order.
sliced_dict = dict(islice(data.items(), num_keys))

# Drop the reference to the full dataset; only the slice is used afterwards.
del data

In [16]:
# Sanity check: report how many samples were selected.
print(f"Length of sliced dictionary: {len(sliced_dict)}")

Length of sliced dictionary: 25


In [17]:
# Run the model over every selected sample (one chat request per entry).
output = make_data(
    data=sliced_dict,
    system_prompt=system_prompt_dict["Python"],
)

Processed: python_0
Processed: python_1
Processed: python_2
Processed: python_3
Processed: python_4
Processed: python_5
Processed: python_6
Processed: python_7
Processed: python_8
Processed: python_9
Processed: python_10
Processed: python_11
Processed: python_12
Processed: python_13
Processed: python_14
Processed: python_15
Processed: python_16
Processed: python_17
Processed: python_18
Processed: python_19
Processed: python_20
Processed: python_21
Processed: python_22
Processed: python_23
Processed: python_24


In [21]:
# Dump every input/response pair, separated by an 80-character rule.
for record in output.values():
    print(
        f"Input: \n{record['input']}\n",
        f"Output: \n{record['output']}\n",
        "-" * 80,
        sep="\n",
    )

Input: 
from argparse import ArgumentParser, _HelpAction
from pkgutil import get_data
from sys import exit

# flie basename no extension
LICENSES = [
    "agpl-3.0",
    "apache-2.0",
    "bsd-2-clause",
    "bsd-3-clause",
    "epl-2.0",
    "gpl-2.0",
    "gpl-3.0",
    "lgpl-2.1",
    "lgpl-3.0",
    "mit",
    "mpl-2.0",
    "unlicenses",
    "996icu-0.1",
]


def getparser():
    parser = ArgumentParser(
        prog="gen-license",
        description="tools to create license file, support GitHub LICENSE code.",
    )

    parser.add_argument(
        "code", help="LICENSE Code, --list to see", choices=LICENSES,
        nargs="?", const=None
    )

    parser.add_argument(
        "--list", dest="list", help="Show supported LICENSE Codes", required=False,
        action="store_true"
    )

    parser.add_argument(
        "--996icu", dest="icu", help="Expand LICENSE with 996ICU LICENSE, Choose a language vesion or default zh-cn",
        required=False, nargs="?", const="zh-cn", d

In [22]:
import pickle

# Persist the raw model responses to disk as a pickle file.
with open("data.pickle", mode="wb") as pickle_file:
    pickle.dump(output, pickle_file)

In [2]:
import pickle

# Restore the saved responses from disk. pickle.load can execute arbitrary
# code, so only use it on files you produced yourself.
with open("data.pickle", mode="rb") as pickle_file:
    loaded_output = pickle.load(pickle_file)

print("Loaded output from pickle file:")
for index, record in enumerate(loaded_output.values()):
    print(f"Input {index}: \n{record['input']}\n")
    print(f"Output {index}: \n{record['output']}\n")
    print("-" * 80)

Loaded output from pickle file:
Input 0: 
from argparse import ArgumentParser, _HelpAction
from pkgutil import get_data
from sys import exit

# flie basename no extension
LICENSES = [
    "agpl-3.0",
    "apache-2.0",
    "bsd-2-clause",
    "bsd-3-clause",
    "epl-2.0",
    "gpl-2.0",
    "gpl-3.0",
    "lgpl-2.1",
    "lgpl-3.0",
    "mit",
    "mpl-2.0",
    "unlicenses",
    "996icu-0.1",
]


def getparser():
    parser = ArgumentParser(
        prog="gen-license",
        description="tools to create license file, support GitHub LICENSE code.",
    )

    parser.add_argument(
        "code", help="LICENSE Code, --list to see", choices=LICENSES,
        nargs="?", const=None
    )

    parser.add_argument(
        "--list", dest="list", help="Show supported LICENSE Codes", required=False,
        action="store_true"
    )

    parser.add_argument(
        "--996icu", dest="icu", help="Expand LICENSE with 996ICU LICENSE, Choose a language vesion or default zh-cn",
        required=

In [4]:
# Manual fix-up for python_5: keep the text up to two characters before the
# first ``` fence. str.index raises ValueError if the fence is absent, so this
# cell is not re-runnable once the fence has been cut away.
python_5_record = loaded_output["python_5"]
python_5_fence_at = python_5_record["output"].index("```")
python_5_record["output"] = python_5_record["output"][:python_5_fence_at - 2]

In [5]:
# Manual fix-up for python_13: keep the text up to two characters before the
# first ``` fence. str.index raises ValueError if the fence is absent, so this
# cell is not re-runnable once the fence has been cut away.
python_13_record = loaded_output["python_13"]
python_13_fence_at = python_13_record["output"].index("```")
python_13_record["output"] = python_13_record["output"][:python_13_fence_at - 2]


In [6]:
# Manual fix-up for python_14: cut the response two characters before the
# stray chat-template marker. str.index raises ValueError if the marker is
# absent, e.g. when this cell is run a second time.
python_14_record = loaded_output["python_14"]
python_14_marker_at = python_14_record["output"].index("<|im_start|>{Create Answer}")
python_14_record["output"] = python_14_record["output"][:python_14_marker_at - 2]

In [7]:
# Append a closing brace on its own line to the python_14 response.
# Run once only: every execution appends another "\n}".
loaded_output["python_14"]["output"] += "\n}"

In [9]:
# Manual fix-up for python_17: cut the response two characters before the
# trailing explanatory sentence, then append a closing brace on its own line.
# str.index raises ValueError if the sentence is absent (e.g. on a second run).
python_17_record = loaded_output["python_17"]
python_17_tail_at = python_17_record["output"].index(
    "The provided JSON structure includes detailed descriptions"
)
python_17_record["output"] = python_17_record["output"][:python_17_tail_at - 2]
python_17_record["output"] += "\n}"

In [10]:
# Manual fix-up for python_22: keep the text up to one character before the
# first ``` fence. str.index raises ValueError if the fence is absent, so this
# cell is not re-runnable once the fence has been cut away.
python_22_record = loaded_output["python_22"]
python_22_fence_at = python_22_record["output"].index("```")
python_22_record["output"] = python_22_record["output"][:python_22_fence_at - 1]

In [None]:
# After manual verification: drop the samples at these positions (0-based, in
# the iteration order of loaded_output) and keep everything else.
# NOTE(review): the recorded output of this cell and of the following
# `final_data.keys()` cell still show index 7 / 'python_7' as kept, although 7
# is listed below -- the saved output looks stale; confirm which is intended.
remove_ind_num = [0, 1, 7, 16, 18, 23, 24]

final_data = {}
for position, sample_key in enumerate(loaded_output):
    if position in remove_ind_num:
        continue
    print(f"Keeping index {position} in output.")
    final_data[sample_key] = loaded_output[sample_key]

Keeping index 2 in output.
Keeping index 3 in output.
Keeping index 4 in output.
Keeping index 5 in output.
Keeping index 6 in output.
Keeping index 7 in output.
Keeping index 8 in output.
Keeping index 9 in output.
Keeping index 10 in output.
Keeping index 11 in output.
Keeping index 12 in output.
Keeping index 13 in output.
Keeping index 14 in output.
Keeping index 15 in output.
Keeping index 17 in output.
Keeping index 19 in output.
Keeping index 20 in output.
Keeping index 21 in output.
Keeping index 22 in output.


In [12]:
# Display the keys currently held in final_data.
final_data.keys()

dict_keys(['python_2', 'python_3', 'python_4', 'python_5', 'python_6', 'python_7', 'python_8', 'python_9', 'python_10', 'python_11', 'python_12', 'python_13', 'python_14', 'python_15', 'python_17', 'python_19', 'python_20', 'python_21', 'python_22'])

In [13]:
# Write the retained samples to a JSON file.
# `json` is imported here as well so that save_json (which uses the module at
# call time) works even if the top import cell has not been run in this kernel.
import json

output_json_path = "final_python_data.json"

try:
    save_json(final_data, output_json_path)
except Exception as err:
    # save_json reports its own I/O errors; this catches anything raised
    # before or outside of that handling.
    print(f"Error saving final data to JSON: {err}")

Data saved successfully to final_python_data.json
