In [None]:
from langchain_core.messages import SystemMessage
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
import subprocess
import os
import pandas as pd

# Setup the prompt templates
# `chat_prompt` is a two-message chat template: a fixed system instruction
# followed by a human message whose text is filled from the `request` variable.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])

# Initialize the Hugging Face model
# The tokenizer and model weights are loaded by id with default arguments
# (no explicit dtype or device is requested here).
model_id = "Qwen/CodeQwen1.5-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
# Text-generation pipeline configured with max_new_tokens=500.
pipe = pipeline("text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=500)
# LangChain wrapper around the pipeline; this is the object generate_code() calls.
hf_model = HuggingFacePipeline(pipeline=pipe)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Standalone demo: load Qwen1.5-7B-Chat and generate one reply to a fixed prompt.
# NOTE: this rebinds the global names `model` and `tokenizer`. A pipeline that
# was built earlier from the previous objects keeps its own references to them.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")

prompt = "Give me a short introduction to large language model."

# Single-turn conversation containing only the user message.
messages = [{"role": "user", "content": prompt}]

# Render the conversation to a prompt string (tokenize=False) and append the
# generation prompt so the model continues as the assistant.
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True)

# Tokenize as a batch of one and return PyTorch tensors.
model_inputs = tokenizer([text], return_tensors="pt")

# Only input_ids are passed to generate(); the rest of `model_inputs` is not
# forwarded. do_sample=True enables sampling, so output can differ between runs.
generated_ids = model.generate(
    model_inputs.input_ids, max_new_tokens=512, do_sample=True)

# Drop the prompt tokens from the front of each output sequence so that only
# the newly generated tokens are decoded.
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(
    model_inputs.input_ids, generated_ids)]

# Decode the batch and take the first (only) entry.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

In [None]:
# Location of the dataset used by this cell. This is an absolute path on one
# machine; the check below fails fast with FileNotFoundError when it is missing.
csv_path = "/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/Iris.csv"
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV file not found at path: {csv_path}")


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path``.

    Only the header row is parsed (``nrows=0``), so the cost does not grow
    with the number of data rows in the file.

    Args:
        csv_path: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        list: Column labels in file order.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()


def generate_code(request):
    """Send ``request`` to the LLM through the chat prompt and return its reply.

    The module-level ``chat_prompt`` is rendered to a single string with
    ``request`` substituted in, and that string is passed to the module-level
    ``hf_model``.

    Args:
        request: Text placed into the human message of the prompt.

    Returns:
        Whatever ``hf_model`` returns for the rendered prompt (presumably the
        generated text as a ``str`` -- confirm against the LangChain version in use).
    """
    prompt_value = chat_prompt.format_prompt(request=request)
    return hf_model(prompt_value.to_string())

# NOTE: clean_code() and validate_code() are not defined in this cell; the cell
# that defines them must have been executed before this one.
columns = get_dataset_columns(csv_path)
columns_info = ", ".join(columns)

# Provide a simpler prompt to the LLM
simple_prompt = (
    f"Write a Python script to perform the following tasks using the dataset with columns: {columns_info}:\n"
    "1. Read a CSV file and load it into a pandas DataFrame.\n"
    "2. Print the first 5 rows of the DataFrame.\n"
    "Use placeholders like {csv_path} for dynamic inputs. Only return the code without any explanations."
)

# Generate and validate the code.
# The number of attempts is capped so that a generation that never validates
# cannot keep the cell (and the model) running forever.
MAX_ATTEMPTS = 5
request = simple_prompt
success = False
for attempt in range(1, MAX_ATTEMPTS + 1):
    generated_code = generate_code(request)
    cleaned_code = clean_code(generated_code)
    success, output = validate_code(cleaned_code)
    if success:
        print("Code validated successfully. Output:")
        print(output)
        break
    print("Validation failed with error:")
    print(output)
    # Update the request to include the error for the LLM to regenerate the code
    request = (
        f"Fix the following code and its errors:\n{cleaned_code}\nError:\n{output}\n"
        f"Ensure you use the correct column names from the dataset: {columns_info}."
    )

if not success:
    # Nothing is saved in this case: only validated code is written to disk.
    raise RuntimeError(
        f"Generated code still fails validation after {MAX_ATTEMPTS} attempts."
    )

# Save the validated code to a .py file
simple_code_filename = "simple_validated_code.py"
with open(simple_code_filename, "w", encoding="utf-8") as file:
    file.write(cleaned_code)

print(f"Validated code saved to {simple_code_filename}")

In [None]:
import tempfile
# Function to generate code with LLM
def generate_code(request):
    """Ask the LLM for a completion of ``request`` and return the raw reply.

    ``request`` is inserted into the module-level ``chat_prompt``, the prompt
    is flattened to one string, and the module-level ``hf_model`` is called
    with it.

    Args:
        request: Instruction text for the human message of the prompt.

    Returns:
        The value returned by ``hf_model`` for the rendered prompt (presumably
        a ``str`` of generated text -- confirm against the LangChain version in use).
    """
    rendered_prompt = chat_prompt.format_prompt(request=request).to_string()
    return hf_model(rendered_prompt)

# Function to clean and correct the code


def clean_code(generated_code):
    """Extract runnable source from an LLM reply.

    If the reply contains Markdown code fences, the contents of every fenced
    block are returned (joined by a blank line) and surrounding prose is
    dropped. An optional ``python`` / ``python3`` / ``py`` tag right after the
    opening fence is removed, and a final block without a closing fence
    (truncated reply) is still accepted.

    If there are no fences, the reply is returned stripped, minus any line
    that starts with "here is the" (case-insensitive).

    The word "python" appearing inside the code itself is left untouched.

    Args:
        generated_code: Raw text returned by the model.

    Returns:
        str: The cleaned source code (possibly empty).
    """
    import re

    text = generated_code.strip()

    # Opening fence, optional python tag, then the shortest body that ends at
    # the next fence or at the end of the text.
    fence_pattern = re.compile(
        r"```(?:[ \t]*(?:python3?|py)\b)?(.*?)(?:```|\Z)",
        re.DOTALL | re.IGNORECASE,
    )
    fenced_blocks = [body.strip() for body in fence_pattern.findall(text)]
    fenced_blocks = [block for block in fenced_blocks if block]
    if fenced_blocks:
        return "\n\n".join(fenced_blocks)

    # No usable fenced block: treat the whole reply as code, dropping any
    # stray fence markers and the usual "Here is the ..." lead-in lines.
    kept_lines = [
        line
        for line in text.replace("```", "").split("\n")
        if not line.lstrip().lower().startswith("here is the")
    ]
    return "\n".join(kept_lines).strip()

# Function to validate the code


def validate_code(code):
    """Run ``code`` as a standalone script and report whether it succeeded.

    The source is written to a temporary ``.py`` file and executed with the
    same interpreter that is running this process (``sys.executable``), so the
    script sees the same installed packages. A bare ``"python"`` lookup on PATH
    can resolve to a different interpreter or to none at all. The temporary
    file is always removed afterwards.

    Args:
        code: Python source text to execute.

    Returns:
        tuple: ``(True, stdout)`` when the script exits with status 0,
        otherwise ``(False, message)`` where ``message`` is the script's
        stderr, or the text of the exception if the process could not be run.
    """
    import sys

    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
        temp_file.write(code.encode("utf-8"))
        code_filename = temp_file.name

    try:
        result = subprocess.run(
            [sys.executable, code_filename], capture_output=True, text=True
        )
    except Exception as e:
        # Launch or output-decoding problems are reported like a failed run so
        # callers always receive a (bool, str) pair.
        return False, str(e)
    finally:
        if os.path.exists(code_filename):
            os.remove(code_filename)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

# Load dataset column names


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path``.

    Only the header row is parsed (``nrows=0``), so the cost does not grow
    with the number of data rows in the file.

    Args:
        csv_path: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        list: Column labels in file order.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

# Main function to generate and validate code in steps


def main(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/Iris.csv",
    max_attempts=5,
):
    """Generate, validate and save an ML script step by step with the LLM.

    The overall task is first sent to the LLM to be split into steps (one per
    non-empty line of its reply). For each step, code is generated, cleaned
    and executed; on failure the error is fed back to the LLM and the step is
    retried, up to ``max_attempts`` times. Validated step code is concatenated,
    the combined script is executed once more, and the result is written to
    ``validated_combined_code.py`` in the current directory.

    Args:
        csv_path: Path of the dataset CSV. Defaults to the original
            hard-coded location.
        max_attempts: Maximum number of generate/validate rounds per step.
            A value below 1 makes every step fail immediately.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        RuntimeError: If a step still fails validation after ``max_attempts``
            rounds (previously this case looped forever).
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found at path: {csv_path}")

    columns = get_dataset_columns(csv_path)
    columns_info = ", ".join(columns)

    # Provide the whole prompt to the LLM
    whole_prompt = (
        f"Write a Python script to perform the following tasks using the dataset with columns: {columns_info}:\n"
        "1. Read a CSV file and load it into a pandas DataFrame.\n"
        "2. Handle missing values in the DataFrame.\n"
        "3. Encode categorical variables in the DataFrame.\n"
        "4. Split the data into training and test sets.\n"
        "5. Train a Decision Tree classifier using the training set.\n"
        "6. Evaluate the model's performance on the test set.\n"
        "Use placeholders like {csv_path} for dynamic inputs. Only return the code without any explanations."
    )

    # Step 1: Request the LLM to divide the prompt into steps
    steps_prompt = "Divide the following task into smaller steps and provide each step as a separate instruction:\n\n" + whole_prompt
    steps_response = generate_code(steps_prompt)
    # Every non-empty line of the reply is treated as one step.
    steps = [step.strip()
             for step in steps_response.split('\n') if step.strip()]

    combined_code = ""

    for step in steps:
        request = f"Write Python code to {step}. Use the path '{csv_path}' as the CSV file path."
        for _attempt in range(max_attempts):
            generated_code = generate_code(request)
            cleaned_code = clean_code(generated_code)
            success, output = validate_code(cleaned_code)
            if success:
                print(f"Step '{step}' validated successfully.")
                combined_code += cleaned_code + "\n\n"
                break
            print(f"Validation failed for step '{step}' with error:")
            print(output)
            # Update the request to include the error for the LLM to regenerate the code
            request = (
                f"Fix the following code and its errors:\n{cleaned_code}\nError:\n{output}\n"
                f"Ensure you use the correct column names from the dataset: {columns_info}."
            )
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            raise RuntimeError(
                f"Step '{step}' still fails validation after {max_attempts} attempts."
            )

    # Final validation of the combined code
    success, output = validate_code(combined_code)
    if not success:
        print("Final combined code validation failed with error:")
        print(output)
    else:
        print("Final combined code validated successfully. Output:")
        print(output)

    # Save the combined code to a .py file (written even when the final run
    # failed, so it can be inspected).
    validated_code_filename = "validated_combined_code.py"
    with open(validated_code_filename, "w", encoding="utf-8") as file:
        file.write(combined_code)

    # Only claim the saved file is validated when the final run succeeded.
    if success:
        print(f"Validated combined code saved to {validated_code_filename}")
    else:
        print(f"Combined code saved to {validated_code_filename} (final validation failed)")


# Entry point: call main() when this code runs as the top-level module.
# (In a notebook kernel the module name is normally "__main__", so executing
# this cell starts the whole pipeline.)
if __name__ == "__main__":
    main()

In [None]:
from langchain_core.messages import SystemMessage
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
from langchain import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
import subprocess
import os
import pandas as pd

# Setup the prompt templates
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])

# Initialize the Hugging Face model
model_id = "Qwen/CodeQwen1.5-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model,
                tokenizer=tokenizer)
hf_model = HuggingFacePipeline(pipeline=pipe)

# Function to generate code with LLM


def generate_code(request):
    """Send ``request`` to the LLM through the chat prompt and return its reply.

    The module-level ``chat_prompt`` is rendered to a single prompt string and
    passed to the module-level ``hf_model``. ``hf_model`` is a LangChain LLM
    wrapper (``HuggingFacePipeline``), which is called with a string prompt and
    returns the generated text directly. It is not the raw transformers
    pipeline, so a message list must not be passed and the result must not be
    indexed as ``[0]["generated_text"]``.

    Args:
        request: Text placed into the human message of the prompt.

    Returns:
        The value returned by ``hf_model`` for the rendered prompt (the
        generated text).
    """
    formatted_request = chat_prompt.format_prompt(
        request=request).to_string()
    return hf_model(formatted_request)

# Function to clean and correct the code


def clean_code(generated_code):
    """Extract runnable source from an LLM reply.

    If the reply contains Markdown code fences, the contents of every fenced
    block are returned (joined by a blank line) and surrounding prose is
    dropped. An optional ``python`` / ``python3`` / ``py`` tag right after the
    opening fence is removed, and a final block without a closing fence
    (truncated reply) is still accepted.

    If there are no fences, the reply is returned stripped, minus any line
    that starts with "here is the" (case-insensitive).

    The word "python" appearing inside the code itself is left untouched.

    Args:
        generated_code: Raw text returned by the model.

    Returns:
        str: The cleaned source code (possibly empty).
    """
    import re

    text = generated_code.strip()

    # Opening fence, optional python tag, then the shortest body that ends at
    # the next fence or at the end of the text.
    fence_pattern = re.compile(
        r"```(?:[ \t]*(?:python3?|py)\b)?(.*?)(?:```|\Z)",
        re.DOTALL | re.IGNORECASE,
    )
    fenced_blocks = [body.strip() for body in fence_pattern.findall(text)]
    fenced_blocks = [block for block in fenced_blocks if block]
    if fenced_blocks:
        return "\n\n".join(fenced_blocks)

    # No usable fenced block: treat the whole reply as code, dropping any
    # stray fence markers and the usual "Here is the ..." lead-in lines.
    kept_lines = [
        line
        for line in text.replace("```", "").split("\n")
        if not line.lstrip().lower().startswith("here is the")
    ]
    return "\n".join(kept_lines).strip()

# Function to validate the code


def validate_code(code):
    """Run ``code`` as a standalone script and report whether it succeeded.

    The source is written as UTF-8 to a uniquely named temporary ``.py`` file
    instead of a fixed ``temp_code.py`` in the working directory, so an
    existing file of that name is never overwritten or deleted and concurrent
    runs cannot collide. It is executed with the interpreter running this
    process (``sys.executable``), so the script sees the same installed
    packages. The working directory of the child process is unchanged, but the
    script itself now lives in the system temp directory. The temporary file is
    always removed afterwards.

    Args:
        code: Python source text to execute.

    Returns:
        tuple: ``(True, stdout)`` when the script exits with status 0,
        otherwise ``(False, message)`` where ``message`` is the script's
        stderr, or the text of the exception if the process could not be run.
    """
    import sys
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
        temp_file.write(code.encode("utf-8"))
        code_filename = temp_file.name

    try:
        result = subprocess.run(
            [sys.executable, code_filename], capture_output=True, text=True)
    except Exception as e:
        # Launch or output-decoding problems are reported like a failed run so
        # callers always receive a (bool, str) pair.
        return False, str(e)
    finally:
        if os.path.exists(code_filename):
            os.remove(code_filename)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

# Load dataset column names


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path``.

    Only the header row is parsed (``nrows=0``), so the cost does not grow
    with the number of data rows in the file.

    Args:
        csv_path: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        list: Column labels in file order.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

# Main function to generate and validate code in steps


def main(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/Iris.csv",
    max_attempts=5,
):
    """Generate, validate and save an ML script step by step with the LLM.

    The overall task is first sent to the LLM to be split into steps (one per
    non-empty line of its reply). For each step, code is generated, cleaned
    and executed; on failure the error is fed back to the LLM and the step is
    retried, up to ``max_attempts`` times. Validated step code is concatenated,
    the combined script is executed once more, and the result is written to
    ``validated_combined_code.py`` in the current directory.

    Args:
        csv_path: Path of the dataset CSV. Defaults to the original
            hard-coded location.
        max_attempts: Maximum number of generate/validate rounds per step.
            A value below 1 makes every step fail immediately.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        RuntimeError: If a step still fails validation after ``max_attempts``
            rounds (previously this case looped forever).
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found at path: {csv_path}")

    columns = get_dataset_columns(csv_path)
    columns_info = ", ".join(columns)

    # Provide the whole prompt to the LLM
    whole_prompt = (
        f"Write a Python script to perform the following tasks using the dataset with columns: {columns_info}:\n"
        "1. Read a CSV file and load it into a pandas DataFrame.\n"
        "2. Handle missing values in the DataFrame.\n"
        "3. Encode categorical variables in the DataFrame.\n"
        "4. Split the data into training and test sets.\n"
        "5. Train a Decision Tree classifier using the training set.\n"
        "6. Evaluate the model's performance on the test set.\n"
        "Use placeholders like {csv_path} for dynamic inputs. Only return the code without any explanations."
    )

    # Step 1: Request the LLM to divide the prompt into steps
    steps_prompt = "Divide the following task into smaller steps and provide each step as a separate instruction:\n\n" + whole_prompt
    steps_response = generate_code(steps_prompt)
    # Every non-empty line of the reply is treated as one step.
    steps = [step.strip()
             for step in steps_response.split('\n') if step.strip()]

    combined_code = ""

    for step in steps:
        request = f"Write Python code to {step}. Use the path '{csv_path}' as the CSV file path."
        for _attempt in range(max_attempts):
            generated_code = generate_code(request)
            cleaned_code = clean_code(generated_code)
            success, output = validate_code(cleaned_code)
            if success:
                print(f"Step '{step}' validated successfully.")
                combined_code += cleaned_code + "\n\n"
                break
            print(f"Validation failed for step '{step}' with error:")
            print(output)
            # Update the request to include the error for the LLM to regenerate the code
            request = (
                f"Fix the following code and its errors:\n{cleaned_code}\nError:\n{output}\n"
                f"Ensure you use the correct column names from the dataset: {columns_info}."
            )
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            raise RuntimeError(
                f"Step '{step}' still fails validation after {max_attempts} attempts."
            )

    # Final validation of the combined code
    success, output = validate_code(combined_code)
    if not success:
        print("Final combined code validation failed with error:")
        print(output)
    else:
        print("Final combined code validated successfully. Output:")
        print(output)

    # Save the combined code to a .py file (written even when the final run
    # failed, so it can be inspected).
    validated_code_filename = "validated_combined_code.py"
    with open(validated_code_filename, "w", encoding="utf-8") as file:
        file.write(combined_code)

    # Only claim the saved file is validated when the final run succeeded.
    if success:
        print(f"Validated combined code saved to {validated_code_filename}")
    else:
        print(f"Combined code saved to {validated_code_filename} (final validation failed)")


# Entry point: call main() when this code runs as the top-level module.
# (In a notebook kernel the module name is normally "__main__", so executing
# this cell starts the whole pipeline.)
if __name__ == "__main__":
    main()