## Just column names of dataset

In [None]:
from langchain_core.messages import SystemMessage
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import pandas as pd

# Setup the prompt templates
# The human turn is a template with a single `{request}` variable.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
# The chat prompt is a fixed system message followed by the human turn.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])
# NOTE(review): no seed (or any other sampling option) is passed to the model
# here, so nothing in this cell makes generations reproducible - confirm
# whether a seed was intended.
model = Ollama(model="llama3")

# Function to generate code with LLM


def generate_code(request):
    """Format ``request`` with the module-level chat prompt and query the model.

    Args:
        request: Text substituted for the ``{request}`` variable of ``chat_prompt``.

    Returns:
        Whatever ``model.invoke`` returns for the formatted messages.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages)

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Strip markdown/chat wrapping from LLM output and fill in the CSV path.

    The previous implementation split the text on the substring ``"python"``
    and kept only the second piece, which silently truncated the code at any
    later occurrence of that word (for example inside a comment or a string).
    The language tag is now removed only where it belongs: directly after a
    code fence, or as a bare first line.

    Args:
        generated_code: Raw text returned by the model.
        csv_path: Text substituted for every literal ``{csv_path}`` placeholder.

    Returns:
        The cleaned code as a single string.
    """
    import re  # local import so the block does not depend on file-level imports

    # Drop every fence marker, together with a python language tag when the
    # tag immediately follows the fence (```python, ```py, ```python3).
    without_fences = re.sub(
        r"```[ \t]*(?:(?:python3?|py)\b)?",
        "",
        generated_code,
        flags=re.IGNORECASE,
    ).strip()

    # Remove conversational lead-in lines such as "Here is the code:".
    kept_lines = [
        line
        for line in without_fences.split("\n")
        if not line.lower().startswith("here is the")
    ]
    cleaned_code = "\n".join(kept_lines).strip()

    # A language tag that appears alone on the first line (no fence) is not code.
    first_line, _, remainder = cleaned_code.partition("\n")
    if first_line.strip().lower() in ("python", "py"):
        cleaned_code = remainder.strip()

    return cleaned_code.replace("{csv_path}", f"{csv_path}")

# Load dataset column names


def get_dataset_columns(csv_path):
    """Return the column names of the CSV file at ``csv_path`` as a list.

    Only the header is needed, so the file is read with ``nrows=0`` instead of
    loading every data row just to discard it.

    Args:
        csv_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        list: Column labels in file order.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

# Main function for Part 1


def main_part1(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    code_filename="generated_code.py",
):
    """Ask the model for an ML script for the dataset and save it to disk.

    The prompt only tells the model the dataset's column names and asks it to
    use a literal ``{csv_path}`` placeholder, which is then replaced with the
    quoted path before the script is written.

    Args:
        csv_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls behave the same.
        code_filename: File the generated script is written to. Note that
            ``main_part2`` validates the fixed name ``"generated_code.py"``.
    """
    columns = get_dataset_columns(csv_path)
    columns_info = ", ".join(map(str, columns))

    request = (
        f"Write a Python code for reading a CSV file, handling missing values, encoding categorical variables, "
        f"splitting the data into training and test sets, training an appropriate model, and evaluating the model's performance. "
        f"Use placeholders like {{csv_path}} for dynamic inputs. The dataset has the following columns: {columns_info}. Only return the code without any explanations."
    )

    generated_code = generate_code(request)
    # repr() yields a valid Python string literal even when the path contains
    # quotes or backslashes; for ordinary paths it equals the old f"'{...}'".
    corrected_code = clean_and_correct_code(generated_code, repr(str(csv_path)))

    # Python decodes source files as UTF-8 by default, so write the script in
    # that encoding rather than the platform's locale encoding.
    with open(code_filename, "w", encoding="utf-8") as file:
        file.write(corrected_code)

    print(f"Generated code saved to {code_filename}")


if __name__ == "__main__":
    main_part1()

In [None]:
import subprocess

# Function to validate the code


def validate_code(code_filename, timeout=None):
    """Run a Python script in a subprocess and report whether it succeeded.

    The script is executed with the interpreter that is running this code
    (``sys.executable``) instead of whatever ``python`` happens to be first on
    PATH, so it sees the same installed packages as the notebook kernel.

    Args:
        code_filename: Path of the script to execute.
        timeout: Optional limit in seconds passed to ``subprocess.run``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        tuple: ``(True, stdout)`` when the script exits with status 0,
        otherwise ``(False, message)`` where ``message`` is the script's
        stderr or the text of the exception raised while launching it.
    """
    import sys  # local import so the block does not depend on file-level imports

    # sys.executable can be empty in embedded interpreters; fall back to PATH.
    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as e:
        # Best effort by design: launch failures and timeouts are reported to
        # the caller as a failed validation instead of propagating.
        return False, str(e)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

# Main function for Part 2


def main_part2():
    """Validate ``generated_code.py`` and print the outcome.

    Returns:
        tuple: The ``(success, output)`` pair produced by ``validate_code``.
    """
    script_path = "generated_code.py"
    ok, captured = validate_code(script_path)

    headline = (
        "Code validated successfully. Output:"
        if ok
        else "Code validation failed with error:"
    )
    print(headline)
    print(captured)

    return ok, captured


if __name__ == "__main__":
    success, output = main_part2()

In [None]:
# Main function for Part 3
def main_part3(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
):
    """Validate the generated script once and, on failure, ask the model to fix it.

    The current contents of ``generated_code.py`` are validated through
    ``main_part2``. If validation fails, the code and the error text are sent
    back to the model and the reply is saved to ``validated_generated_code.py``
    (the reply itself is not re-validated here).

    Args:
        csv_path: Location of the CSV file, substituted for any ``{csv_path}``
            placeholder in the model's reply. Defaults to the path that was
            previously hard-coded.
    """
    # The script is Python source, which the interpreter decodes as UTF-8.
    # Undecodable bytes are replaced because the text is only used in a prompt.
    with open("generated_code.py", "r", encoding="utf-8", errors="replace") as file:
        initial_code = file.read()

    success, output = main_part2()

    if success:
        print("No need to fix the code, it runs successfully.")
        return

    request = f"Fix the following code and its errors:\n{initial_code}\nError:\n{output}"
    generated_code = generate_code(request)
    # repr() yields a valid Python string literal even when the path contains
    # quotes or backslashes; for ordinary paths it equals the old f"'{...}'".
    corrected_code = clean_and_correct_code(generated_code, repr(str(csv_path)))

    validated_code_filename = "validated_generated_code.py"
    with open(validated_code_filename, "w", encoding="utf-8") as file:
        file.write(corrected_code)

    print(f"Validated code saved to {validated_code_filename}")


if __name__ == "__main__":
    main_part3()

## More info about dataset

In [1]:
from langchain_core.messages import SystemMessage
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import HumanMessagePromptTemplate
import pandas as pd
import subprocess

# Setup the prompt templates
# The human turn is a template with a single `{request}` variable.
human_prompt = HumanMessagePromptTemplate.from_template("{request}")
# The chat prompt is a fixed system message followed by the human turn.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a highly skilled data scientist with 20 years of experience. You specialize in writing clean, efficient, and error-free ML code. Generate only the code snippets without any explanations or comments."),
    human_prompt
])
# The model is created with only its name; no sampling options are passed.
model = Ollama(model="llama3")

# Function to generate code with LLM


def generate_code(request):
    """Format ``request`` with the module-level chat prompt and query the model.

    Args:
        request: Text substituted for the ``{request}`` variable of ``chat_prompt``.

    Returns:
        Whatever ``model.invoke`` returns for the formatted messages.
    """
    messages = chat_prompt.format_prompt(request=request).to_messages()
    return model.invoke(messages)

# Function to clean and correct the code


def clean_and_correct_code(generated_code, csv_path):
    """Strip markdown/chat wrapping from LLM output and fill in the CSV path.

    The previous implementation split the text on the substring ``"python"``
    and kept only the second piece, which silently truncated the code at any
    later occurrence of that word (for example inside a comment or a string).
    The language tag is now removed only where it belongs: directly after a
    code fence, or as a bare first line.

    Args:
        generated_code: Raw text returned by the model.
        csv_path: Text substituted for every literal ``{csv_path}`` placeholder.

    Returns:
        The cleaned code as a single string.
    """
    import re  # local import so the block does not depend on file-level imports

    # Drop every fence marker, together with a python language tag when the
    # tag immediately follows the fence (```python, ```py, ```python3).
    without_fences = re.sub(
        r"```[ \t]*(?:(?:python3?|py)\b)?",
        "",
        generated_code,
        flags=re.IGNORECASE,
    ).strip()

    # Remove conversational lead-in lines such as "Here is the code:".
    kept_lines = [
        line
        for line in without_fences.split("\n")
        if not line.lower().startswith("here is the")
    ]
    cleaned_code = "\n".join(kept_lines).strip()

    # A language tag that appears alone on the first line (no fence) is not code.
    first_line, _, remainder = cleaned_code.partition("\n")
    if first_line.strip().lower() in ("python", "py"):
        cleaned_code = remainder.strip()

    return cleaned_code.replace("{csv_path}", f"{csv_path}")

# Load dataset information


def get_dataset_info(csv_path):
    """Load the CSV at ``csv_path`` and summarise it as plain Python containers.

    Returns:
        tuple: ``(columns, types, sample_data, value_counts, description)`` where
        ``columns`` is the list of column labels, ``types`` maps column to dtype,
        ``sample_data`` maps column to the values of ``DataFrame.head()``,
        ``value_counts`` maps column to a ``{value: count}`` dict, and
        ``description`` is ``DataFrame.describe()`` converted to a dict.
    """
    frame = pd.read_csv(csv_path)

    counts_by_column = {}
    for name in frame.columns:
        counts_by_column[name] = frame[name].value_counts().to_dict()

    return (
        frame.columns.tolist(),
        frame.dtypes.to_dict(),
        frame.head().to_dict(orient='list'),
        counts_by_column,
        frame.describe().to_dict(),
    )

# Main function for Part 1


def main_part1(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    code_filename="generated_code.py",
):
    """Ask the model for an ML script using a detailed dataset summary, then save it.

    The prompt includes column names, dtypes, sample rows, value counts and
    summary statistics. The model is asked to use a literal ``{csv_path}``
    placeholder, which is replaced with the real path before saving.

    Args:
        csv_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls behave the same.
        code_filename: File the generated script is written to. Note that
            ``main_part2`` validates the fixed name ``"generated_code.py"``.
    """
    columns, types, sample_data, value_counts, description = get_dataset_info(
        csv_path)

    columns_info = ", ".join(columns)
    types_info = ", ".join([f"{col}: {typ}" for col, typ in types.items()])
    sample_data_info = ", ".join(
        [f"{col}: {vals[:5]}" for col, vals in sample_data.items()])
    # Only the five most frequent values per column go into the prompt.
    value_counts_info = ", ".join(
        [f"{col}: {dict(list(vc.items())[:5])}" for col, vc in value_counts.items()])
    description_info = ", ".join(
        [f"{col}: {desc}" for col, desc in description.items()])

    # The braces around csv_path are doubled so the prompt contains the literal
    # text "{csv_path}". With single braces the f-string inserted the real path,
    # which made the placeholder sentence meaningless and the replacement below
    # a no-op.
    request = (
        f"Write a Python code for reading a CSV file, handling missing values, "
        f"splitting the data into training and test sets, training an appropriate model, and evaluating the model's performance. "
        f"Use placeholders like {{csv_path}} for dynamic inputs. The dataset has the following columns: {columns_info}. "
        f"The data types are: {types_info}. Sample data: {sample_data_info}. Value counts: {value_counts_info}. "
        f"Description: {description_info}. Only return the code without any explanations."
    )

    generated_code = generate_code(request)
    # The path is substituted as raw text (no quotes are added here).
    corrected_code = clean_and_correct_code(generated_code, f"{csv_path}")

    # Python decodes source files as UTF-8 by default, so write the script in
    # that encoding rather than the platform's locale encoding.
    with open(code_filename, "w", encoding="utf-8") as file:
        file.write(corrected_code)

    print(f"Generated code saved to {code_filename}")


if __name__ == "__main__":
    main_part1()

# Function to validate the code


def validate_code(code_filename, timeout=None):
    """Run a Python script in a subprocess and report whether it succeeded.

    The script is executed with the interpreter that is running this code
    (``sys.executable``) instead of whatever ``python`` happens to be first on
    PATH, so it sees the same installed packages as the notebook kernel.

    Args:
        code_filename: Path of the script to execute.
        timeout: Optional limit in seconds passed to ``subprocess.run``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        tuple: ``(True, stdout)`` when the script exits with status 0,
        otherwise ``(False, message)`` where ``message`` is the script's
        stderr or the text of the exception raised while launching it.
    """
    import sys  # local import so the block does not depend on file-level imports

    # sys.executable can be empty in embedded interpreters; fall back to PATH.
    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, code_filename],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as e:
        # Best effort by design: launch failures and timeouts are reported to
        # the caller as a failed validation instead of propagating.
        return False, str(e)

    if result.returncode != 0:
        return False, result.stderr
    return True, result.stdout

Generated code saved to generated_code.py


In [2]:
# Main function for Part 2


def main_part2():
    """Validate ``generated_code.py`` and print the outcome.

    Returns:
        tuple: The ``(success, output)`` pair produced by ``validate_code``.
    """
    script_path = "generated_code.py"
    ok, captured = validate_code(script_path)

    headline = (
        "Code validated successfully. Output:"
        if ok
        else "Code validation failed with error:"
    )
    print(headline)
    print(captured)

    return ok, captured


if __name__ == "__main__":
    success, output = main_part2()

Code validation failed with error:
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['charges'].fillna(data['charges'].mean(), inplace=True)
Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Latest_2 copy/generated_code.py", line 16, in <module>
    rf_model.fit(X_train, y_train)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/Users/ilya/minicon

In [3]:

# Main function for Part 3 with multiple validation attempts
def main_part3(
    csv_path="/Users/ilya/Desktop/GitHub_Repositories/HW_University/Data_Mining/datasets/insurance.csv",
    max_attempts=5,
):
    """Repeatedly validate ``generated_code.py`` and ask the model to repair it.

    Each attempt runs the script through ``main_part2``. On failure the code,
    the error text and a dataset summary are sent back to the model, and the
    reply overwrites ``generated_code.py`` for the next attempt. When the same
    error text has been seen on three consecutive attempts, the request is
    reworded to ask for logical and data-handling fixes as well.

    Compared with the earlier version, no fix is requested after the final
    attempt: that reply was never written to disk or validated, so the model
    call was wasted.

    Args:
        csv_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded.
        max_attempts: Maximum number of validation runs (default 5, as before).
    """
    columns, types, sample_data, value_counts, description = get_dataset_info(
        csv_path)

    columns_info = ", ".join(columns)
    types_info = ", ".join([f"{col}: {typ}" for col, typ in types.items()])
    sample_data_info = ", ".join(
        [f"{col}: {vals[:5]}" for col, vals in sample_data.items()])
    # Only the five most frequent values per column go into the prompt.
    value_counts_info = ", ".join(
        [f"{col}: {dict(list(vc.items())[:5])}" for col, vc in value_counts.items()])
    description_info = ", ".join(
        [f"{col}: {desc}" for col, desc in description.items()])
    dataset_context = (
        f"The dataset has the following columns: {columns_info}. "
        f"The data types are: {types_info}. Sample data: {sample_data_info}. Value counts: {value_counts_info}. "
        f"Description: {description_info}."
    )

    # main_part2 validates this fixed file name, so it is not a parameter here.
    code_filename = "generated_code.py"
    # The script is Python source, which the interpreter decodes as UTF-8.
    # Undecodable bytes are replaced because the text is only used in a prompt.
    with open(code_filename, "r", encoding="utf-8", errors="replace") as file:
        validated_code = file.read()

    same_error_count = 0
    last_error = ""
    success = False  # defined up front so the final check works for max_attempts <= 0

    for attempt in range(1, max_attempts + 1):
        success, output = main_part2()
        if success:
            print("Code runs successfully.")
            break

        print(f"Attempt {attempt} failed with error:\n{output}")
        if attempt == max_attempts:
            # No validation run is left, so a new fix could never be checked.
            break

        if output == last_error:
            same_error_count += 1
        else:
            same_error_count = 0
        last_error = output

        if same_error_count >= 2:
            print(
                "The same error occurred multiple times. Modifying the request to help the model.")
            instruction = "Fix the following code and its errors. Ensure to correct any logical mistakes or data handling issues:"
        else:
            instruction = "Fix the following code and its errors:"
        request = (
            f"{instruction}\n{validated_code}\nError:\n{output}. "
            f"{dataset_context}"
        )

        generated_code = generate_code(request)
        validated_code = clean_and_correct_code(
            generated_code, f"{csv_path}")

        # Persist the candidate fix so the next attempt validates it.
        with open(code_filename, "w", encoding="utf-8") as file:
            file.write(validated_code)

    if not success:
        print("Failed to validate the code after multiple attempts.")


if __name__ == "__main__":
    main_part3()

Code validation failed with error:
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['charges'].fillna(data['charges'].mean(), inplace=True)
Traceback (most recent call last):
  File "/Users/ilya/Desktop/GitHub_Repositories/Thesis/Latest_2 copy/generated_code.py", line 16, in <module>
    rf_model.fit(X_train, y_train)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ilya/miniconda3/envs/thesis/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/Users/ilya/minicon