# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
import json
import random

def generate_dataset(num_samples):
    """Generate a dataset of database operations and matching SQL statements.

    Each sample picks a random action ("query", "insert", "update" or
    "delete") and a random table, builds a JSON description of the
    operation and renders the corresponding SQL string.

    Args:
        num_samples: The number of samples to generate.

    Returns:
        A list of dictionaries, where each dictionary contains an "input"
        (JSON string representing the database operation) and an "output"
        (SQL string).
    """
    dataset = []
    for _ in range(num_samples):
        # Randomly choose an action type and the table it applies to.
        action = random.choice(["query", "insert", "update", "delete"])
        table = random.choice(["users", "products", "orders", "customers", "employees", "books", "sales"])

        input_data = {"action": action, "table": table}
        output_sql = ""

        if action == "query":
            # Pick between one and all of the table's columns (plus "*").
            table_columns = get_table_columns(table)
            columns = random.sample(["*"] + table_columns, random.randint(1, len(table_columns) + 1))
            input_data["columns"] = columns

            where_clause = generate_where_clause(table)
            if where_clause:
                input_data["where"] = where_clause

            # Reset for every sample: without this the name is unbound when
            # the JOIN branch is skipped (UnboundLocalError), or it carries a
            # stale JOIN over from a previous iteration.
            join_data = None
            if random.random() < 0.3:  # 30% chance to generate a JOIN
                join_data = generate_join_clause(table)
                if join_data:
                    input_data["join"] = join_data

            output_sql = generate_select_sql(table, columns, where_clause, join_data)

        elif action == "insert":
            data = generate_insert_data(table)
            input_data["data"] = data
            output_sql = generate_insert_sql(table, data)

        elif action == "update":
            data = generate_update_data(table)
            input_data["data"] = data

            where_clause = generate_where_clause(table)
            if where_clause:
                input_data["where"] = where_clause
            output_sql = generate_update_sql(table, data, where_clause)

        elif action == "delete":
            where_clause = generate_where_clause(table)
            if where_clause:
                input_data["where"] = where_clause
            output_sql = generate_delete_sql(table, where_clause)

        dataset.append({"input": json.dumps(input_data), "output": output_sql})
    return dataset


def get_table_columns(table):
    """Look up the column names known for a table.

    Args:
        table: The name of the table.

    Returns:
        A list of column names, or an empty list for an unknown table.
    """
    # The schema is rebuilt on every call, so callers receive a fresh list.
    schema = dict(
        users=["id", "name", "email", "age"],
        products=["id", "name", "price", "category"],
        orders=["order_id", "status", "customer_id"],
        customers=["customer_id", "name", "country"],
        employees=["id", "name", "salary", "department_id"],
        books=["title", "author", "publication_year"],
        sales=["order_date", "amount"],
    )
    try:
        return schema[table]
    except KeyError:
        return []


def generate_where_clause(table):
    """Build a random single-condition WHERE clause for a table.

    Args:
        table: The name of the table.

    Returns:
        A string of the form "<column> <operator> <value>", or None when
        the table has no known columns.
    """
    candidates = get_table_columns(table)
    if candidates:
        # Draw order (column, operator, value) matters for seeded runs.
        picked = random.choice(candidates)
        comparator = random.choice(["=", ">", "<", ">=", "<=", "!="])
        literal = generate_value_for_column(picked)
        return "{} {} {}".format(picked, comparator, literal)
    return None

def generate_join_clause(table):
    """Describe the JOIN available for a table, if there is one.

    Args:
        table: The name of the table.

    Returns:
        A dictionary with the joined "table", the "on" condition and the
        joined "columns", or None when no JOIN is defined for the table.
    """
    # Only the employees table has a JOIN target defined.
    if table != "employees":
        return None
    return dict(
        table="departments",
        on="employees.department_id = departments.id",
        columns=["department_name"],
    )


def generate_value_for_column(column):
    """Produce a random, already single-quoted placeholder value.

    Args:
        column: The name of the column (currently not used to vary the value).

    Returns:
        A string holding one of three placeholder values wrapped in single quotes.
    """
    # Placeholder pool; the column name does not influence the choice yet.
    placeholder = random.choice(['value1', 'value2', 'value3'])
    return "'" + placeholder + "'"


def generate_insert_data(table):
    """Build a full row of random values for an INSERT statement.

    Args:
        table: The name of the table.

    Returns:
        A dictionary mapping every column of the table to a generated value.
    """
    # One generated value per column, in the table's column order.
    return {
        name: generate_value_for_column(name)
        for name in get_table_columns(table)
    }


def generate_update_data(table):
    """Build random column/value pairs for an UPDATE statement.

    Args:
        table: The name of the table.

    Returns:
        A dictionary mapping a random non-empty subset of the table's
        columns to generated values.
    """
    available = get_table_columns(table)
    # Decide how many columns to touch first, then which ones.
    how_many = random.randint(1, len(available))
    chosen = random.sample(available, how_many)
    return {name: generate_value_for_column(name) for name in chosen}

def generate_select_sql(table, columns, where_clause, join_data):
    """Generate a SELECT SQL statement.

    When ``join_data`` lists columns to pull from the joined table, every
    selected column is qualified with its table name ("*" becomes
    "<table>.*") and the joined columns are appended to the select list.
    The select list is built once up front; patching the finished string
    would duplicate the column list and the FROM keyword.

    Args:
        table: The name of the table.
        columns: A list of columns to select. An empty list selects "*".
        where_clause: A string representing the WHERE clause, or None.
        join_data: A dictionary representing the JOIN clause ("table", "on"
            and optionally "columns"), or None.

    Returns:
        A string representing the SELECT SQL statement.
    """
    if join_data and "columns" in join_data:
        # Qualify both sides so names shared by the two tables stay unambiguous.
        select_list = [f"{table}.{col}" for col in columns]
        select_list += [f"{join_data['table']}.{col}" for col in join_data["columns"]]
    else:
        select_list = list(columns)

    # An empty select list would render as invalid SQL ("SELECT  FROM ...").
    if not select_list:
        select_list = ["*"]

    sql = "SELECT " + ", ".join(select_list) + " FROM " + table
    if join_data:
        sql += " JOIN " + join_data["table"] + " ON " + join_data["on"]
    if where_clause:
        sql += " WHERE " + where_clause
    sql += ";"
    return sql

def _sql_literal(value):
    """Render a value as a single-quoted SQL string literal.

    Strings that are already wrapped in single quotes are returned
    unchanged so they are not quoted twice. Anything else is converted
    with str(), has embedded single quotes doubled, and is then wrapped
    in single quotes.
    """
    if isinstance(value, str) and len(value) >= 2 and value[0] == "'" and value[-1] == "'":
        return value
    return "'" + str(value).replace("'", "''") + "'"


def generate_insert_sql(table, data):
    """Generates an INSERT SQL statement.

    Args:
        table: The name of the table.
        data: A dictionary representing the data to be inserted. Values may
            be raw or already single-quoted; each is quoted exactly once.

    Returns:
        A string representing the INSERT SQL statement.
    """
    columns = ", ".join(data)
    values = ", ".join(_sql_literal(v) for v in data.values())
    return f"INSERT INTO {table} ({columns}) VALUES ({values});"


def generate_update_sql(table, data, where_clause):
    """Generates an UPDATE SQL statement.

    Args:
        table: The name of the table.
        data: A dictionary representing the data to be updated. Values may
            be raw or already single-quoted; each is quoted exactly once.
        where_clause: A string representing the WHERE clause, or None.

    Returns:
        A string representing the UPDATE SQL statement.
    """
    updates = ", ".join(f"{k} = {_sql_literal(v)}" for k, v in data.items())
    sql = f"UPDATE {table} SET {updates}"
    if where_clause:
        sql += " WHERE " + where_clause
    sql += ";"
    return sql


def generate_delete_sql(table, where_clause):
    """Render a DELETE statement for a table.

    Args:
        table: The name of the table.
        where_clause: A string representing the WHERE clause, or None to
            leave the statement unfiltered.

    Returns:
        A string representing the DELETE SQL statement.
    """
    pieces = ["DELETE FROM {}".format(table)]
    if where_clause:
        # Only filtered deletes carry a WHERE part.
        pieces.append(" WHERE " + where_clause)
    pieces.append(";")
    return "".join(pieces)


# Build a small demonstration dataset of ten input/SQL pairs.
dataset = generate_dataset(num_samples=10)

# Print the