In [None]:
# Import the necessary libraries

import glob
import time
import duckdb
import pandas as pd

In [None]:
# Establish a DuckDB connection; no database path is passed to connect().
# The resulting `conn` object is what the later cells use to run their SQL.

conn = duckdb.connect()

In [None]:
# Load the sales CSV into a pandas DataFrame through DuckDB and report how long it took.

# perf_counter() is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock and can jump if the system time is adjusted.
cur_time = time.perf_counter()

# The path is written as a single-quoted SQL string literal; double quotes denote
# identifiers in SQL, so they are avoided for the file name.
df = conn.execute("""
    SELECT *
    FROM read_csv_auto('./data/Sales_Product_Combined.csv', header=True)
    """).df()
print(f"time: ({time.perf_counter() - cur_time})")

# Preview the first rows via the notebook's rich display instead of printing the frame.
df.head()

In [None]:
# Register the DataFrame with DuckDB so SQL can refer to it by the name below.

view_name = "df_view"
conn.register(view_name, df)

In [None]:
# Show the columns of df_view as reported by DuckDB's DESCRIBE statement.

schema_query = "DESCRIBE df_view"
conn.execute(schema_query).df()

In [None]:
# Count how many rows df_view contains.

row_count_query = "SELECT count(*) FROM df_view"
conn.execute(row_count_query).df()

In [None]:
# Count the null values per column, then drop the rows that are entirely empty.

# Capture the counts before dropping so they describe the data as it was loaded.
null_counts = df.isnull().sum()

# how='all' removes only rows in which every column is null; rows that are
# partially filled are kept.
df = df.dropna(how='all')

# NOTE(review): the "df_view" registration was made before this reassignment, so it
# presumably still refers to the earlier DataFrame object - confirm whether it
# needs to be registered again.

# A cell only displays its last expression, so the counts go last; in the previous
# layout they were computed mid-cell and never shown.
null_counts

In [None]:
# Trial run of a WHERE clause: fetch the rows of df whose "Order ID" equals '295665'.

order_lookup_query = """SELECT * FROM df WHERE "Order ID"='295665'"""
conn.execute(order_lookup_query).df()

In [None]:
# Materialise the DataFrame as the `sales` table, casting each column to its target
# type, renaming columns to snake_case, and keeping only the rows whose "Order ID"
# can be cast to INTEGER.

create_sales_sql = """
             CREATE OR REPLACE TABLE sales AS
                SELECT
                    "Order ID"::INTEGER AS order_id,
                    Product AS product,
                    "Quantity Ordered"::INTEGER AS quantity,
                    REPLACE("Price", ',', '')::DECIMAL AS price,
                    strptime("Order Date", '%Y-%m-%d %H:%M:%S')::DATE as order_date,
                    "Purchase Address" AS purchase_address
                FROM df
                WHERE
                    TRY_CAST("Order ID" AS INTEGER) NOTNULL    
             """
conn.execute(create_sales_sql)

In [None]:
# Select every column of `sales` except the ones listed in the EXCLUDE clause.

exclude_query = "SELECT * EXCLUDE (product, order_date, purchase_address) FROM sales"
conn.execute(exclude_query).df()

In [None]:
# Apply MIN to each column of `sales` that is not listed in the EXCLUDE clause.

min_columns_query = """
             SELECT 
                MIN(COLUMNS(* EXCLUDE (product, order_date, purchase_address))) FROM sales
             """
conn.execute(min_columns_query).df()

In [None]:
# Create the `aggregated_sales` view on top of the `sales` table.
#
# GROUP BY ALL groups by every non-aggregated column in the SELECT list
# (order_id, month, city); nb_orders and revenue are aggregated within each group.
#
# city: DuckDB list indexing is 1-based, so [2] is the segment after the first comma.
# Assumes addresses look like "street, city, state zip" - TODO confirm against the data.
# TRIM removes the whitespace that follows the comma separator, so city values do not
# start with a blank.

conn.execute("""
             CREATE OR REPLACE VIEW aggregated_sales AS
             SELECT
                order_id,
                COUNT(1) AS nb_orders,
                MONTH(order_date) AS month,
                TRIM(STR_SPLIT(purchase_address, ',')[2]) AS city,
                SUM(quantity * price) AS revenue
            FROM sales
            GROUP BY ALL
             """)

In [None]:
# Fetch the full contents of the aggregated_sales view as a DataFrame.
preview_query = "FROM aggregated_sales"
conn.execute(preview_query).df()

In [None]:
# Sum the revenue of aggregated_sales for each city, largest total first.

city_revenue_query = """
             SELECT
                city,
                SUM(revenue) AS total
            FROM aggregated_sales
            GROUP BY city
            ORDER BY total DESC
             """
conn.execute(city_revenue_query).df()

In [None]:
# Write the contents of aggregated_sales to 'aggregated_sales.parquet' in Parquet format.

export_query = "COPY (FROM aggregated_sales) TO 'aggregated_sales.parquet' (FORMAT 'parquet')"
conn.execute(export_query)

In [None]:
# Read the Parquet file back into a DataFrame.

# The file name is quoted as a string literal so it is always treated as a path.
# Left bare, aggregated_sales.parquet parses as a schema-qualified table name (and
# its first part matches the existing aggregated_sales view); whether that falls
# back to a file scan appears to depend on the DuckDB version.
conn.execute("FROM 'aggregated_sales.parquet'").df()