Day 8: SQLite3 - Local Database Mastery

Goal: Master local database operations with Python

In [None]:
import sqlite3
import pandas as pd

# The standard connection pattern you'll use everywhere
def connect_to_database(db_name):
    """Open a connection to the SQLite database at ``db_name``.

    A status line is printed in both outcomes. Returns the open
    ``sqlite3.Connection``, or ``None`` when ``sqlite3`` raises an error
    while connecting.
    """
    try:
        connection = sqlite3.connect(db_name)
    except sqlite3.Error as exc:
        print(f"Error connecting to database: {exc}")
        return None
    else:
        print(f"Successfully connected to {db_name}")
        return connection

# Usage: conn is the open connection, or None if connecting failed
conn = connect_to_database('sample_business.db')

In [None]:
# Open the business database and list every table recorded in sqlite_master.
conn = sqlite3.connect('business_data.db')
cursor = conn.cursor()
tables = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall()

# Each entry is a one-element tuple holding a table name.
print("Tables found:", tables)
conn.close()


Created a sample sqlite3 database ('business_data.db') and tables, then populated with sample data.

In [None]:
import sqlite3
import pandas as pd
def insert_sample_data(db_name='business_data.db'):
    """Insert (or refresh) the sample rows used throughout this notebook.

    Writes five rows each into ``customers``, ``products``, ``orders`` and
    ``order_items``. The tables must already exist; this function only
    inserts. Every statement is ``INSERT OR REPLACE`` with explicit ids, so
    re-running it overwrites the same rows instead of duplicating them
    (assuming the id columns are primary keys -- the schema is not defined
    here).

    Args:
        db_name: Path of the SQLite database file to populate.

    Raises:
        sqlite3.Error: If any insert fails (for example a table is missing).
            Nothing is committed in that case and the connection is closed.
    """
    # Sample customers:
    # (customer_id, company_name, contact_name, region, country, signup_date, is_active)
    customers_data = [
        (1, 'TechCorp Solutions', 'Alice Johnson', 'North', 'USA', '2023-01-15', 1),
        (2, 'Global Industries', 'Bob Smith', 'South', 'USA', '2023-02-20', 1),
        (3, 'Innovation Labs', 'Carol Davis', 'East', 'USA', '2023-03-10', 1),
        (4, 'Digital Dynamics', 'David Wilson', 'West', 'USA', '2023-01-25', 0),
        (5, 'Future Systems', 'Emma Brown', 'Central', 'USA', '2023-04-05', 1)
    ]

    # Sample products:
    # (product_id, product_name, category, unit_price, stock_quantity)
    products_data = [
        (1, 'Software License', 'Software', 299.99, 100),
        (2, 'Consulting Hours', 'Service', 150.00, 999),
        (3, 'Training Workshop', 'Training', 500.00, 50),
        (4, 'Support Package', 'Service', 99.99, 200),
        (5, 'Hardware Kit', 'Hardware', 750.00, 25)
    ]

    # Sample orders:
    # (order_id, customer_id, order_date, total_amount, status, sales_rep)
    orders_data = [
        (1, 1, '2023-05-01', 1499.95, 'Completed', 'Alice Rep'),
        (2, 2, '2023-05-02', 899.99, 'Completed', 'Bob Rep'),
        (3, 3, '2023-05-03', 1750.00, 'Pending', 'Carol Rep'),
        (4, 1, '2023-05-04', 599.98, 'Completed', 'Alice Rep'),
        (5, 5, '2023-05-05', 2250.00, 'Processing', 'Emma Rep')
    ]

    # Sample order items:
    # (item_id, order_id, product_id, quantity, unit_price)
    order_items_data = [
        (1, 1, 1, 1, 1499.95),  # order 1, product 1
        (2, 2, 2, 1, 899.99),   # order 2, product 2
        (3, 3, 3, 2, 875.00),   # order 3, product 3
        (4, 4, 4, 1, 599.98),   # order 4 (customer 1 again), product 4
        (5, 5, 5, 3, 750.00)    # order 5, product 5
    ]

    conn = sqlite3.connect(db_name)
    try:
        # Using the connection as a context manager commits when the block
        # succeeds and rolls back if any statement raises, so the four
        # tables are filled all-or-nothing.
        with conn:
            cursor = conn.cursor()

            cursor.executemany('''
            INSERT OR REPLACE INTO customers 
            (customer_id, company_name, contact_name, region, country, signup_date, is_active)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', customers_data)

            cursor.executemany('''
            INSERT OR REPLACE INTO products 
            (product_id, product_name, category, unit_price, stock_quantity)
            VALUES (?, ?, ?, ?, ?)
            ''', products_data)

            cursor.executemany('''
            INSERT OR REPLACE INTO orders 
            (order_id, customer_id, order_date, total_amount, status, sales_rep)
            VALUES (?, ?, ?, ?, ?, ?)
            ''', orders_data)

            cursor.executemany('''
            INSERT OR REPLACE INTO order_items (item_id, order_id, product_id, quantity, unit_price)
            VALUES (?, ?, ?, ?, ?)
            ''', order_items_data)
    finally:
        # The context manager above does not close the connection.
        conn.close()

    print("Sample data inserted successfully!")

# Populate the database (insert_sample_data only inserts rows; it creates no tables)
insert_sample_data()

Next, execute a SQL query and return the results as a pandas DataFrame. This pattern is used constantly in AI work. (The `execute_query_to_dataframe` helper used in the template is defined in a later cell.)

**The template for any SQL query**
```python
def my_business_query():
    query = """
    -- Your actual SQL query here
    SELECT column1, column2
    FROM table1
    WHERE condition
    """
    return execute_query_to_dataframe(query)
```
**Execute and analyze**
```python
df_result = my_business_query()
print(df_result.describe())  # Combine SQL with pandas analysis!
```

In [None]:
import sqlite3
import pandas as pd

def get_db_schema_as_dataframe(db_name='business_data.db'):
    """Return a DataFrame listing every column of every table in the database.

    Table names are read from ``sqlite_master`` and each table is described
    with ``PRAGMA table_info``. The result has one row per column.

    Args:
        db_name: Path of the SQLite database file to inspect.

    Returns:
        A DataFrame with the columns ``table``, ``column_id``,
        ``column_name``, ``data_type``, ``not_null``, ``default_value`` and
        ``is_primary_key``. It is empty (but keeps those columns) when the
        database has no tables or when a database error occurs; the error is
        printed rather than raised.
    """
    schema_columns = [
        "table", "column_id", "column_name", "data_type",
        "not_null", "default_value", "is_primary_key",
    ]
    rows = []
    # Bound before the try block so `finally` can test it even when
    # sqlite3.connect() itself raises.
    conn = None

    try:
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        # Get all table names
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        for (table_name,) in tables:
            # Quote the identifier so names containing spaces, quotes or
            # keywords do not break the PRAGMA statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted_name});")

            for col in cursor.fetchall():
                cid, name, col_type, notnull, default, pk = col[:6]
                rows.append({
                    "table": table_name,
                    "column_id": cid,
                    "column_name": name,
                    "data_type": col_type,
                    "not_null": bool(notnull),
                    "default_value": default,
                    "is_primary_key": bool(pk)
                })

        return pd.DataFrame(rows, columns=schema_columns)

    except sqlite3.Error as e:
        print(f"Database error: {e}")
        return pd.DataFrame(columns=schema_columns)
    finally:
        if conn is not None:
            conn.close()
            print(f"Connection to {db_name} closed successfully.")
# Build the schema overview (one row per table column) and display it
df_schema = get_db_schema_as_dataframe()
print(df_schema)



In [None]:
def execute_query_to_dataframe(query, db_name='business_data.db', params=None):
    """Run a SQL query and return its result set as a pandas DataFrame.

    Args:
        query: SQL text; use ``?`` placeholders for values.
        db_name: Path of the SQLite database file to query.
        params: Optional sequence of values bound to the placeholders.
            Passing values this way keeps them out of the SQL text.

    Returns:
        The query result as a DataFrame, or ``None`` if the connection or
        the query failed. The error is printed rather than raised.
    """
    # Bound before the try block so `finally` can test it even when
    # sqlite3.connect() itself raises.
    conn = None
    try:
        conn = sqlite3.connect(db_name)
        # An empty/absent params value means "run the query as-is"
        df = pd.read_sql_query(query, conn, params=params if params else None)
        print("Query executed successfully")
        return df

    # SQLite-specific failures raised directly by the driver
    except sqlite3.Error as e:
        print(f"Database error: {e}")
        return None
    # Anything else, including errors surfaced through pandas
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Single place where the connection is released, on success or failure
        if conn is not None:
            conn.close()
            print(f"Connection to {db_name} closed successfully.")

# Test with familiar SQL queries: a plain filter with no bound parameters.
# The helper returns None instead of a DataFrame when the query fails.
query1 = "SELECT * FROM customers WHERE region = 'North'"
df_north_customers = execute_query_to_dataframe(query1)
print("North Region Customers:")
print(df_north_customers)

# Complex query with JOIN: orders above 1000 with their customer's name and
# region, largest order first
query2 = """
SELECT 
    c.company_name,
    c.region,
    o.order_date,
    o.total_amount,
    o.status
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
WHERE o.total_amount > 1000
ORDER BY o.total_amount DESC
"""
df_large_orders = execute_query_to_dataframe(query2)
print("\nLarge Orders (>$1000):")
print(df_large_orders)



Next, try passing values to the query through the `params` argument.

💡 Best Practice

Always use bound parameters (?, %s, etc.) — especially when working with user input or external data.

In [None]:
def get_customer_orders(customer_id):
    """Return the orders of one customer, joined with the company name.

    The customer id is passed as a bound parameter rather than formatted
    into the SQL text. The result is whatever
    ``execute_query_to_dataframe`` returns for the query.
    """
    # A single value still has to be a tuple, hence the trailing comma,
    # e.g. ('West',)
    bound_values = (customer_id,)
    sql = """
    SELECT 
        o.order_id,
        o.order_date,
        o.total_amount,
        o.status,
        c.company_name
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    WHERE o.customer_id = ?
    """
    return execute_query_to_dataframe(sql, params=bound_values)

# Safe parameter passing: the id 1 is bound to the query's ? placeholder
customer_orders = get_customer_orders(1)
print("Customer 1 Orders:")
print(customer_orders)

def get_orders_by_date_range(start_date, end_date):
    """Return orders whose ``order_date`` lies between the two dates.

    Both dates are bound parameters of a ``BETWEEN ? AND ?`` filter, and the
    rows come back ordered by order date. The result is whatever
    ``execute_query_to_dataframe`` returns for the query.
    """
    date_window = (start_date, end_date)
    sql = """
    SELECT 
        o.order_date,
        c.company_name,
        o.total_amount,
        o.status
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    WHERE o.order_date BETWEEN ? AND ?
    ORDER BY o.order_date
    """
    return execute_query_to_dataframe(sql, params=date_window)

# Date range example: both dates are bound to the BETWEEN ? AND ? placeholders
may_orders = get_orders_by_date_range('2023-05-01', '2023-05-31')
print("\nMay 2023 Orders:")
print(may_orders)

### 🔒 Safe SQL Queries: Bound Parameters vs SQL Injection

### Why Use Bound Parameters?

Bound parameters (also called parameterized queries) **separate the SQL code from user data**, preventing malicious inputs from altering the query structure.

---

### 💣 What is SQL Injection?

**SQL injection** is a type of security vulnerability where attackers manipulate input data to inject and run harmful SQL commands.

### ❌ Example: Unsafe Query (String Interpolation)
```python
region = "West' OR 1=1 --"
query = f"SELECT * FROM customers WHERE region = '{region}'"
```
The resulting SQL:
```SQL
SELECT * FROM customers WHERE region = 'West' OR 1=1 --'
```
- `OR 1=1` is always true → returns **all rows**
- `--` comments out the rest of the SQL query
- 🛑 **Result**: Attacker bypasses the filter and accesses everything
### ✅ Safe Version: Bound Parameters
```python
query = "SELECT * FROM customers WHERE region = ?"
params = ("West' OR 1=1 --",)
df = pd.read_sql_query(query, conn, params=params)
```
- The value is sent to the database separately from the SQL text, so it is never parsed as SQL
- The query behaves as if it had been written:
```SQL
SELECT * FROM customers WHERE region = 'West'' OR 1=1 --'
```
- The malicious code is treated as a literal string, not executable SQL

| Feature               | Unsafe (String Interpolation) | Safe (Bound Parameters) |
| --------------------- | ----------------------------- | ----------------------- |
| 🔐 SQL Injection Risk | ⚠️ High                        | ✅ Prevented             |
| 🔍 SQL Readability    | ❌ Can get messy               | ✅ Cleaner               |
| 🔄 Type Safety        | ❌ Manual quoting              | ✅ Automatic             |
| 💥 Risk of Crashing   | ⚠️ Higher                      | ✅ Lower                 |





____________________
### Writing Data Back to Database

**🔍 What cursor.execute() Does**

**🛠 Function:**
The cursor.execute() method sends a SQL command (like INSERT, SELECT, UPDATE, or DELETE) to the database engine through your connection.

**🔍 What is a Cursor?**

A cursor in SQLite (and most databases) is like a temporary "pointer" that:

- Sends SQL commands to the database
- Retrieves results from SELECT queries
- Handles row-by-row processing (if needed)

In [None]:
def insert_new_customer(company_name, contact_name, region, country, db_name='business_data.db'):
    """Insert a new active customer and return its generated id.

    The signup date is set to the current date by SQLite (``date('now')``)
    and ``is_active`` is set to 1.

    Args:
        company_name: Company name of the new customer.
        contact_name: Name of the contact person.
        region: Sales region of the customer.
        country: Country of the customer.
        db_name: Path of the SQLite database file to write to.

    Returns:
        The row id reported by the cursor for the inserted row, or ``None``
        if the insert failed. The error is printed and the transaction is
        rolled back.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute('''
        INSERT INTO customers (company_name, contact_name, region, country, signup_date, is_active)
        VALUES (?, ?, ?, ?, date('now'), 1)
        ''', (company_name, contact_name, region, country))

        customer_id = cursor.lastrowid
        conn.commit()
    except sqlite3.Error as e:
        # Discard the failed write before reporting it
        conn.rollback()
        print(f"Error inserting customer: {e}")
        return None
    finally:
        # Runs on every path, so the connection never leaks
        conn.close()

    print(f"New customer added with ID: {customer_id}")
    return customer_id

# Add new customer; new_id is the id of the inserted row, or None if the insert failed
new_id = insert_new_customer('StartupXYZ', 'Frank Miller', 'West', 'USA')

def update_order_status(order_id, new_status, db_name='business_data.db'):
    """Set the status of one order and return how many rows changed.

    Args:
        order_id: Id of the order to update.
        new_status: Status text to store.
        db_name: Path of the SQLite database file to write to.

    Returns:
        The number of rows the UPDATE affected. ``0`` means either that no
        order has this id or that the update failed; in the failure case the
        error is printed and the transaction is rolled back.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute('''
        UPDATE orders 
        SET status = ?
        WHERE order_id = ?
        ''', (new_status, order_id))

        rows_affected = cursor.rowcount
        conn.commit()
    except sqlite3.Error as e:
        # Discard the failed write before reporting it
        conn.rollback()
        print(f"Error updating order: {e}")
        return 0
    finally:
        # Runs on every path, so the connection never leaks
        conn.close()

    print(f"Updated {rows_affected} order(s)")
    return rows_affected

# Update order status: mark order 3 as completed (the call returns the affected row count)
update_order_status(3, 'Completed')

| Step                             | Action                                                   |
| -------------------------------- | -------------------------------------------------------- |
| 🔌 `conn = sqlite3.connect(...)` | Connects to the database                                 |
| 🎯 `cursor = conn.cursor()`      | Creates a tool (cursor) to interact with that connection |
| 🧾 `cursor.execute(...)`         | Executes a SQL command using that tool                   |

**After Executing?**
- You might use `cursor.fetchone()`, `fetchall()`, or `lastrowid` to read results.
- Always remember to `conn.commit()` for write operations, and `conn.close()` afterward.

In [None]:
def generate_sales_report(start_date, end_date):
    """Summarise orders per customer region for a date range.

    For every region with orders dated between ``start_date`` and
    ``end_date`` the query reports the order count, total revenue, average
    order value and number of distinct customers, highest revenue first.
    The result is whatever ``execute_query_to_dataframe`` returns.
    """
    date_window = (start_date, end_date)
    report_sql = """
    SELECT 
        c.region,
        COUNT(o.order_id) as total_orders,
        SUM(o.total_amount) as total_revenue,
        AVG(o.total_amount) as avg_order_value,
        COUNT(DISTINCT c.customer_id) as unique_customers
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    WHERE o.order_date BETWEEN ? AND ?
    GROUP BY c.region
    ORDER BY total_revenue DESC
    """
    return execute_query_to_dataframe(report_sql, params=date_window)

# Generate report
sales_report = generate_sales_report('2023-05-01', '2023-05-31')
print("Sales Report by Region:")
print(sales_report)

# Combine with pandas for enhanced analysis.
# The query helper hands back None when the query fails, so only derive the
# extra column when a DataFrame actually came back.
if sales_report is not None:
    sales_report['revenue_per_customer'] = sales_report['total_revenue'] / sales_report['unique_customers']
    print("\nEnhanced with Pandas:")
    print(sales_report[['region', 'revenue_per_customer']])

**📦 `cursor.fetchmany(size)` in SQLite**

#### 🔍 What It Does

The `.fetchmany(size)` method retrieves the **next `size` number of rows** from the result set.  
It’s useful for processing large datasets in **manageable chunks**.

#### Key Points
- Returns a list of tuples
- When no rows remain (or none matched at all), it returns an empty list: []
- Must be called after cursor.execute(...)
- Each tuple represents a row from the table

---

#### ✅ Example

```python
import sqlite3

# Connect to database
conn = sqlite3.connect('business_data.db')
cursor = conn.cursor()

# Execute query
cursor.execute("SELECT company_name, region FROM customers")

# Fetch rows in chunks of 3
print("Fetching in batches of 3:")
while True:
    batch = cursor.fetchmany(3)
    if not batch:
        break  # No more rows
    for row in batch:
        print(row)

conn.close()
```
**🔄 Comparison Summary**

| Method         | Returns                      | Use Case                        |
| -------------- | ---------------------------- | ------------------------------- |
| `fetchone()`   | Single tuple or `None`       | Step through rows one at a time |
| `fetchmany(n)` | List of up to `n` tuples     | Batch processing                |
| `fetchall()`   | List of all remaining tuples | Small/medium datasets           |

**💡 Best Practice:** Use `fetchmany()` when you are working with large datasets and want to avoid loading everything at once.

______

#### ❓ Does `conn.close()` Also Close the Cursor?

##### ✅ Yes — Closing the Connection Closes Its Cursor

When you call `conn.close()` it **closes the database connection** and also **invalidates** any cursors created from that connection.

#### 🔍 What Happens to the Cursor?

After `conn.close()`, the cursor is no longer usable

If you try to use it (e.g., `.execute()` or `.fetchall()`), you'll get a `sqlite3.ProgrammingError`

#### ✅ Best Practice

| Action                          | Needed?    | Notes                                 |
| ------------------------------- | ---------- | ------------------------------------- |
| `cursor.close()`                | Optional   | Only if you want to free it early     |
| `conn.close()`                  | Required ✅ | Always close the connection when done |
| Use cursor after `conn.close()` | ❌ Invalid  | Will raise a `ProgrammingError`       |


In [None]:
# Testing some queries for practice

def generate_order_report(status, customer_id):
    """List the order items of one customer for orders in a given status.

    Each row joins an order item with its product, its order and the
    ordering customer. ``status`` and ``customer_id`` are bound, in that
    order, to the two ``?`` placeholders of the WHERE clause. The result is
    whatever ``execute_query_to_dataframe`` returns.
    """
    # Values are bound as a tuple, in placeholder order
    filters = (status, customer_id)
    report_sql = """
        SELECT o.order_id,
                o.order_date,
                c.company_name,
                p.product_name,
                i.quantity,
                i.unit_price,
                o.status
        FROM order_items i
        JOIN products p ON i.product_id = p.product_id
        JOIN orders o ON i.order_id = o.order_id
        JOIN customers c ON o.customer_id = c.customer_id
        WHERE o.status = ? and c.customer_id = ?
    """
    return execute_query_to_dataframe(report_sql, params=filters)
# Generate report
order_report = generate_order_report('Completed',1)
print("Order Report:")
print(order_report)

# The query helper hands back None when the query fails, so only run the
# pandas post-processing when a DataFrame actually came back.
if order_report is not None:
    # Create total_amount column (line total = unit price x quantity)
    order_report['total_amount'] = order_report['unit_price'] * order_report['quantity']

    # Grouping and Aggregation - Total Sales by Company
    grouped_report = order_report.groupby(['company_name'])['total_amount'].sum()
    print("\nTotal Sales by Company:")
    print(grouped_report)

In [39]:
# Additional Claude.ai provided code for practice joins 

def practice_joins(db_name='business_data.db'):
    """Print four example JOIN reports from the sample business database.

    The reports are: (1) orders with customer details via INNER JOIN,
    (2) order items with product names via a multi-table INNER JOIN,
    (3) every product with its order frequency via LEFT JOIN, and
    (4) a per-customer order summary via LEFT JOIN with aggregates.

    Args:
        db_name: Path of the SQLite database file to read. The tables
            ``customers``, ``orders``, ``order_items`` and ``products`` must
            already exist.

    Raises:
        sqlite3.Error: If a query fails. The connection is closed either way.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        print("\n=== PRACTICE JOIN QUERIES ===\n")

        # 1. Inner Join - Orders with Customer Details
        print("1. Orders with Customer Information:")
        cursor.execute('''
        SELECT o.order_id, c.company_name, o.order_date, o.total_amount, o.status
        FROM orders o
        INNER JOIN customers c ON o.customer_id = c.customer_id
        ORDER BY o.order_date
        ''')

        for row in cursor.fetchall():
            print(f"   Order {row[0]}: {row[1]} - ${row[3]:.2f} ({row[4]})")

        # 2. Three-way Join - Order Details with Products
        # Items of the same order are tie-broken by item_id; sorting by
        # oi.order_id would add nothing because it equals o.order_id here.
        print("\n2. Detailed Order Items with Product Names:")
        cursor.execute('''
        SELECT c.company_name, o.order_id, p.product_name, oi.quantity
        FROM orders o
        INNER JOIN customers c ON o.customer_id = c.customer_id
        INNER JOIN order_items oi ON o.order_id = oi.order_id
        INNER JOIN products p ON oi.product_id = p.product_id
        ORDER BY o.order_id, oi.item_id
        ''')

        for row in cursor.fetchall():
            print(f"   {row[0]} - Order {row[1]}: {row[2]} (Qty: {row[3]}) ")

        # 3. Left Join - All Products with Order Counts
        # COUNT(oi.order_id) skips NULLs, so unordered products report 0.
        print("\n3. Products with Order Frequency (including unordered products):")
        cursor.execute('''
        SELECT p.product_name, COUNT(oi.order_id) as times_ordered, 
               COALESCE(SUM(oi.quantity), 0) as total_quantity_sold
        FROM products p
        LEFT JOIN order_items oi ON p.product_id = oi.product_id
        GROUP BY p.product_id, p.product_name
        ORDER BY times_ordered DESC
        ''')

        for row in cursor.fetchall():
            print(f"   {row[0]}: Ordered {row[1]} times, Total qty: {row[2]}")

        # 4. Aggregation with Joins
        print("\n4. Customer Order Summary:")
        cursor.execute('''
        SELECT c.company_name, 
               COUNT(DISTINCT o.order_id) as total_orders,
               SUM(o.total_amount) as total_spent,
               AVG(o.total_amount) as avg_order_value
        FROM customers c
        LEFT JOIN orders o ON c.customer_id = o.customer_id
        GROUP BY c.customer_id, c.company_name
        ORDER BY total_spent DESC
        ''')

        for row in cursor.fetchall():
            # SUM/AVG are NULL (None) for customers without orders
            spent = row[2] if row[2] else 0
            avg = row[3] if row[3] else 0
            print(f"   {row[0]}: {row[1]} orders, ${spent:.2f} total, ${avg:.2f} avg")
    finally:
        # Release the connection even if a query or a format call fails
        conn.close()

# Run the join examples when this code is executed directly rather than imported
if __name__ == "__main__":
    practice_joins()
    


=== PRACTICE JOIN QUERIES ===

1. Orders with Customer Information:
   Order 1: TechCorp Solutions - $1499.95 (Completed)
   Order 2: Global Industries - $899.99 (Completed)
   Order 3: Innovation Labs - $1750.00 (Pending)
   Order 4: TechCorp Solutions - $599.98 (Completed)
   Order 5: Future Systems - $2250.00 (Processing)

2. Detailed Order Items with Product Names:
   TechCorp Solutions - Order 1: Software License (Qty: 1) 
   Global Industries - Order 2: Consulting Hours (Qty: 1) 
   Innovation Labs - Order 3: Training Workshop (Qty: 2) 
   TechCorp Solutions - Order 4: Support Package (Qty: 1) 
   Future Systems - Order 5: Hardware Kit (Qty: 3) 

3. Products with Order Frequency (including unordered products):
   Software License: Ordered 1 times, Total qty: 1
   Consulting Hours: Ordered 1 times, Total qty: 1
   Training Workshop: Ordered 1 times, Total qty: 2
   Support Package: Ordered 1 times, Total qty: 1
   Hardware Kit: Ordered 1 times, Total qty: 3

4. Customer Order Sum

*Note to self - as I usually default to Left Join when querying...*

**🔍 INNER JOIN vs LEFT JOIN in Order Details Query**

### ✅ The Query:
```sql
SELECT c.company_name, o.order_id, p.product_name, oi.quantity, oi.unit_price
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id
INNER JOIN order_items oi ON o.order_id = oi.order_id
INNER JOIN products p ON oi.product_id = p.product_id
ORDER BY o.order_id, oi.item_id
```

**❓ Why Use `INNER JOIN`?**
- You want only complete, connected records
- The report shows order details, so partial data (e.g., missing products or customers) would be misleading
- All the joins are on required relationships: each order should have a customer, items, and product references

**🧾 What Would `LEFT JOIN` Do?**

`LEFT JOIN` would include:
- Orders with no customer → company_name = NULL
- Orders with no items → item columns = NULL
- Items with missing products → product_name = NULL

This could be useful for audits or error checking, but not for a clean report of actual completed order records.

**🔄 Summary Table**
| JOIN Type    | Returns                                                | Use Case                       |
| ------------ | ------------------------------------------------------ | ------------------------------ |
| `INNER JOIN` | Only matching rows in **both tables**                  | ✅ Clean, relational reporting  |
| `LEFT JOIN`  | All rows from the left, plus matched right (or `NULL`) | 🧐 Optional or incomplete data |

**💡 Rule of Thumb:**
- Use `INNER JOIN` when you want only complete, valid rows across related tables.
- Use `LEFT JOIN` when you need all rows from one table, even if some relationships are missing.
