Day 9: PostgreSQL with SQLAlchemy

Added dotenv functionality after the initial postgres setup and testing.

In [None]:
# Setup for PostgreSQL with SQLAlchemy using dotenv for secure credentials management
# using local .env file, local PostgreSQL database
# Use git bash terminal to update .env file if needed

from sqlalchemy import create_engine, text
import pandas as pd
import os
from dotenv import load_dotenv

class DatabaseConnection:
    """Professional database connection class"""
    
    def __init__(self, connection_string):
        self.connection_string = connection_string
        self.engine = None
    
    def connect(self):
        """Create database engine"""
        try:
            self.engine = create_engine(self.connection_string)
            print("Database connection established")
            return True
        except Exception as e:
            print(f"Connection error: {e}")
            return False
    
    def execute_query(self, query, params=None):
        """Execute query and return DataFrame"""
        try:
            if params:
                df = pd.read_sql_query(text(query), self.engine, params=params)
            else:
                df = pd.read_sql_query(text(query), self.engine)
            return df
        except Exception as e:
            print(f"Query error: {e}")
            return None
    
    def close(self):
        """Close database connection"""
        if self.engine:
            self.engine.dispose()
            print("Database connection closed")

# Load environment variables
load_dotenv()

# Get database URL from .env file
database_url = os.getenv('DATABASE_URL')

# Create database connection instance
db = DatabaseConnection(database_url)

# Connect and execute queries
if db.connect():
    # Test connection first
    test_query = "SELECT version();"
    test_df = db.execute_query(test_query)
    if test_df is not None:
        print("Connection test successful!")
        print(test_df)
    
    # Your practice query (adjusted for Chinook database)
    query = """
    SELECT 
        il.invoice_line_id,
        i.invoice_id,
        i.invoice_date,
        t.name as track_name,
        a.title as album_title,
        ar.name as artist_name,
        g.name as genre_name,
        il.unit_price,
        il.quantity
    FROM invoice_line il 
    LEFT JOIN invoice i ON il.invoice_id = i.invoice_id
    INNER JOIN track t ON il.track_id = t.track_id
    INNER JOIN album a ON t.album_id = a.album_id
    INNER JOIN artist ar ON a.artist_id = ar.artist_id
    INNER JOIN genre g ON t.genre_id = g.genre_id
    WHERE i.invoice_date > '2010-01-01'
    LIMIT 10;
    """
    
    df = db.execute_query(query)
    if df is not None:
        print("\nQuery Results:")
        print(df.head())
        print(f"\nTotal rows returned: {len(df)}")
    
    # Close connection
    db.close()
else:
    print("Failed to connect to database. Check your .env file and PostgreSQL setup.")

Advanced customer analysis code.

In [10]:
def customer_lifetime_value_analysis():
    """Advanced business analytics query for Chinook music database"""
    query = """
    WITH customer_metrics AS (
        SELECT 
            c.customer_id,
            c.first_name || ' ' || c.last_name as customer_name,
            c.country,
            c.state,
            COUNT(DISTINCT i.invoice_id) as total_orders,
            SUM(i.total) as total_revenue,
            AVG(i.total) as avg_order_value,
            MIN(i.invoice_date) as first_order_date,
            MAX(i.invoice_date) as last_order_date,
            EXTRACT(DAYS FROM (MAX(i.invoice_date) - MIN(i.invoice_date))) as customer_lifespan_days,
            COUNT(DISTINCT il.track_id) as total_tracks_purchased
        FROM customer c
        JOIN invoice i ON c.customer_id = i.customer_id
        JOIN invoice_line il ON i.invoice_id = il.invoice_id
        GROUP BY c.customer_id, c.first_name, c.last_name, c.country, c.state
    ),
    customer_analysis AS (
        SELECT 
            *,
            CASE 
                WHEN customer_lifespan_days > 0 
                THEN total_revenue / NULLIF(customer_lifespan_days, 0) * 365 
                ELSE total_revenue 
            END as estimated_annual_value,
            total_revenue / NULLIF(total_orders, 0) as revenue_per_order,
            total_tracks_purchased / NULLIF(total_orders, 0) as avg_tracks_per_order
        FROM customer_metrics
    )
    SELECT 
        customer_id,
        customer_name,
        country,
        state,
        total_orders,
        total_tracks_purchased,
        ROUND(total_revenue, 2) as total_revenue,
        ROUND(avg_order_value, 2) as avg_order_value,
        ROUND(revenue_per_order, 2) as revenue_per_order,
        ROUND(avg_tracks_per_order, 1) as avg_tracks_per_order,
        first_order_date,
        last_order_date,
        customer_lifespan_days,
        ROUND(estimated_annual_value, 2) as estimated_annual_value,
        CASE 
            WHEN total_revenue > 45 THEN 'High Value'
            WHEN total_revenue > 25 THEN 'Medium Value'
            ELSE 'Low Value'
        END as customer_segment,
        CASE 
            WHEN customer_lifespan_days = 0 THEN 'One-time'
            WHEN customer_lifespan_days <= 90 THEN 'Short-term'
            WHEN customer_lifespan_days <= 365 THEN 'Medium-term'
            ELSE 'Long-term'
        END as customer_lifecycle_stage
    FROM customer_analysis
    ORDER BY total_revenue DESC
    """
    
    # If using a database connection function
    db = get_database_connection()
    if db.connect():
        df = db.execute_query(query)
        db.close()
        return df
    return None

# Execute advanced analytics
clv_analysis = customer_lifetime_value_analysis()
if clv_analysis is not None:
    print("Customer Lifetime Value Analysis (Chinook Music Store):")
    print(clv_analysis.head(10))  # Show top 10 customers
    
    # Segment analysis
    segment_summary = clv_analysis.groupby('customer_segment').agg({
        'customer_id': 'count',
        'total_revenue': ['sum', 'mean'],
        'total_orders': 'mean',
        'total_tracks_purchased': 'mean',
        'estimated_annual_value': 'mean'
    }).round(2)
    
    segment_summary.columns = ['Customer_Count', 'Total_Revenue', 'Avg_Revenue_Per_Customer', 
                              'Avg_Orders', 'Avg_Tracks', 'Avg_Annual_Value']
    
    print("\nCustomer Segment Summary:")
    print(segment_summary)
    
    # Geographic analysis
    country_summary = clv_analysis.groupby('country').agg({
        'customer_id': 'count',
        'total_revenue': ['sum', 'mean'],
        'avg_order_value': 'mean'
    }).round(2)
    
    country_summary.columns = ['Customer_Count', 'Total_Revenue', 'Avg_Revenue_Per_Customer', 'Avg_Order_Value']
    country_summary = country_summary.sort_values('Total_Revenue', ascending=False)
    
    print("\nTop 10 Countries by Revenue:")
    print(country_summary.head(10))
    
    # Lifecycle stage analysis
    lifecycle_summary = clv_analysis.groupby('customer_lifecycle_stage').agg({
        'customer_id': 'count',
        'total_revenue': 'mean',
        'customer_lifespan_days': 'mean'
    }).round(2)
    
    print("\nCustomer Lifecycle Analysis:")
    print(lifecycle_summary)

Database connection established
Database connection closed
Customer Lifetime Value Analysis (Chinook Music Store):
   customer_id          customer_name         country   state  total_orders  \
0            6            Helena Holý  Czech Republic    None             7   
1           26     Richard Cunningham             USA      TX             7   
2           45        Ladislav Kovács         Hungary    None             7   
3           46          Hugh O'Reilly         Ireland  Dublin             7   
4           57             Luis Rojas           Chile    None             7   
5           25         Victor Stevens             USA      WI             7   
6            7          Astrid Gruber         Austria    None             7   
7           37        Fynn Zimmermann         Germany    None             7   
8           24          Frank Ralston             USA      IL             7   
9            5  František Wichterlová  Czech Republic    None             7   

   total_tracks