# Rental Marketplace Analytics - User Engagement Metrics

This notebook analyzes key user engagement metrics for the rental marketplace platform, including:

1. **Total Bookings per User**: Count of rentals booked per user every week
2. **Average Booking Duration**: Mean duration of confirmed stays over time
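3. **Repeat Customer Rate**: Percentage of users who book more than once within a rolling 30-day period
4. **User Engagement Overview**: Total, active, and new users over time, plus the weekly activity rate (active users as a share of total users)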

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
from pathlib import Path
from datetime import datetime, timedelta

# The parent of the current working directory is taken as the project root and
# appended to sys.path so that the `src` package below can be imported.
project_root = Path.cwd().parent
sys.path.append(os.fspath(project_root))

# Project settings (must come after the sys.path tweak above)
from src.config.settings import REDSHIFT_CONFIG

# Shared look for every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [None]:
# Function to connect to Redshift
def connect_to_redshift():
    """Open a psycopg2 connection to Redshift using REDSHIFT_CONFIG.

    Returns:
        The open connection on success. On any failure the error is printed
        (not raised) and None is returned.
    """
    # psycopg2 keyword -> key under which the value is stored in REDSHIFT_CONFIG
    keyword_to_setting = {
        "host": "host",
        "port": "port",
        "dbname": "database",
        "user": "user",
        "password": "password",
    }
    try:
        # Config lookups stay inside the try so a missing key is reported the
        # same way as a failed connection attempt.
        connect_kwargs = {
            keyword: REDSHIFT_CONFIG[setting]
            for keyword, setting in keyword_to_setting.items()
        }
        connection = psycopg2.connect(**connect_kwargs)
        print("Connected to Redshift successfully")
        return connection
    except Exception as error:
        print(f"Failed to connect to Redshift: {error}")
        return None

# Function to execute a query and return results as a DataFrame
def execute_query(conn, query, params=None, parse_dates=None):
    """Execute a SQL query and return results as a DataFrame.

    Args:
        conn: Open DB-API connection to run the query on.
        query: SQL text to execute.
        params: Optional bound parameters, forwarded to pandas, so values do
            not have to be formatted into the SQL string. The placeholder
            syntax depends on the database driver.
        parse_dates: Optional column names (or mapping), forwarded to pandas,
            naming the columns to convert to datetimes.

    Returns:
        DataFrame holding the query result. On any failure the error is
        printed and an empty DataFrame (no rows, no columns) is returned, so
        callers should check `.empty` before indexing columns.
    """
    try:
        return pd.read_sql_query(query, conn, params=params, parse_dates=parse_dates)
    except Exception as e:
        print(f"Error executing query: {e}")
        return pd.DataFrame()

In [None]:
# Connect to Redshift
conn = connect_to_redshift()

if conn is None:
    print("Using sample data for demonstration purposes")
    # Synthetic stand-in data so the notebook runs end-to-end without a
    # Redshift connection. A seeded generator keeps the sample figures
    # identical across Restart & Run All; change SAMPLE_SEED for another draw.
    SAMPLE_SEED = 42
    rng = np.random.default_rng(SAMPLE_SEED)

    # Sample data for user engagement metrics (one row per week)
    weeks = pd.date_range(start='2023-01-01', periods=12, freq='W')
    n_weeks = len(weeks)
    sample_total_users = rng.integers(500, 1500, n_weeks)
    user_engagement = pd.DataFrame({
        'week_start_date': weeks,
        'total_users': sample_total_users,
        # Capped at total_users: otherwise the activity rate computed later in
        # the notebook (active / total) could exceed 100%.
        'active_users': np.minimum(rng.integers(300, 1000, n_weeks), sample_total_users),
        'new_users': rng.integers(50, 200, n_weeks),
        'total_bookings': rng.integers(200, 800, n_weeks),
        'avg_bookings_per_user': rng.uniform(1.0, 3.0, n_weeks),
        'avg_booking_duration': rng.uniform(2.0, 7.0, n_weeks),
        'repeat_customer_rate': rng.uniform(10, 40, n_weeks),
    })

    # Sample data for user booking history (one row per user per week)
    sample_cities = ['New York', 'Los Angeles', 'Chicago', 'Miami', 'San Francisco']
    user_booking_history = [
        {
            'user_id': user_id,
            'week_start_date': week,
            'total_bookings': int(rng.integers(0, 5)),
            'total_spend': float(rng.uniform(0, 2000)),
            'avg_booking_duration': float(rng.uniform(1, 10)),
            'favorite_city': str(rng.choice(sample_cities)),
            # True with probability 0.3
            'is_repeat_customer': bool(rng.random() < 0.3),
        }
        for week in weeks
        for user_id in range(1, 21)  # 20 sample users
    ]

    user_booking_history_df = pd.DataFrame(user_booking_history)

## 1. Total Bookings per User

Analyzing the number of rentals booked per user every week.

In [None]:
if conn is not None:
    # Query user engagement metrics from Redshift
    query = """
    SELECT 
        week_start_date,
        total_users,
        active_users,
        new_users,
        total_bookings,
        avg_bookings_per_user,
        avg_booking_duration,
        repeat_customer_rate
    FROM 
        pres_user_engagement_weekly
    ORDER BY 
        week_start_date
    """
    user_engagement = execute_query(conn, query)

    # execute_query() reports a failure by printing it and returning an empty
    # frame. Stop here with a clear message rather than letting the plotting
    # cells fail later on a missing column or an out-of-range .iloc[0].
    if user_engagement.empty:
        raise RuntimeError(
            "No rows returned from pres_user_engagement_weekly; "
            "check the query error printed above (if any)."
        )

# Display the data
user_engagement.head()

In [None]:
# Weekly trend of average bookings per user
weekly_avg_bookings = user_engagement['avg_bookings_per_user']

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(user_engagement['week_start_date'], weekly_avg_bookings, marker='o', linewidth=2, color='#1f77b4')

# Dashed reference line at the mean over all weeks
avg_bookings_per_user = weekly_avg_bookings.mean()
ax.axhline(y=avg_bookings_per_user, color='r', linestyle='--', label=f'Average: {avg_bookings_per_user:.2f}')

ax.set_title('Average Bookings per User (Weekly)', fontsize=16)
ax.set_xlabel('Week', fontsize=14)
ax.set_ylabel('Average Bookings per User', fontsize=14)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

# Summary statistics; the trend compares the last week against the first
first_week_bookings = weekly_avg_bookings.iloc[0]
max_bookings = weekly_avg_bookings.max()
min_bookings = weekly_avg_bookings.min()
bookings_trend = weekly_avg_bookings.iloc[-1] - first_week_bookings
bookings_trend_pct = bookings_trend / first_week_bookings * 100

print(f"Average bookings per user: {avg_bookings_per_user:.2f}")
print(f"Maximum average bookings: {max_bookings:.2f}")
print(f"Minimum average bookings: {min_bookings:.2f}")
print(f"Trend (first to last week): {bookings_trend:.2f} ({bookings_trend_pct:.1f}%)")

In [None]:
if conn is not None:
    # Query user booking history from Redshift
    query = """
    SELECT 
        user_id,
        week_start_date,
        total_bookings,
        total_spend,
        avg_booking_duration,
        favorite_city,
        is_repeat_customer
    FROM 
        pres_user_booking_history
    ORDER BY 
        week_start_date, user_id
    """
    user_booking_history_df = execute_query(conn, query)

    # execute_query() returns an empty, column-less frame on failure; fail
    # here with a clear message instead of a KeyError a few lines below.
    if user_booking_history_df.empty:
        raise RuntimeError(
            "No rows returned from pres_user_booking_history; "
            "check the query error printed above (if any)."
        )

# Get the most recent week's data.
# Normalise to pandas timestamps first: the sample data already holds
# Timestamps, but a database driver may hand back plain dates or strings, and
# the plot title below needs a value that supports .strftime().
week_starts = pd.to_datetime(user_booking_history_df['week_start_date'])
latest_week = week_starts.max()
latest_user_bookings = user_booking_history_df[week_starts == latest_week]

# Distribution of bookings per user.
# Booking counts are whole numbers, so use one unit-wide bar per count
# (discrete=True) instead of 10 fractional-width bins that leave gaps.
plt.figure(figsize=(12, 6))
sns.histplot(latest_user_bookings['total_bookings'], discrete=True, kde=True, color='#2ca02c')
plt.title(f'Distribution of Bookings per User (Week of {latest_week.strftime("%Y-%m-%d")})', fontsize=16)
plt.xlabel('Number of Bookings', fontsize=14)
plt.ylabel('Number of Users', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Calculate statistics: how many users made 0, 1, 2, ... bookings that week
booking_counts = latest_user_bookings['total_bookings'].value_counts().sort_index()
users_in_latest_week = len(latest_user_bookings)
print("Number of users by booking count:")
for bookings, count in booking_counts.items():
    print(f"  {bookings} booking(s): {count} users ({count/users_in_latest_week*100:.1f}%)")

## 2. Average Booking Duration

Analyzing the mean duration of confirmed stays over time.

In [None]:
# Weekly trend of the average booking duration
weekly_duration = user_engagement['avg_booking_duration']

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(user_engagement['week_start_date'], weekly_duration, marker='o', linewidth=2, color='#d62728')

# Mean across all weeks, drawn as a dashed reference line
avg_duration = weekly_duration.mean()
ax.axhline(y=avg_duration, color='r', linestyle='--', label=f'Average: {avg_duration:.2f} days')

ax.set_title('Average Booking Duration (Weekly)', fontsize=16)
ax.set_xlabel('Week', fontsize=14)
ax.set_ylabel('Average Duration (days)', fontsize=14)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

# Summary statistics; the trend compares the last week against the first
first_week_duration = weekly_duration.iloc[0]
max_duration = weekly_duration.max()
min_duration = weekly_duration.min()
duration_trend = weekly_duration.iloc[-1] - first_week_duration
duration_trend_pct = duration_trend / first_week_duration * 100

print(f"Average booking duration: {avg_duration:.2f} days")
print(f"Maximum average duration: {max_duration:.2f} days")
print(f"Minimum average duration: {min_duration:.2f} days")
print(f"Trend (first to last week): {duration_trend:.2f} days ({duration_trend_pct:.1f}%)")

In [None]:
# Booking duration vs. total spend for the latest week; marker area scales
# with the number of bookings, so users without bookings have zero-size markers.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(
    latest_user_bookings['avg_booking_duration'],
    latest_user_bookings['total_spend'],
    s=latest_user_bookings['total_bookings'] * 50,
    alpha=0.6,
    c='#ff7f0e',
)

# Users with at least one booking: these get a label and enter the correlation
bookings_df = latest_user_bookings[latest_user_bookings['total_bookings'] > 0]

for _, booked_user in bookings_df.iterrows():
    ax.annotate(
        booked_user['user_id'],
        (booked_user['avg_booking_duration'], booked_user['total_spend']),
        xytext=(5, 5),
        textcoords='offset points',
    )

ax.set_title('Relationship Between Booking Duration and Total Spend', fontsize=16)
ax.set_xlabel('Average Booking Duration (days)', fontsize=14)
ax.set_ylabel('Total Spend ($)', fontsize=14)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Correlation between duration and spend among users with bookings
correlation = bookings_df['avg_booking_duration'].corr(bookings_df['total_spend'])
print(f"Correlation between booking duration and total spend: {correlation:.2f}")

## 3. Repeat Customer Rate

Measuring how many users book more than once within a rolling 30-day period.

In [None]:
# Weekly trend of the repeat customer rate
weekly_repeat_rate = user_engagement['repeat_customer_rate']

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(user_engagement['week_start_date'], weekly_repeat_rate, marker='o', linewidth=2, color='#9467bd')

# Mean across all weeks, drawn as a dashed reference line
avg_repeat_rate = weekly_repeat_rate.mean()
ax.axhline(y=avg_repeat_rate, color='r', linestyle='--', label=f'Average: {avg_repeat_rate:.2f}%')

ax.set_title('Repeat Customer Rate (Weekly)', fontsize=16)
ax.set_xlabel('Week', fontsize=14)
ax.set_ylabel('Repeat Customer Rate (%)', fontsize=14)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

# Summary statistics; the trend compares the last week against the first
first_week_repeat_rate = weekly_repeat_rate.iloc[0]
max_repeat_rate = weekly_repeat_rate.max()
min_repeat_rate = weekly_repeat_rate.min()
repeat_rate_trend = weekly_repeat_rate.iloc[-1] - first_week_repeat_rate
repeat_rate_trend_pct = repeat_rate_trend / first_week_repeat_rate * 100

print(f"Average repeat customer rate: {avg_repeat_rate:.2f}%")
print(f"Maximum repeat rate: {max_repeat_rate:.2f}%")
print(f"Minimum repeat rate: {min_repeat_rate:.2f}%")
print(f"Trend (first to last week): {repeat_rate_trend:.2f}% ({repeat_rate_trend_pct:.1f}%)")

In [None]:
# Analyze repeat customers vs. one-time customers (latest week only)


def _relative_difference_label(repeat_value, one_time_value, more_word, less_word):
    """Describe repeat vs. one-time as e.g. '12.3% higher' or '4.0% lower'.

    Returns 'n/a' when the comparison is undefined: either group is empty
    (its mean is NaN) or the one-time baseline is zero.
    """
    if pd.isna(repeat_value) or pd.isna(one_time_value) or one_time_value == 0:
        return "n/a"
    pct = (repeat_value / one_time_value - 1) * 100
    return f"{abs(pct):.1f}% {more_word if pct >= 0 else less_word}"


def _share_of_latest_week(group):
    """Percentage of the latest week's rows that fall in `group` (NaN if there are none)."""
    total = len(latest_user_bookings)
    return len(group) / total * 100 if total else float('nan')


# Explicit comparisons (rather than negating the flag) so that rows with a
# missing flag end up in neither group.
repeat_flag = latest_user_bookings['is_repeat_customer']
repeat_customers = latest_user_bookings[repeat_flag.eq(True)]
one_time_customers = latest_user_bookings[repeat_flag.eq(False)]

# Compare spending patterns
repeat_avg_spend = repeat_customers['total_spend'].mean()
one_time_avg_spend = one_time_customers['total_spend'].mean()

# Compare booking duration
repeat_avg_duration = repeat_customers['avg_booking_duration'].mean()
one_time_avg_duration = one_time_customers['avg_booking_duration'].mean()

# Plot comparison
plt.figure(figsize=(12, 6))

# Spending comparison
plt.subplot(1, 2, 1)
plt.bar(['One-time Customers', 'Repeat Customers'], [one_time_avg_spend, repeat_avg_spend], color=['#8c564b', '#e377c2'])
plt.title('Average Spend', fontsize=14)
plt.ylabel('Average Spend (USD)', fontsize=12)
plt.grid(True, alpha=0.3)

# Duration comparison
plt.subplot(1, 2, 2)
plt.bar(['One-time Customers', 'Repeat Customers'], [one_time_avg_duration, repeat_avg_duration], color=['#8c564b', '#e377c2'])
plt.title('Average Booking Duration', fontsize=14)
plt.ylabel('Average Duration (days)', fontsize=12)
plt.grid(True, alpha=0.3)

plt.suptitle('Comparison: Repeat vs. One-time Customers', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Print statistics. The direction word follows the sign of the difference, so
# a negative gap is reported as "lower"/"shorter" rather than "-x% higher".
spend_label = _relative_difference_label(repeat_avg_spend, one_time_avg_spend, "higher", "lower")
duration_label = _relative_difference_label(repeat_avg_duration, one_time_avg_duration, "longer", "shorter")

print(f"Repeat customers: {len(repeat_customers)} ({_share_of_latest_week(repeat_customers):.1f}%)")
print(f"One-time customers: {len(one_time_customers)} ({_share_of_latest_week(one_time_customers):.1f}%)")
print("\nAverage spend:")
print(f"  Repeat customers: ${repeat_avg_spend:.2f}")
print(f"  One-time customers: ${one_time_avg_spend:.2f}")
print(f"  Difference: ${repeat_avg_spend - one_time_avg_spend:.2f} ({spend_label})")
print("\nAverage booking duration:")
print(f"  Repeat customers: {repeat_avg_duration:.2f} days")
print(f"  One-time customers: {one_time_avg_duration:.2f} days")
print(f"  Difference: {repeat_avg_duration - one_time_avg_duration:.2f} days ({duration_label})")

## 4. User Engagement Overview

Analyzing overall user engagement trends.

In [None]:
# User counts over time: one line per metric, drawn in this order
fig, ax = plt.subplots(figsize=(14, 8))

user_count_series = (
    ('total_users', 'o', 'Total Users'),
    ('active_users', 's', 'Active Users'),
    ('new_users', '^', 'New Users'),
)
for column, marker_shape, legend_label in user_count_series:
    ax.plot(user_engagement['week_start_date'], user_engagement[column],
            marker=marker_shape, linewidth=2, label=legend_label)

ax.set_title('User Engagement Metrics Over Time', fontsize=16)
ax.set_xlabel('Week', fontsize=14)
ax.set_ylabel('Number of Users', fontsize=14)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

# Activity rate = active users as a percentage of total users, stored as a
# new column on user_engagement
user_engagement['activity_rate'] = (
    user_engagement['active_users'].div(user_engagement['total_users']).mul(100)
)

# Plot activity rate
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(user_engagement['week_start_date'], user_engagement['activity_rate'], marker='o', linewidth=2, color='#7f7f7f')

ax.set_title('User Activity Rate Over Time', fontsize=16)
ax.set_xlabel('Week', fontsize=14)
ax.set_ylabel('Activity Rate (%)', fontsize=14)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Mean activity rate across all weeks
avg_activity_rate = user_engagement['activity_rate'].mean()
print(f"Average user activity rate: {avg_activity_rate:.2f}%")

## Summary of Findings

Based on the analysis of user engagement metrics, we can draw the following conclusions:

1. **Total Bookings per User**: [Summary of bookings per user trends and insights]

2. **Average Booking Duration**: [Summary of booking duration trends and insights]

3. **Repeat Customer Rate**: [Summary of repeat customer rate trends and insights]

4. **User Engagement Overview**: [Summary of overall user engagement trends and insights]

### Recommendations

Based on these findings, we recommend the following actions to improve user engagement:

1. [Recommendation 1]
2. [Recommendation 2]
3. [Recommendation 3]

In [None]:
# Close the Redshift connection if it exists
if conn is not None:
    conn.close()
    # Drop the handle: if an earlier cell is re-run after this one, its
    # `if conn is not None` branch is skipped and the data already loaded is
    # kept, instead of a query against a closed connection replacing it with
    # an empty frame.
    conn = None
    print("Redshift connection closed")