# Comprehensive Database Tutorial and Demonstration

This notebook provides a detailed walkthrough of our custom Database class, showcasing its features and functionalities. We'll cover everything from basic table access to advanced querying and analysis.

## Table of Contents
1. [Setup and Initialization](#1-setup-and-initialization)
2. [Exploring Database Structure](#2-exploring-database-structure)
3. [Accessing Tables and Views](#3-accessing-tables-and-views)
4. [Creating and Managing Views](#4-creating-and-managing-views)
5. [Merging Data](#5-merging-data)
6. [Querying the Database](#6-querying-the-database)
7. [Advanced Analysis](#7-advanced-analysis)
8. [Error Handling and Best Practices](#8-error-handling-and-best-practices)
9. [Performance Considerations](#9-performance-considerations)
10. [Conclusion and Next Steps](#10-conclusion-and-next-steps)

## 1. Setup and Initialization

In [None]:
import database_functions as func
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Create a Database instance.
# NOTE(review): Database() is called with no arguments, so the data it exposes is
# presumably located and loaded by database_functions itself — confirm.

db = func.Database()
print("Database initialized successfully.")

## 2. Exploring Database Structure

In [None]:
# Show the name of every table registered on the database.
print("Available tables:")
for name in db.tables:
    print(f"- {name}")

# For each table, report its row count, its columns and the views defined on it.
for name, tbl in db.tables.items():
    print(f"\nTable: {name}")
    row_count = tbl().shape[0]  # calling the table yields the object whose .shape is read
    print(f"  Rows: {row_count}")
    column_names = tbl.get_columns()
    print(f"  Columns: {column_names}")
    view_names = ', '.join(tbl.list_views())
    print(f"  Available views: {view_names}")

## 3. Accessing Tables and Views

In [None]:
# Access a specific table
voter_table = db.voter  # returns a Table object
voter_table_df = db.voter()  # calling the Table returns a dataframe
voter_table_df = db.dataframes['voter']  # alternative lookup, also returns a dataframe

# Return a list of column names in a Table:
#     table_name.get_columns() -> list[column_name: str]
# Printed explicitly: a bare expression in the middle of a cell is evaluated but
# never displayed, so without print() this result would be silently dropped.
print(voter_table.get_columns())

# Return a list of available views in a Table:
#     table_name.list_views() -> list[view_name: str]
print(voter_table.list_views())

# Access a specific view in 2 ways from a Table:
voter_2022 = voter_table.get_view('2022')  # table_name.get_view(view_name)
voter_2022 = voter_table._2022  # prefix '_' when the view name starts with a digit

# Access a specific view in 2 ways from the Database:
voter_2022 = db.get_view('voter', '2022')  # db.get_view(table_name, view_name)
voter_2022 = db.voter._2022  # db.table_name.view_name

# Last expression of the cell, so the notebook renders it without print().
voter_2022.head()

## 4. Creating and Managing Views

In [None]:
# Columns that make up the new view: the county name plus the three
# total_registered_* columns for 2018, 2020 and 2022.
registered_columns = [
    'county_name',
    'total_registered_2018',
    'total_registered_2020',
    'total_registered_2022',
]

# Register the view on the voter table through the database:
#     db.add_view(table_name, view_name, columns)
# Table-level form noted by the original author:
#     db.voter.add_view('registered_voters', [...same columns...])
db.add_view('voter', 'registered_voters', registered_columns)

# Read the view back to verify it was created, and preview it.
registered_voters = db.get_view('voter', 'registered_voters')
registered_voters.head()


## 5. Merging Data

In [None]:
# Merge several views into one result. Documented call shape:
#     db.merge_views([(table_name, view_name), (table_name, view_name), ...], key='column_name')
# No key is passed here, so merge_views falls back to its own default.
views_to_merge = [
    ('demo', 'population'),
    ('voter', 'registered_voters'),
]
merged_views = db.merge_views(views_to_merge)
# Uncomment to preview: merged_views.head()

# Merge every table in the database.
db_merged = db.merge_all()
# Uncomment to preview: db_merged.head()

## 6. Querying the Database

In [None]:
# Query the merged database
#     db.query(conditions, columns)
#     conditions: dict[column_name: function] - each function receives the column
#                 values and returns the comparison result used for filtering
#     columns:    names of the columns to keep in the result

# Both queries below report the same three columns.
report_columns = ['county_name', 'population_january_2023', 'median_household_income_2021']

# Query for counties with population over 1 million
#     conditions: population_january_2023 > 1000000
large_counties = db.query({'population_january_2023': lambda x: x > 1000000},
                          report_columns)
# Printed explicitly: a bare .head() in the middle of a cell is evaluated but
# never displayed, so without print() this preview would be silently dropped.
print(large_counties.head())

# Query for multiple conditions using a dictionary
# NOTE(review): the conditions are presumably combined with AND — confirm in db.query.
conditions = {'population_january_2023': lambda x: x > 1000000,
              'median_household_income_2021': lambda x: x > 90000}
large_high_income_counties = db.query(conditions, report_columns)

# Last expression of the cell, so the notebook renders it without print().
large_high_income_counties.head()


## 7. Advanced Analysis

In [None]:

# Relationship between county population and voter registration.
merged_data = db.merge_views([('demo', 'population'), ('voter', 'registered_voters')])

# Registration rate = registered voters (2022) as a percentage of the
# January 2023 population.
registered = merged_data['total_registered_2022']
population = merged_data['population_january_2023']
merged_data['registration_rate_2022'] = registered / population * 100

# Scatter plot of rate against population, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged_data['population_january_2023'], merged_data['registration_rate_2022'])
ax.set_title('Population vs. Voter Registration Rate (2022)')
ax.set_xlabel('Population')
ax.set_ylabel('Voter Registration Rate (%)')
ax.set_xscale('log')  # Use log scale for population
fig.tight_layout()
plt.show()

# Correlation between population and registration rate (Series.corr default method).
correlation = merged_data['population_january_2023'].corr(merged_data['registration_rate_2022'])
print(f"\nCorrelation between population and voter registration rate: {correlation:.2f}")

## 8. Error Handling and Best Practices

In [None]:
# Demonstrate error handling: each lookup below targets something that does not
# exist (first an unknown table, then an unknown view on a real table). The
# ValueError caught here is reported instead of aborting the cell.
failing_lookups = (
    ('non_existent_table', 'some_view'),
    ('voter', 'non_existent_view'),
)
for bad_table, bad_view in failing_lookups:
    try:
        db.get_view(bad_table, bad_view)
    except ValueError as e:
        print("Error:", str(e))

# Best practice: Check if a view exists before trying to access it
def safe_get_view(db, table_name, view_name):
    """Look up a view without raising when the table or the view is missing.

    Args:
        db: Object exposing a ``tables`` mapping of table name to table object.
        table_name: Key to look up in ``db.tables``.
        view_name: Name that must appear in the table's ``list_views()``.

    Returns:
        Whatever ``table.get_view(view_name)`` returns when both the table and
        the view exist; otherwise ``None``, after printing which one was missing.
    """
    if table_name not in db.tables:
        print(f"Table '{table_name}' not found in the database")
        return None

    target = db.tables[table_name]
    if view_name not in target.list_views():
        print(f"View '{view_name}' not found in table '{table_name}'")
        return None

    return target.get_view(view_name)

# Example usage of safe_get_view: a lookup that is expected to succeed.
safe_view = safe_get_view(db, 'voter', '2022')
if safe_view is not None:
    print("Successfully retrieved the '2022' view from the 'voter' table")
    print(safe_view.head())

# Lookups that miss (unknown table, then unknown view): safe_get_view prints a
# message and returns None instead of raising.
for missing_table, missing_view in (
    ('non_existent_table', 'some_view'),
    ('voter', 'non_existent_view'),
):
    safe_get_view(db, missing_table, missing_view)

## 9. Performance Considerations

In [None]:
import time

def measure_time(func):
    """Call ``func`` with no arguments, print the elapsed time, and return its result.

    Args:
        func: Zero-argument callable to time. Inside this function the name
            shadows the ``func`` alias given to ``database_functions`` at import.

    Returns:
        Whatever ``func()`` returns. If ``func`` raises, the exception
        propagates and no timing line is printed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock, which can be
    # coarse or jump when the system time is adjusted.
    started = time.perf_counter()
    result = func()
    elapsed = time.perf_counter() - started
    print(f"Operation took {elapsed:.4f} seconds")
    return result

# Time three representative operations and keep their results.
print("Time to merge all tables:")
merged_all = measure_time(db.merge_all)

print("\nTime to query large counties:")
large_counties = measure_time(
    lambda: db.query({'population_january_2023': lambda x: x > 1000000})
)

print("\nTime to access a view:")
voter_2022 = measure_time(lambda: db.get_view('voter', '2022'))

# Comparing performance of different operations
print("\nComparing performance of different operations:")
timed_steps = (
    ("1. Accessing a single table:", lambda: db.voter()),
    ("\n2. Accessing a view:", lambda: db.get_view('voter', '2022')),
    ("\n3. Merging two views:",
     lambda: db.merge_views([('demo', 'population'), ('voter', 'registered_voters')])),
)
for heading, step in timed_steps:
    print(heading)
    measure_time(step)

# Kept as the cell's trailing bare expression (not folded into the loop above)
# so a notebook still renders the value it returns, as before.
print("\n4. Querying the merged dataset:")
measure_time(lambda: db.query({'median_household_income_2021': lambda x: x > 100000}))


## 10. Conclusion and Next Steps