# Store Transformed Data in SQLite Database

This notebook loads the transformed parquet files and stores them in the SQLite database.


In [4]:
import pandas as pd
import sqlite3
from pathlib import Path
import sys
import importlib

# Make the project's src/ directory importable. The membership check keeps
# re-running this cell from appending a duplicate entry to sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# If db.init_db was already imported in this kernel session, reload it so
# that edits made to the module on disk are picked up by the import below.
if 'db.init_db' in sys.modules:
    importlib.reload(sys.modules['db.init_db'])

from db.init_db import init_database


In [5]:
# Build the SQLite database from the schema file.
# Per the recorded output of this cell, init_database removes any existing
# database file at the target path before creating the tables and indexes.
sqlite_file = "../src/db/database.sqlite"
schema_file = "../src/db/schema.sql"

print("Initializing database...")
init_database(db_path=sqlite_file, schema_path=schema_file)


Initializing database...
Removing existing database at ../src/db/database.sqlite
Reading schema from ../src/db/schema.sql
Creating database at ../src/db/database.sqlite
  Creating 3 tables...
    Executing CREATE TABLE 1: CREATE TABLE properties (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    suburb TEXT NOT NULL,
    po...
     Table created successfully.
    Executing CREATE TABLE 2: CREATE TABLE suburb_quarterly (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    suburb TEXT NOT NULL,
...
     Table created successfully.
    Executing CREATE TABLE 3: CREATE TABLE suburb_analytics (
    suburb TEXT NOT NULL,
    property_type TEXT NOT NULL CHECK(prop...
     Table created successfully.
  Creating indexes...

SUCCESS: Database initialized successfully!
  Created tables: ['properties', 'sqlite_sequence', 'suburb_quarterly', 'suburb_analytics']
  Created indexes: 12

SUCCESS: Database ready at /home/korij/development/web/business/housing_affordability_crisis/final/backend/notebooks/../src/d

In [6]:
data_dir = "../data/transformed_split"

print("Loading parquet files...")

# File stems, listed in the order the frames are unpacked below:
# properties, quarterly stats, suburb analytics - houses first, then units.
parquet_stems = [
    "properties_houses", "properties_units",
    "quarterly_stats_houses", "quarterly_stats_units",
    "suburb_analytics_houses", "suburb_analytics_units",
]

(
    houses_props, units_props,
    houses_quarterly, units_quarterly,
    houses_analytics, units_analytics,
) = (pd.read_parquet(f"{data_dir}/{stem}.parquet") for stem in parquet_stems)

print(f"Loaded all parquet files")

# One summary line per dataset: row counts for houses and units.
for dataset_label, houses_frame, units_frame in (
    ("Properties", houses_props, units_props),
    ("Quarterly", houses_quarterly, units_quarterly),
    ("Analytics", houses_analytics, units_analytics),
):
    print(f"  {dataset_label} - Houses: {len(houses_frame):,}, Units: {len(units_frame):,}")


Loading parquet files...
Loaded all parquet files
  Properties - Houses: 880,568, Units: 1,072,888
  Quarterly - Houses: 47,016, Units: 34,001
  Analytics - Houses: 633, Units: 526


In [7]:
# Properties prep: tag each frame with its property type, add the schema
# columns the source data does not provide, stack both frames, and keep only
# the columns written to the `properties` table.
print("\nPreparing properties data...")

# schema requires these but we don't have them, so they are stored as NULL
placeholder_cols = {'listing_date': None, 'days_on_market': None}

# .assign returns a new frame, so the loaded frames are left untouched
houses_props_db = houses_props.assign(property_type='house', **placeholder_cols)
units_props_db = units_props.assign(property_type='unit', **placeholder_cols)

all_properties = pd.concat([houses_props_db, units_props_db], ignore_index=True)

properties_table_cols = [
    'suburb', 'postcode', 'district', 'property_type',
    'listing_date', 'contract_date', 'settlement_date',
    'sale_price', 'days_on_market', 'contract_to_settlement_days',
]
properties_db = all_properties[properties_table_cols].copy()

print(f"Prepared {len(properties_db):,} property records")
print(f"  Columns: {list(properties_db.columns)}")
print(f"\nSample data:")
print(properties_db.head(3))



Preparing properties data...
Prepared 1,953,456 property records
  Columns: ['suburb', 'postcode', 'district', 'property_type', 'listing_date', 'contract_date', 'settlement_date', 'sale_price', 'days_on_market', 'contract_to_settlement_days']

Sample data:
       suburb postcode district property_type listing_date contract_date  \
0  ANNANGROVE     2156      081         house         None    2004-11-21   
1      MOSMAN     2088      087         house         None    2004-12-21   
2   EDGECLIFF     2027      210         house         None    2004-12-09   

  settlement_date  sale_price days_on_market  contract_to_settlement_days  
0      2005-01-01   1525000.0           None                         41.0  
1      2005-01-01   3950000.0           None                         11.0  
2      2005-01-02   1950000.0           None                         24.0  


In [8]:
# Quarterly stats prep: announce the stage. The helper defined below renames
# the parquet columns to the schema names and is applied to both property types.
print("\nPreparing quarterly stats data...")

def prepare_quarterly_stats(df, property_type):
    """Reshape a quarterly-stats frame into the column layout that is inserted.

    Args:
        df: Quarterly statistics frame using the parquet column names.
            It is not modified; the work is done on a copy.
        property_type: Value written to the `property_type` column of every row.

    Returns:
        A new DataFrame holding the schema columns in a fixed order. Any
        schema column absent from the input, and the three score/change
        columns appended last, are filled with None.
    """
    # parquet column name -> schema column name
    parquet_to_schema = {
        'sale_price_num_sales': 'num_sales',
        'sale_price_median_price_raw': 'median_price',  # raw median
        'sale_price_median_price_smoothed': 'median_price_smoothed',  # smoothed median
        'sale_price_mean_price': 'mean_price',
        'sale_price_min_price': 'min_price',
        'sale_price_max_price': 'max_price',
        'sale_price_price_stddev': 'price_stddev',
        'sale_price_price_p25': 'price_p25',
        'sale_price_price_p75': 'price_p75',
        'contract_to_settlement_days_median_ctsd': 'median_ctsd',
        'contract_to_settlement_days_mean_ctsd': 'mean_ctsd',
        'contract_to_settlement_days_fast_settlements_percentage': 'fast_settlements_percentage',
    }

    # columns kept from the (renamed) input, in output order
    kept_cols = [
        'suburb', 'property_type', 'year', 'quarter', 'quarter_start',
        'num_sales', 'median_price', 'median_price_smoothed', 'mean_price', 'min_price', 'max_price',
        'price_stddev', 'price_p25', 'price_p75', 'median_ctsd', 'mean_ctsd',
        'fast_settlements_percentage', 'liquidity_score'
    ]

    # schema fields we don't have; always appended as None
    unavailable_cols = (
        'contract_to_settlement_score',
        'qoq_price_change_percentage',
        'yoy_price_change_percentage',
    )

    stats = df.copy().rename(columns=parquet_to_schema)
    stats['property_type'] = property_type

    # add whichever kept columns the input did not supply
    absent = [name for name in kept_cols if name not in stats.columns]
    for name in absent:
        stats[name] = None

    stats = stats[kept_cols].copy()

    for name in unavailable_cols:
        stats[name] = None

    return stats

# Run the helper over both property types (houses first), then stack the
# results into a single frame with a fresh 0..n-1 index.
quarterly_parts = [
    prepare_quarterly_stats(source_frame, type_label)
    for source_frame, type_label in (
        (houses_quarterly, 'house'),
        (units_quarterly, 'unit'),
    )
]
houses_quarterly_db, units_quarterly_db = quarterly_parts

all_quarterly = pd.concat(quarterly_parts, ignore_index=True)

print(f"Prepared {len(all_quarterly):,} quarterly records")
print(f"  Columns: {list(all_quarterly.columns)}")
print(f"\nSample data:")
print(all_quarterly.head(3))



Preparing quarterly stats data...
Prepared 81,017 quarterly records
  Columns: ['suburb', 'property_type', 'year', 'quarter', 'quarter_start', 'num_sales', 'median_price', 'median_price_smoothed', 'mean_price', 'min_price', 'max_price', 'price_stddev', 'price_p25', 'price_p75', 'median_ctsd', 'mean_ctsd', 'fast_settlements_percentage', 'liquidity_score', 'contract_to_settlement_score', 'qoq_price_change_percentage', 'yoy_price_change_percentage']

Sample data:
       suburb property_type  year  quarter quarter_start  num_sales  \
0  ABBOTSBURY         house  2005        1    2005-01-01          5   
1  ABBOTSBURY         house  2005        2    2005-04-01          7   
2  ABBOTSBURY         house  2005        3    2005-07-01          8   

   median_price  median_price_smoothed     mean_price  min_price  ...  \
0      540000.0               540000.0  485600.000000   225000.0  ...   
1      471000.0               519300.0  487714.285714   420000.0  ...   
2      523500.0               

In [9]:
# Analytics prep: announce the stage. The helper defined below renames the
# parquet columns to the schema names and is applied to both property types.
print("\nPreparing analytics data...")

def prepare_analytics(df, property_type):
    """Reshape a suburb-analytics frame into the column layout that is inserted.

    Args:
        df: Suburb analytics frame using the parquet column names.
            It is not modified; the work is done on a copy.
        property_type: Value written to the `property_type` column of every row.

    Returns:
        A new DataFrame containing exactly the columns in `schema_cols`, in
        that order. Schema columns that the input does not provide (after
        renaming) are filled with None.
    """
    df = df.copy()
    df['property_type'] = property_type

    # parquet column name -> schema column name.
    # rename() ignores keys that are not present, so unused entries are harmless.
    column_mapping = {
        'current_avg_ctsd': 'current_avg_ctsd',
        'total_sales_last_12m': 'current_num_sales',
        'growth_1yr_pct': 'growth_1yr_percentage',
        'growth_3yr_pct': 'growth_3yr_percentage',
        'growth_5yr_pct': 'growth_5yr_percentage',
        # NOTE(review): the 10yr entries were missing from this map although
        # the schema has growth_10yr_percentage(_smoothed); without them a
        # `growth_10yr_pct*` source column would be dropped and stored as NULL.
        # Confirm the parquet actually uses these names - if not, this is a no-op.
        'growth_10yr_pct': 'growth_10yr_percentage',
        'growth_since_2005_pct': 'growth_since_2005_percentage',
        'current_median_price_smoothed': 'current_median_price_smoothed',
        'growth_1yr_pct_smoothed': 'growth_1yr_percentage_smoothed',
        'growth_3yr_pct_smoothed': 'growth_3yr_percentage_smoothed',
        'growth_5yr_pct_smoothed': 'growth_5yr_percentage_smoothed',
        'growth_10yr_pct_smoothed': 'growth_10yr_percentage_smoothed',
        'growth_since_2005_pct_smoothed': 'growth_since_2005_percentage_smoothed',
        'liquidity_score': 'overall_liquidity_score',
    }

    df = df.rename(columns=column_mapping)

    # Output columns, in order. JSON fields stay as TEXT.
    schema_cols = [
        'suburb', 'property_type', 'last_updated',
        'current_quarter', 'current_median_price', 'current_median_price_smoothed', 'current_avg_ctsd', 'current_num_sales',
        'growth_1yr_percentage', 'growth_3yr_percentage', 'growth_5yr_percentage',
        'growth_10yr_percentage', 'growth_since_2005_percentage',
        'growth_1yr_percentage_smoothed', 'growth_3yr_percentage_smoothed', 'growth_5yr_percentage_smoothed',
        'growth_10yr_percentage_smoothed', 'growth_since_2005_percentage_smoothed',
        'cagr_5yr', 'cagr_10yr', 'cagr_5yr_smoothed', 'cagr_10yr_smoothed',
        'volatility_score', 'max_drawdown_pct', 'recovery_quarters',
        'avg_quarterly_volume', 'overall_liquidity_score', 'market_health_score',
        'q1_avg_premium_percentage', 'q2_avg_premium_percentage',
        'q3_avg_premium_percentage', 'q4_avg_premium_percentage', 'best_quarter_to_sell',
        'forecast_q1_price', 'forecast_q1_lower', 'forecast_q1_upper',
        'forecast_q2_price', 'forecast_q2_lower', 'forecast_q2_upper',
        'price_rank', 'growth_rank', 'speed_rank',
        'total_quarters_with_data', 'data_completeness_percentage',
        'price_quarterly', 'ctsd_quarterly'
    ]

    # Any schema column the source did not supply is added as None.
    for col in schema_cols:
        if col not in df.columns:
            df[col] = None

    # Drop everything that is not a schema column and fix the column order.
    df = df[schema_cols].copy()

    return df

# Run the helper over both property types (houses first), then stack the
# results into a single frame with a fresh 0..n-1 index.
analytics_parts = [
    prepare_analytics(source_frame, type_label)
    for source_frame, type_label in (
        (houses_analytics, 'house'),
        (units_analytics, 'unit'),
    )
]
houses_analytics_db, units_analytics_db = analytics_parts

all_analytics = pd.concat(analytics_parts, ignore_index=True)

print(f"Prepared {len(all_analytics):,} analytics records")
print(f"  Columns: {list(all_analytics.columns)}")
print(f"\nSample data:")
print(all_analytics.head(3))



Preparing analytics data...
Prepared 1,159 analytics records
  Columns: ['suburb', 'property_type', 'last_updated', 'current_quarter', 'current_median_price', 'current_median_price_smoothed', 'current_avg_ctsd', 'current_num_sales', 'growth_1yr_percentage', 'growth_3yr_percentage', 'growth_5yr_percentage', 'growth_10yr_percentage', 'growth_since_2005_percentage', 'growth_1yr_percentage_smoothed', 'growth_3yr_percentage_smoothed', 'growth_5yr_percentage_smoothed', 'growth_10yr_percentage_smoothed', 'growth_since_2005_percentage_smoothed', 'cagr_5yr', 'cagr_10yr', 'cagr_5yr_smoothed', 'cagr_10yr_smoothed', 'volatility_score', 'max_drawdown_pct', 'recovery_quarters', 'avg_quarterly_volume', 'overall_liquidity_score', 'market_health_score', 'q1_avg_premium_percentage', 'q2_avg_premium_percentage', 'q3_avg_premium_percentage', 'q4_avg_premium_percentage', 'best_quarter_to_sell', 'forecast_q1_price', 'forecast_q1_lower', 'forecast_q1_upper', 'forecast_q2_price', 'forecast_q2_lower', 'foreca

In [10]:
# Insert the three prepared frames into the database, then verify row counts.
# Rows are appended, so this cell expects freshly initialised (empty) tables;
# re-running it without re-initialising the database will trip the row-count
# check at the end.
db_path = "../src/db/database.sqlite"
conn = sqlite3.connect(db_path)

# try/finally guarantees the connection is closed even if an insert or a
# verification query raises part-way through.
try:
    print("="*80)
    print("INSERTING DATA INTO DATABASE")
    print("="*80)

    # pandas' default insert path (executemany, one row per statement) is used
    # rather than method='multi'. A multi-row INSERT binds columns x rows
    # parameters in ONE statement (e.g. 10 x 10,000 = 100,000 for properties),
    # which exceeds SQLite's default SQLITE_MAX_VARIABLE_NUMBER on many builds
    # and fails with "too many SQL variables". chunksize still bounds how many
    # rows are materialised per batch.
    print("\n1. Inserting properties...")
    properties_db.to_sql('properties', conn, if_exists='append', index=False, chunksize=10000)
    print(f"    Inserted {len(properties_db):,} property records")

    print("\n2. Inserting quarterly stats...")
    all_quarterly.to_sql('suburb_quarterly', conn, if_exists='append', index=False, chunksize=10000)
    print(f"    Inserted {len(all_quarterly):,} quarterly records")

    print("\n3. Inserting analytics...")
    all_analytics.to_sql('suburb_analytics', conn, if_exists='append', index=False, chunksize=1000)
    print(f"    Inserted {len(all_analytics):,} analytics records")

    conn.commit()
    print("\n✓ All data inserted!")

    # verify counts
    print("\n" + "="*80)
    print("VERIFICATION")
    print("="*80)

    # (label used in the printed report, table name, rows we expect to find)
    verification_targets = [
        ("Properties", "properties", len(properties_db)),
        ("Quarterly stats", "suburb_quarterly", len(all_quarterly)),
        ("Analytics", "suburb_analytics", len(all_analytics)),
    ]

    cursor = conn.cursor()

    # Table names come from the fixed list above, never from user input,
    # so interpolating them into the SQL text is safe.
    db_row_counts = {}
    for report_label, table_name, _expected in verification_targets:
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        db_row_counts[table_name] = cursor.fetchone()[0]
        print(f"{report_label} in DB: {db_row_counts[table_name]:,}")

    # Keep the individual count variables available for later cells.
    props_count = db_row_counts["properties"]
    quarterly_count = db_row_counts["suburb_quarterly"]
    analytics_count = db_row_counts["suburb_analytics"]

    for report_label, table_name, _expected in verification_targets:
        cursor.execute(f"SELECT property_type, COUNT(*) FROM {table_name} GROUP BY property_type")
        print(f"\n{report_label} by type:")
        for row in cursor.fetchall():
            print(f"  {row[0]}: {row[1]:,}")

    # Actually verify: every table must hold exactly the rows just prepared.
    # Checked after the report so the diagnostic output is visible on failure.
    count_mismatches = {
        table_name: {"in_db": db_row_counts[table_name], "expected": expected}
        for _label, table_name, expected in verification_targets
        if db_row_counts[table_name] != expected
    }
    if count_mismatches:
        raise RuntimeError(
            f"Row count mismatch after insert (was the database re-initialised first?): {count_mismatches}"
        )
finally:
    conn.close()

print("\n✓ Database populated and verified!")


INSERTING DATA INTO DATABASE

1. Inserting properties...
    Inserted 1,953,456 property records

2. Inserting quarterly stats...
    Inserted 81,017 quarterly records

3. Inserting analytics...
    Inserted 1,159 analytics records

✓ All data inserted!

VERIFICATION
Properties in DB: 1,953,456
Quarterly stats in DB: 81,017
Analytics in DB: 1,159

Properties by type:
  house: 880,568
  unit: 1,072,888

Quarterly stats by type:
  house: 47,016
  unit: 34,001

Analytics by type:
  house: 633
  unit: 526

✓ Database populated and verified!
