In [None]:
import pandas as pd     
import os
import sys
sys.path.append('../')
import dotenv
dotenv.load_dotenv()

# load from .env file




from include.db.connections import get_master_db_connection
from include.transform.client_data_transform import generate_address_id

# Data Ingestion Pipeline

The purpose of this pipeline is to allow for uploads of additional pool addresses to the Pool database. 

Steps are as follows:

1. Get the given CSV file.
2. Assert the required column set.
3. NaN checks on required columns, rejecting failing rows.
4. Validity check on the postal code column.
5. Lat/lon validation (detect and remove massive outliers).
6. Internal de-duplication of properties.
7. De-duplication of properties against the db.
8. Properties upload.
9. Pool objects upload.

## Step 1,2: CSV Ingestion Column Set Check

In [None]:
def ingest_csv_data(path):
    """Read an upload CSV and return it restricted to the pipeline's columns.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The required columns followed by ``address_id`` and ``pool_type``;
        every other column in the file is discarded.

    Raises
    ------
    ValueError
        If any required column is absent from the file.
    """
    required_columns = [ 'address_number', 'municipality', 'lat',
       'lon', 'postal_code', 'street_name', 'province_state', 'country',
       'geom']

    additional_columns = ['address_id', 'pool_type']

    df = pd.read_csv(path)

    def check_required_columns(df, required_columns):
        """Raise ValueError naming the missing required columns; else return True."""
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")
        return True

    def create_additional_column_stubs(df):
        """Add whichever of 'address_id' / 'pool_type' is missing from df."""
        if 'address_id' not in df.columns:
            # NOTE(review): generate_address_id() is called once with no
            # arguments and its result is assigned to the whole column. If it
            # returns a single id, every row shares it - confirm the intent.
            df['address_id'] = generate_address_id()

        if 'pool_type' not in df.columns:
            df['pool_type'] = None
        return df

    # Raises on its own when a required column is missing, so no second
    # "has_required" check is needed afterwards.
    check_required_columns(df, required_columns)

    # Always run the stub step: it tests each additional column individually,
    # so a file that has only one of them still ends up with both. It takes
    # the frame only (the previous two-argument call raised TypeError).
    df = create_additional_column_stubs(df)

    all_needed_columns = required_columns + additional_columns
    return df[all_needed_columns]


In [16]:
# Columns every upload file must provide.
required_columns = [
    'address_number',
    'municipality',
    'lat',
    'lon',
    'postal_code',
    'street_name',
    'province_state',
    'country',
    'geom',
]

# Columns that are stubbed in when the file does not carry them.
additional_columns = [
    'address_id',
    'pool_type',
]

# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
df = pd.read_csv('/home/james/PDS/client_data_feeds/realestate/moncton_data_prep/moncton_upload.csv')

def check_required_columns(df, required_columns):
    """Confirm that df carries every name in required_columns.

    Returns True when nothing is missing; otherwise raises ValueError whose
    message lists the absent columns in the order they were requested.
    """
    available = df.columns
    missing_columns = []
    for name in required_columns:
        if name not in available:
            missing_columns.append(name)

    if not missing_columns:
        return True
    raise ValueError(f"Missing required columns: {missing_columns}")

def has_additional_columns(df, additional_columns):
    """Return the names from additional_columns that exist in df, in the given order.

    The result is an empty list (falsy) when none of them are present.
    """
    found = []
    for name in additional_columns:
        if name in df.columns:
            found.append(name)
    return found


def create_additional_column_stubs(df):
    """Add 'address_id' and/or 'pool_type' to df when they are absent.

    The frame is modified in place and also returned. An existing column is
    never overwritten. A missing 'address_id' is filled with the result of
    generate_address_id(); a missing 'pool_type' is filled with None.
    """
    absent = [name for name in ('address_id', 'pool_type') if name not in df.columns]

    if 'address_id' in absent:
        # NOTE(review): one generate_address_id() result is assigned to the
        # whole column - confirm that is intended.
        df['address_id'] = generate_address_id()

    if 'pool_type' in absent:
        df['pool_type'] = None

    return df


# check_required_columns raises ValueError itself when a column is missing;
# the explicit check below is kept only as a defensive second guard.
has_required = check_required_columns(df, required_columns)
present_additional = has_additional_columns(df, additional_columns)

if not has_required:
    raise ValueError("DataFrame does not have all required columns.")

# Stub whenever ANY additional column is missing (not only when all are), and
# pass the frame alone: create_additional_column_stubs takes one argument and
# decides per column what to add.
if len(present_additional) < len(additional_columns):
    df = create_additional_column_stubs(df)

print("DataFrame is ready for processing.")


DataFrame is ready for processing.


In [17]:
# Keep only the pipeline's columns: required ones first, then the additional ones.
all_needed_columns = [*required_columns, *additional_columns]
df = df.loc[:, all_needed_columns]


## Step 3: NaN Checks

In [None]:

import re

def cleanup_dataframe(df):
    """Drop rows that fail the required-value or postal-code checks.

    A row is removed when any column named in the module-level
    ``required_columns`` list is null or an empty string, or when its
    ``postal_code`` is not a string matching the Canadian postal code pattern
    (surrounding whitespace is ignored; one space or hyphen may separate the
    two halves).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in ``required_columns``, including
        ``postal_code``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that passed both checks.
    """
    # NaN checks: collect index labels of rows missing any required value.
    indexes_with_missing_values = set()

    for col in required_columns:
        missing_in_col = df[df[col].isnull() | (df[col] == '')].index
        indexes_with_missing_values.update(missing_in_col)

    # drop rows with missing values in required columns
    df_cleaned = df.drop(indexes_with_missing_values)
    CA_POSTAL_RE = re.compile(
        r"\b[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ -]?\d[ABCEGHJ-NPRSTV-Z]\d\b",
        re.IGNORECASE,
    )

    def is_valid_canadian_postal_code(postal_code):
        # Anything that is not a string (None, NaN, a numeric code parsed as a
        # number, ...) is rejected instead of crashing on .strip().
        if not isinstance(postal_code, str):
            return False
        return bool(CA_POSTAL_RE.fullmatch(postal_code.strip()))

    # astype(bool) keeps the mask boolean even when no rows are left, where
    # .apply would otherwise hand back an object-dtype Series.
    is_valid = df_cleaned['postal_code'].apply(is_valid_canadian_postal_code).astype(bool)
    invalid_postal_indexes = df_cleaned.index[~is_valid.to_numpy()]

    # drop rows with invalid postal codes
    df_clean_postal = df_cleaned.drop(invalid_postal_indexes)

    return df_clean_postal

In [18]:
# Gather the index labels of every row where a required column is null, None, or ''.
indexes_with_missing_values = set()

for col in required_columns:
    column_values = df[col]
    is_missing = column_values.isnull() | (column_values == '')
    missing_in_col = df[is_missing].index
    indexes_with_missing_values |= set(missing_in_col)

# Remove those rows; df itself is left untouched.
df_cleaned = df.drop(indexes_with_missing_values)

print(f"Dropped {len(indexes_with_missing_values)} rows with missing required values.")


Dropped 0 rows with missing required values.


## Step 4: Validity check on Postal Codes

In [20]:
import re

# Canadian postal code: letter-digit-letter, an optional single space or
# hyphen, then digit-letter-digit. The character classes leave out the
# letters the pattern does not allow in each position; matching is
# case-insensitive.
CA_POSTAL_RE = re.compile(
    r"\b[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ -]?\d[ABCEGHJ-NPRSTV-Z]\d\b",
    re.IGNORECASE,
)

def is_valid_canadian_postal_code(postal_code):
    """Return True when postal_code is a string that fully matches CA_POSTAL_RE.

    Leading/trailing whitespace is ignored. Any non-string value (None, NaN,
    pd.NA, or a code that was parsed as a number) returns False rather than
    raising on ``.strip()``.
    """
    if not isinstance(postal_code, str):
        return False
    return bool(CA_POSTAL_RE.fullmatch(postal_code.strip()))

# Flag each remaining row by whether its postal code passes validation.
is_valid_postal = df_cleaned['postal_code'].apply(is_valid_canadian_postal_code)
invalid_postal_indexes = df_cleaned[~is_valid_postal].index

# Keep only the rows whose postal code is valid.
df_clean_postal = df_cleaned.drop(invalid_postal_indexes)

# Ensure that all lat/lon values are in the same hemisphere

In [21]:
def check_lat_lon_in_same_hemisphere(df):
    """Tell whether the coordinates stay on one side of the equator and prime meridian.

    Returns False as soon as the 'lat' column, or failing that the 'lon'
    column, contains both a strictly positive and a strictly negative value;
    returns True otherwise. Zeros and missing values count for neither sign.
    """
    # Look both columns up first so a missing column fails before any check runs.
    coordinate_columns = (df['lat'], df['lon'])

    for values in coordinate_columns:
        has_positive = (values > 0).any()
        if has_positive and (values < 0).any():
            return False

    return True

# Run the hemisphere check on the postal-code-validated rows.
has_consistent_hemisphere = check_lat_lon_in_same_hemisphere(df_clean_postal)

# Abort the whole run (rather than dropping rows) when the signs are mixed.
if not has_consistent_hemisphere:
    raise ValueError("Latitude and Longitude values are not in the same hemisphere.")




## Internal Duplicate Removal 

In [22]:
# Standardize the text fields used as de-duplication keys: postal codes become
# upper case with no separators, street_name and municipality are stripped and
# title-cased.
def standardize_text_fields(df):
    """Normalise 'postal_code', 'street_name' and 'municipality' for matching.

    * postal_code: upper-cased, with all whitespace and hyphens removed, so
      'e1a 7w9', 'E1A-7W9' and ' E1A7W9 ' all become 'E1A7W9'. The postal-code
      validation accepts a space or hyphen separator and surrounding
      whitespace, so those forms must collapse to one value here or
      equivalent codes would not be recognised as duplicates.
    * street_name, municipality: surrounding whitespace stripped, title-cased.

    The frame is modified in place and also returned.
    """
    df['postal_code'] = (
        df['postal_code']
        .str.upper()
        .str.replace(r'[\s-]+', '', regex=True)
    )
    df['street_name'] = df['street_name'].str.strip().str.title()
    df['municipality'] = df['municipality'].str.strip().str.title()
    return df
# Normalise the key text columns so the duplicate checks that follow compare like with like.
df_clean_postal = standardize_text_fields(df_clean_postal)

In [24]:

# A row is a duplicate when an earlier row has the same street_name and
# address_number plus either the same municipality or the same postal code.
# The first occurrence is always kept.
repeats_by_municipality = df_clean_postal.duplicated(
    subset=['street_name', 'address_number', 'municipality'], keep='first'
)
repeats_by_postal_code = df_clean_postal.duplicated(
    subset=['street_name', 'address_number', 'postal_code'], keep='first'
)
mask = repeats_by_municipality | repeats_by_postal_code

df_final = df_clean_postal[~mask]

In [None]:
def remove_duplicates(df):
    """Return df without rows that repeat an earlier row's address.

    Two rows are the same address when street_name and address_number agree
    and, in addition, either municipality or postal_code agrees. The first
    occurrence is kept; df itself is not modified.

    NOTE: a later cell defines remove_duplicates(input_df, master_df); once
    that cell runs, this single-argument version is no longer reachable by name.
    """
    address_keys = ['street_name', 'address_number']
    repeats_by_municipality = df.duplicated(subset=[*address_keys, 'municipality'], keep='first')
    repeats_by_postal_code = df.duplicated(subset=[*address_keys, 'postal_code'], keep='first')

    keep_rows = ~(repeats_by_municipality | repeats_by_postal_code)
    return df[keep_rows]

## Now we de-duplicate with the db. 

In [None]:
import geopandas as gpd
from shapely.geometry import box
from geopy.distance import distance
from shapely import box

def build_bbox(df, buffer_km=5):
    """Return the lat/lon extent of df, padded by roughly buffer_km on every side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'lat' and 'lon' columns.
    buffer_km : float, optional
        Padding added to each side of the extent, in kilometres (default 5).

    Returns
    -------
    tuple
        ``(min_lon, min_lat, max_lon, max_lat)`` of the padded box - the
        argument order ``ST_MakeEnvelope`` takes.
    """
    min_lat = df['lat'].min()
    max_lat = df['lat'].max()
    min_lon = df['lon'].min()
    max_lon = df['lon'].max()

    # The corners are pushed outwards along the diagonal. Travelling d km on a
    # 45-degree bearing shifts each axis by only about d / sqrt(2), so the
    # diagonal step is scaled by sqrt(2) to pad each side by buffer_km.
    diagonal_step = distance(kilometers=buffer_km * 2 ** 0.5)

    bottom_left = (min_lat, min_lon)
    top_right = (max_lat, max_lon)
    bottom_left_buffered = diagonal_step.destination(bottom_left, 225)  # 225 degrees is southwest
    top_right_buffered = diagonal_step.destination(top_right, 45)       # 45 degrees is northeast

    bbox_buffered = (bottom_left_buffered.longitude, bottom_left_buffered.latitude,
                     top_right_buffered.longitude, top_right_buffered.latitude)

    return bbox_buffered

# Buffered (min_lon, min_lat, max_lon, max_lat) extent of the internally de-duplicated upload rows.
bbox_buffered = build_bbox(df_final)

In [None]:

# NOTE(review): the URL is an empty string here - presumably it must be filled
# in (or get_master_db_connection falls back to the environment loaded by
# dotenv) before this cell can connect; confirm against the helper.
MASTER_DB_URL=''
conn = get_master_db_connection(MASTER_DB_URL)

In [27]:
# Fetch every master-db property whose geom intersects the buffered bounding
# box (SRID 4326); these rows are the candidates for de-duplication.
query = f"""
SELECT * FROM properties WHERE ST_Intersects(geom, ST_MakeEnvelope({bbox_buffered[0]}, {bbox_buffered[1]}, {bbox_buffered[2]}, {bbox_buffered[3]}, 4326));
"""
master_addresses = gpd.read_postgis(sql=query, con=conn, geom_col='geom')
retrieved_count = len(master_addresses)
print(f"Retrieved {retrieved_count} addresses from master database within bounding box.")

  df = pd.read_sql(


Retrieved 23 addresses from master database within bounding box.


  return pd.read_sql(spatial_ref_sys_sql, con)


In [28]:
# Preview the first rows fetched from the master db.
master_addresses.head()

Unnamed: 0,id,address_id,address_number,country,lat,lon,postal_code,street_name,municipality,province_state,geom,updated_at
0,4b4f96c2-48a0-462b-b388-e6e4587865f6,8980286175673212036,287,Canada,46.096984,-64.714013,E1A7W9,Manon Street,Dieppe,New Brunswick,POINT (-64.71401 46.09698),2026-01-28 20:35:16.413808+00:00
1,56080f32-f758-4ff7-8c07-cc6cdd7fd648,2385072820701278606,58,Canada,46.084448,-64.823529,E1E3X2,Briarwood,Moncton,New Brunswick,POINT (-64.82353 46.08445),2026-01-28 20:35:16.413808+00:00
2,f9c40a96-d689-4194-999b-a9f7d0365283,6735574806161078946,41,Canada,46.077931,-64.708625,E1A8A1,Cyr Street,Dieppe,New Brunswick,POINT (-64.70862 46.07793),2026-01-28 20:35:16.413808+00:00
3,520e8ca3-bcd0-4885-8763-cc4bc89b8d7d,2110689378835180626,11,Canada,46.053524,-64.782852,E1B0R5,Rosebank Crescent,Riverview,New Brunswick,POINT (-64.78285 46.05352),2026-01-28 20:35:16.413808+00:00
4,a00270de-3579-40b9-aa66-6d31b0425830,5841912189221429131,424,Canada,46.107994,-64.749371,E1A2T1,Shediac Road,Moncton,New Brunswick,POINT (-64.74937 46.10799),2026-01-28 20:35:16.413808+00:00


In [30]:
# Stage 1: discard upload rows whose address_id already exists in the master db.
inital_row_count = len(df_final)
already_known_id = df_final['address_id'].isin(master_addresses['address_id'])
df_final = df_final[~already_known_id]

# Put the master rows through the same text standardization as the upload rows.
master_addresses = standardize_text_fields(master_addresses)


# Stage 2: discard upload rows that match a master row on street_name and
# address_number plus either municipality or postal_code.
# NOTE(review): the equality tests are dtype-sensitive - confirm address_number
# has the same type in the CSV frame and in the database result.
def _matches_master_address(row):
    same_door = (
        (master_addresses['street_name'] == row['street_name']) &
        (master_addresses['address_number'] == row['address_number'])
    )
    same_area = (
        (master_addresses['municipality'] == row['municipality']) |
        (master_addresses['postal_code'] == row['postal_code'])
    )
    return (same_door & same_area).any()


mask = df_final.apply(_matches_master_address, axis=1)

df_final_unique = df_final[~mask]
final_row_count = len(df_final_unique)
# Counts rows removed by either stage.
dropped_due_to_master = inital_row_count - final_row_count
print(f"Dropped {dropped_due_to_master} rows that matched existing addresses in master database.")

Dropped 0 rows that matched existing addresses in master database.


In [None]:
def remove_duplicates(input_df, master_df):
    """Return the rows of input_df that do not already exist in master_df.

    An input row counts as existing when some master row has the same
    street_name and address_number and, in addition, either the same
    municipality or the same postal_code. Neither frame is modified.

    NOTE: this reuses the name of the single-argument remove_duplicates(df)
    defined in an earlier cell and replaces it once this cell runs.
    """
    def _exists_in_master(row):
        same_door = (
            (master_df['street_name'] == row['street_name']) &
            (master_df['address_number'] == row['address_number'])
        )
        same_area = (
            (master_df['municipality'] == row['municipality']) |
            (master_df['postal_code'] == row['postal_code'])
        )
        return (same_door & same_area).any()

    mask = input_df.apply(_exists_in_master, axis=1)
    return input_df[~mask]

# Upload properties to db. 

In [31]:
# Split the de-duplicated rows in two: pool attributes (keyed by address_id)
# and the address record without the pool column.
POOL_COLUMNS = ['address_id', 'pool_type']

pool_df = df_final_unique.loc[:, POOL_COLUMNS].copy()
address_df = df_final_unique.drop(columns=['pool_type'])

address_df.head()

Unnamed: 0,address_number,municipality,lat,lon,postal_code,street_name,province_state,country,geom,address_id
0,595,Dieppe,46.038099,-64.695354,E1A7K6,Dover Road,NB,Canada,POINT(-64.69535435037182 46.03809929328514),206712
1,120,Moncton,46.059133,-64.885075,E1B5G1,Acacia Drive,NB,Canada,POINT(-64.8850753446976 46.05913287378083),206711
2,26,Dieppe,46.101863,-64.682485,E1A1R5,Lorette Street,NB,Canada,POINT(-64.68248518917952 46.10186259923136),206715
3,35,Dieppe,46.101701,-64.677647,E1A1R5,Appleton Street,NB,Canada,POINT(-64.67764705740598 46.10170090063899),206716
4,268,Moncton,46.104572,-64.743529,E1A5R8,Highlandview Road,NB,Canada,POINT(-64.74352927911653 46.10457231079941),206718


In [38]:
# Upload address_df to the master db's properties table using plain SQL.
from shapely.geometry import Point
conn = get_master_db_connection(MASTER_DB_URL)
# Ensure geom column has Point geometries.
# NOTE: the INSERT below builds its WKT from lon/lat and does not read this column.
if 'geom' not in address_df.columns or address_df['geom'].dtype == 'object':
    address_df['geom'] = address_df.apply(lambda row: Point(row['lon'], row['lat']), axis=1)

# Prepare the insert statement
insert_query = """
INSERT INTO properties (address_id, address_number, country, lat, lon, postal_code, 
                        street_name, municipality, province_state, geom)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, ST_GeogFromText(%s))
"""

# All rows go in one transaction: either every address is committed or none is.
cursor = None
try:
    cursor = conn.cursor()
    rows_inserted = 0

    for _, row in address_df.iterrows():
        # Create WKT representation of the point (longitude first)
        wkt = f"POINT({row['lon']} {row['lat']})"

        values = (
            int(row['address_id']),
            row['address_number'],
            row['country'],
            float(row['lat']),
            float(row['lon']),
            row['postal_code'],
            row['street_name'],
            row['municipality'],
            row['province_state'],
            wkt
        )

        cursor.execute(insert_query, values)
        rows_inserted += 1

    conn.commit()

except Exception as e:
    conn.rollback()
    print(f"Error uploading addresses: {e}")
    raise
else:
    print(f"Successfully uploaded {rows_inserted} addresses to properties table.")
finally:
    # Release the cursor on failure too; previously it was closed only on success.
    if cursor is not None:
        cursor.close()


Successfully uploaded 2527 addresses to properties table.


In [49]:
# Step 1: Generate pool_id values for pool_df using a time-and-random hash
import time
import hashlib
import secrets

def generate_time_based_id():
    """Return a non-negative integer id that fits a PostgreSQL BIGINT.

    The id is derived from a SHA-256 hash of the current wall-clock time, the
    nanosecond clock, and 16 bytes from ``secrets.token_bytes``. The random
    bytes are what keep two calls made within the same clock tick from
    returning the same id; the clock values alone repeat whenever the clock
    does not advance between calls.

    Returns
    -------
    int
        A value in ``[0, 9223372036854775807)``.
    """
    timestamp = str(time.time()).encode()
    clock_ns = str(time.time_ns()).encode()
    random_component = secrets.token_bytes(16)
    hash_obj = hashlib.sha256(timestamp + clock_ns + random_component)
    # Take the first 64 bits of the digest and fold them into the signed
    # 64-bit range (max signed bigint is 9223372036854775807).
    hash_int = int(hash_obj.hexdigest()[:16], 16)
    return hash_int % 9223372036854775807

# Give every pool row a freshly generated pool_id; any existing values are overwritten.
fresh_pool_ids = []
for _ in range(len(pool_df)):
    fresh_pool_ids.append(generate_time_based_id())
pool_df['pool_id'] = fresh_pool_ids

pool_df.head()


Unnamed: 0,address_id,pool_type,pool_id,property_id
0,206712,in-ground-pool-closed,3986387544446953920,e8b201de-105c-4cb5-bd25-d477596ffa7c
1,206711,in-ground-pool-closed,5707433658476730178,05fa2b27-267e-47be-883e-298dea6d4fc5
2,206715,above-ground-pool-open,4178940787347723104,7a43216a-6050-49fd-aa61-d2ea514f2b19
3,206716,above-ground-pool-closed,2346899496511946761,6182ffd4-139f-48f7-9118-e28b0a72e376
4,206718,above-ground-pool-closed,7443731055373392643,e462df0e-2962-4b31-a572-2963d389dee0


In [50]:
# Step 2: Look up the property UUIDs for the address_ids that were just inserted
address_ids = pool_df['address_id'].tolist()

# Fetch (id, address_id) pairs from the properties table for those address_ids
query = """
SELECT id, address_id 
FROM properties 
WHERE address_id = ANY(%s)
"""
conn = get_master_db_connection(MASTER_DB_URL)
cursor = conn.cursor()
cursor.execute(query, (address_ids,))
property_mappings = cursor.fetchall()
cursor.close()

# Build the lookup address_id -> property id (UUID); each fetched row is (id, address_id)
address_to_property = {}
for mapping_row in property_mappings:
    address_to_property[mapping_row[1]] = mapping_row[0]

# Attach the property id to each pool row; rows with no match get a null
pool_df['property_id'] = pool_df['address_id'].map(address_to_property)

# Report any pool rows that found no property
has_property = pool_df['property_id'].notnull()
unmapped = pool_df[~has_property]
if len(unmapped) > 0:
    print(f"Warning: {len(unmapped)} pools couldn't be mapped to properties")
    print(unmapped)

print(f"Mapped {len(pool_df[has_property])} pools to properties")
pool_df.head()


Mapped 2527 pools to properties


Unnamed: 0,address_id,pool_type,pool_id,property_id
0,206712,in-ground-pool-closed,3986387544446953920,e8b201de-105c-4cb5-bd25-d477596ffa7c
1,206711,in-ground-pool-closed,5707433658476730178,05fa2b27-267e-47be-883e-298dea6d4fc5
2,206715,above-ground-pool-open,4178940787347723104,7a43216a-6050-49fd-aa61-d2ea514f2b19
3,206716,above-ground-pool-closed,2346899496511946761,6182ffd4-139f-48f7-9118-e28b0a72e376
4,206718,above-ground-pool-closed,7443731055373392643,e462df0e-2962-4b31-a572-2963d389dee0


In [51]:
# Step 3: Upload pools to database
# Only pools that were matched to a property can be inserted.
pool_df_upload = pool_df[pool_df['property_id'].notnull()].copy()

# Insert pools
insert_query = """
INSERT INTO pools (pool_id, property_id, pool_type)
VALUES (%s, %s, %s)
"""

# All rows go in one transaction: either every pool is committed or none is.
cursor = None
try:
    cursor = conn.cursor()
    pools_inserted = 0

    for _, row in pool_df_upload.iterrows():
        values = (
            int(row['pool_id']),
            row['property_id'],  # value fetched from properties.id in the mapping step
            row['pool_type'] if pd.notna(row['pool_type']) else None  # NaN -> SQL NULL
        )

        cursor.execute(insert_query, values)
        pools_inserted += 1

    conn.commit()

except Exception as e:
    conn.rollback()
    print(f"Error uploading pools: {e}")
    raise
else:
    print(f"Successfully uploaded {pools_inserted} pools to database.")
finally:
    # Release the cursor on failure too; previously it was closed only on success.
    if cursor is not None:
        cursor.close()


Successfully uploaded 2527 pools to database.
