In [1]:
from pathlib import Path

import pandas as pd

from mock_up_data_model_data import DataFaker

# Directory (relative to the notebook's working directory) used for all generated files.
data_dir = Path(".") / "output"

In this notebook, I'll show the execution of the incremental data faker.

## Full load 1

In [2]:
# Settings for the initial, non-incremental run. Per the captured cell output,
# constructing DataFaker with these values runs in FULL mode and generates the files.
full_load_options = dict(
    output_dir=data_dir,
    seed=42,
    address_count=250_000,
    incremental=False,
    update_percentage=0.05,
    new_percentage=0.02,
)
data_maker = DataFaker(**full_load_options)


Business Network CSV Data Generator
Random Seed: 42 (reproducible data)
Output Directory: output
Timestamp: 20260120_034426
Mode: FULL

Generating 250,000 addresses...
  Progress: 10,000/250,000 (4.0%)
  Progress: 20,000/250,000 (8.0%)
  Progress: 30,000/250,000 (12.0%)
  Progress: 40,000/250,000 (16.0%)
  Progress: 50,000/250,000 (20.0%)
  Progress: 60,000/250,000 (24.0%)
  Progress: 70,000/250,000 (28.0%)
  Progress: 80,000/250,000 (32.0%)
  Progress: 90,000/250,000 (36.0%)
  Progress: 100,000/250,000 (40.0%)
  Progress: 110,000/250,000 (44.0%)
  Progress: 120,000/250,000 (48.0%)
  Progress: 130,000/250,000 (52.0%)
  Progress: 140,000/250,000 (56.0%)
  Progress: 150,000/250,000 (60.0%)
  Progress: 160,000/250,000 (64.0%)
  Progress: 170,000/250,000 (68.0%)
  Progress: 180,000/250,000 (72.0%)
  Progress: 190,000/250,000 (76.0%)
  Progress: 200,000/250,000 (80.0%)
  Progress: 210,000/250,000 (84.0%)
  Progress: 220,000/250,000 (88.0%)
  Progress: 230,000/250,000 (92.0%)
  Progress: 24

In [22]:
# Show the dataset summary file written into the output directory.
# Read it with pathlib instead of shelling out to `cat`: the shell escape only works
# where `cat` exists and breaks if the interpolated path contains spaces.
# end="" avoids adding an extra newline to the file's own content.
print(data_dir.joinpath("DATASET_INFO.txt").read_text(), end="")

Business Network Mock Data - Dataset Summary

Generated: 2026-01-19 21:44:44
Timestamp: 20260120_034426
Random Seed: 42
Output Directory: output

Dataset Contents:
----------------------------------------------------------------------
Addresses:           250,000 rows
Businesses:          2,500 rows
Trading Partnerships: 20,000 rows
Total Records:       272,541 rows

Files Generated:
----------------------------------------------------------------------
1. addresses_20260120_034426.csv - Business location data
2. businesses_20260120_034426.csv - Business profile information
3. trading_partnerships_20260120_034426.csv - Business relationships

Reproducibility:
----------------------------------------------------------------------
To regenerate this exact dataset, use seed: 42
  python generate_csv_data.py 42



In [3]:
# Alphabetically sorted names of every CSV currently in the output directory.
file_names = sorted(
    entry.name for entry in data_dir.iterdir() if entry.name.endswith(".csv")
)
file_names

['addresses_20260120_034426.csv',
 'businesses_20260120_034426.csv',
 'trading_partnerships_20260120_034426.csv']

## Incremental Run 1

In [4]:
# Settings for the first incremental run (different seed, incremental=True).
# Per the captured cell output, this runs in INCREMENTAL mode against the existing files.
incremental_run_1_options = dict(
    output_dir=data_dir,
    seed=314_159,
    address_count=250_000,
    incremental=True,
    update_percentage=0.05,
    new_percentage=0.02,
)
data_maker = DataFaker(**incremental_run_1_options)


Business Network CSV Data Generator
Random Seed: 314159 (reproducible data)
Output Directory: output
Timestamp: 20260120_034444
Mode: INCREMENTAL (update=5.0%, new=2.0%)

Found existing files. Generating incremental update...
Addresses: 250,000 existing, 12,500 updates, 5,000 new
Businesses: 2,500 existing, 125 updates, 50 new
Partnerships: 20,000 existing, 1,000 updates, 400 new

Updating addresses file...
 Created output/addresses_20260120_034444.csv (255,000 rows)
Updating businesses file...
 Created output/businesses_20260120_034444.csv (2,550 rows)
Updating partnerships file...
 Created output/trading_partnerships_20260120_034444.csv (20,400 rows)

 Incremental update complete!



In [5]:
# Alphabetically sorted names of every CSV currently in the output directory.
file_names = sorted(
    entry.name for entry in data_dir.iterdir() if entry.name.endswith(".csv")
)
file_names

['addresses_20260120_034426.csv',
 'addresses_20260120_034444.csv',
 'businesses_20260120_034426.csv',
 'businesses_20260120_034444.csv',
 'trading_partnerships_20260120_034426.csv',
 'trading_partnerships_20260120_034444.csv']

## Incremental Run 2

In [6]:
# Settings for the second incremental run (again a new seed, incremental=True).
# Per the captured cell output, this runs in INCREMENTAL mode against the existing files.
incremental_run_2_options = dict(
    output_dir=data_dir,
    seed=2_718,
    address_count=250_000,
    incremental=True,
    update_percentage=0.05,
    new_percentage=0.02,
)
data_maker = DataFaker(**incremental_run_2_options)


Business Network CSV Data Generator
Random Seed: 2718 (reproducible data)
Output Directory: output
Timestamp: 20260120_034446
Mode: INCREMENTAL (update=5.0%, new=2.0%)

Found existing files. Generating incremental update...
Addresses: 255,000 existing, 12,750 updates, 5,100 new
Businesses: 2,550 existing, 127 updates, 51 new
Partnerships: 20,400 existing, 1,020 updates, 408 new

Updating addresses file...
 Created output/addresses_20260120_034446.csv (260,100 rows)
Updating businesses file...
 Created output/businesses_20260120_034446.csv (2,601 rows)
Updating partnerships file...
 Created output/trading_partnerships_20260120_034446.csv (20,808 rows)

 Incremental update complete!



In [7]:
# Alphabetically sorted names of every CSV currently in the output directory.
file_names = sorted(
    entry.name for entry in data_dir.iterdir() if entry.name.endswith(".csv")
)
file_names

['addresses_20260120_034426.csv',
 'addresses_20260120_034444.csv',
 'addresses_20260120_034446.csv',
 'businesses_20260120_034426.csv',
 'businesses_20260120_034444.csv',
 'businesses_20260120_034446.csv',
 'trading_partnerships_20260120_034426.csv',
 'trading_partnerships_20260120_034444.csv',
 'trading_partnerships_20260120_034446.csv']

# Data Examination

In [9]:
# Pick each table's snapshot files by name prefix instead of by position in
# `file_names`. With positional indexing, any extra CSV in the output directory
# (e.g. left over from an earlier run) silently shifts tables onto the wrong variables.
snapshot_files = {
    prefix: [name for name in file_names if name.startswith(f"{prefix}_")]
    for prefix in ("addresses", "businesses", "trading_partnerships")
}
for prefix, names in snapshot_files.items():
    # This notebook performs one full load plus two incremental runs per table.
    if len(names) != 3:
        raise ValueError(
            f"Expected 3 '{prefix}' snapshots in {data_dir}, found {len(names)}: {names}"
        )

# `file_names` is sorted and the names embed a fixed-width timestamp, so within each
# table index 0 is the full load and index 2 is the latest incremental snapshot.
# postal_code is read as text: with default type inference the column becomes an
# integer, which appears to drop leading zeros (a 4-digit value shows up in the preview).
addr1_df, addr2_df, addr3_df = (
    pd.read_csv(data_dir.joinpath(name), dtype={"postal_code": str})
    for name in snapshot_files["addresses"]
)

business1_df, business2_df, business3_df = (
    pd.read_csv(data_dir.joinpath(name)) for name in snapshot_files["businesses"]
)

trading_partners1_df, trading_partners2_df, trading_partners3_df = (
    pd.read_csv(data_dir.joinpath(name))
    for name in snapshot_files["trading_partnerships"]
)

## Address data

In [10]:
# For each address snapshot (full load, then the two incremental runs),
# show its dimensions followed by its first two rows.
for address_snapshot in (addr1_df, addr2_df, addr3_df):
    print(address_snapshot.shape)
    display(address_snapshot.head(2))

(250000, 8)


Unnamed: 0,address_id,street_address,city,state,postal_code,country,created_at,updated_at
0,1,43321 Brittany Bypass,North Jefferyhaven,Rhode Island,3979,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z
1,2,863 Lawrence Valleys,Port Lindachester,Michigan,36922,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z


(255000, 8)


Unnamed: 0,address_id,street_address,city,state,postal_code,country,created_at,updated_at
0,1,43321 Brittany Bypass,North Jefferyhaven,Rhode Island,3979,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z
1,2,863 Lawrence Valleys,Port Lindachester,Michigan,36922,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z


(260100, 8)


Unnamed: 0,address_id,street_address,city,state,postal_code,country,created_at,updated_at
0,1,43321 Brittany Bypass,North Jefferyhaven,Rhode Island,3979,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z
1,2,863 Lawrence Valleys,Port Lindachester,Michigan,36922,United States,2026-01-20T03:44:26Z,2026-01-20T03:44:26Z


In [11]:
# Number of address rows per `updated_at` timestamp in the latest snapshot.
updated_at_stamps = addr3_df["updated_at"]
updated_at_stamps.value_counts()

updated_at
2026-01-20T03:44:47Z    17850
2026-01-20T03:44:45Z    16616
2026-01-20T03:44:30Z    13122
2026-01-20T03:44:42Z    13074
2026-01-20T03:44:41Z    13041
2026-01-20T03:44:37Z    13034
2026-01-20T03:44:39Z    13010
2026-01-20T03:44:36Z    13004
2026-01-20T03:44:38Z    12985
2026-01-20T03:44:27Z    12915
2026-01-20T03:44:28Z    12889
2026-01-20T03:44:31Z    12846
2026-01-20T03:44:32Z    12813
2026-01-20T03:44:40Z    12798
2026-01-20T03:44:29Z    12765
2026-01-20T03:44:35Z    12741
2026-01-20T03:44:34Z    12731
2026-01-20T03:44:33Z    12580
2026-01-20T03:44:43Z    12398
2026-01-20T03:44:26Z     6888
Name: count, dtype: int64

In [12]:
# Number of address rows per `created_at` timestamp in the latest snapshot.
created_at_stamps = addr3_df["created_at"]
created_at_stamps.value_counts()

created_at
2026-01-20T03:44:30Z    14553
2026-01-20T03:44:41Z    14486
2026-01-20T03:44:42Z    14472
2026-01-20T03:44:39Z    14461
2026-01-20T03:44:36Z    14446
2026-01-20T03:44:37Z    14415
2026-01-20T03:44:38Z    14390
2026-01-20T03:44:28Z    14326
2026-01-20T03:44:27Z    14306
2026-01-20T03:44:31Z    14251
2026-01-20T03:44:40Z    14167
2026-01-20T03:44:32Z    14144
2026-01-20T03:44:29Z    14143
2026-01-20T03:44:35Z    14086
2026-01-20T03:44:34Z    14047
2026-01-20T03:44:33Z    13999
2026-01-20T03:44:43Z    13702
2026-01-20T03:44:26Z     7606
2026-01-20T03:44:47Z     5100
2026-01-20T03:44:45Z     5000
Name: count, dtype: int64

## Business data

In [13]:
# For each business snapshot (full load, then the two incremental runs),
# show its dimensions followed by its first two rows.
for business_snapshot in (business1_df, business2_df, business3_df):
    print(business_snapshot.shape)
    display(business_snapshot.head(2))

(2500, 15)


Unnamed: 0,business_id,name,legal_name,tax_id,industry,description,email,phone,website,employee_count,annual_revenue,address_id,is_active,created_at,updated_at
0,1,Carrillo-Roberts,Carrillo-Roberts Inc.,98-5661569,Technology,Deliver sticky niches,nancynorris@foster.com,214.691.8804,https://shaffer.org/,250,16534414,58514,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z
1,2,"Williams, Murphy and Donaldson","Williams, Murphy and Donaldson LLC",27-6968995,Real Estate,Generate interactive convergence,kadams@leonard.com,385.595.7196x7940,https://www.nelson.net/,10000,5934366,154795,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z


(2550, 15)


Unnamed: 0,business_id,name,legal_name,tax_id,industry,description,email,phone,website,employee_count,annual_revenue,address_id,is_active,created_at,updated_at
0,1,Carrillo-Roberts,Carrillo-Roberts Inc.,98-5661569,Technology,Deliver sticky niches,nancynorris@foster.com,214.691.8804,https://shaffer.org/,250,16534414,58514,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z
1,2,"Williams, Murphy and Donaldson","Williams, Murphy and Donaldson LLC",27-6968995,Real Estate,Generate interactive convergence,kadams@leonard.com,385.595.7196x7940,https://www.nelson.net/,10000,5934366,154795,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z


(2601, 15)


Unnamed: 0,business_id,name,legal_name,tax_id,industry,description,email,phone,website,employee_count,annual_revenue,address_id,is_active,created_at,updated_at
0,1,Carrillo-Roberts,Carrillo-Roberts Inc.,98-5661569,Technology,Deliver sticky niches,nancynorris@foster.com,214.691.8804,https://shaffer.org/,250,16534414,58514,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z
1,2,"Williams, Murphy and Donaldson","Williams, Murphy and Donaldson LLC",27-6968995,Real Estate,Generate interactive convergence,kadams@leonard.com,385.595.7196x7940,https://www.nelson.net/,10000,5934366,154795,True,2026-01-20T03:44:43Z,2026-01-20T03:44:43Z


In [14]:
# Distinct `business_id` count per snapshot; compare against the row counts shown above.
for business_snapshot in (business1_df, business2_df, business3_df):
    print(business_snapshot["business_id"].nunique())

2500
2550
2601


## Trading Partnership data

In [15]:
# For each trading-partnership snapshot (full load, then the two incremental runs),
# show its dimensions followed by its first two rows.
for partnership_snapshot in (
    trading_partners1_df,
    trading_partners2_df,
    trading_partners3_df,
):
    print(partnership_snapshot.shape)
    display(partnership_snapshot.head(2))

(20000, 11)


Unnamed: 0,partnership_id,business1_id,business2_id,is_active,partnership_type,start_date,end_date,contract_value,notes,created_at,updated_at
0,1,1751,1624,True,supplier,2023-06-26T07:07:41.244836,,515718.0,Skin western marriage everything choice him my...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z
1,2,2099,1530,False,distributor,2022-08-17T18:07:45.546344,2024-01-07T20:10:31.396890,4239633.0,Director method local occur financial painting...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z


(20400, 11)


Unnamed: 0,partnership_id,business1_id,business2_id,is_active,partnership_type,start_date,end_date,contract_value,notes,created_at,updated_at
0,1,1751,1624,True,supplier,2023-06-26T07:07:41.244836,,515718.0,Skin western marriage everything choice him my...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z
1,2,2099,1530,False,distributor,2022-08-17T18:07:45.546344,2024-01-07T20:10:31.396890,4239633.0,Director method local occur financial painting...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z


(20808, 11)


Unnamed: 0,partnership_id,business1_id,business2_id,is_active,partnership_type,start_date,end_date,contract_value,notes,created_at,updated_at
0,1,1751,1624,True,supplier,2023-06-26T07:07:41.244836,,515718.0,Skin western marriage everything choice him my...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z
1,2,2099,1530,False,distributor,2022-08-17T18:07:45.546344,2024-01-07T20:10:31.396890,4239633.0,Director method local occur financial painting...,2026-01-20T03:44:44Z,2026-01-20T03:44:44Z


In [16]:
# Distinct `partnership_id` count per snapshot; compare against the row counts shown above.
for partnership_snapshot in (
    trading_partners1_df,
    trading_partners2_df,
    trading_partners3_df,
):
    print(partnership_snapshot["partnership_id"].nunique())

20000
20400
20808


In [17]:
# Number of distinct businesses appearing on the `business1_id` side, per snapshot.
for partnership_snapshot in (
    trading_partners1_df,
    trading_partners2_df,
    trading_partners3_df,
):
    print(partnership_snapshot["business1_id"].nunique())

2499
2504
2517


In [18]:
# Number of distinct businesses appearing on the `business2_id` side, per snapshot.
for partnership_snapshot in (
    trading_partners1_df,
    trading_partners2_df,
    trading_partners3_df,
):
    print(partnership_snapshot["business2_id"].nunique())

2500
2506
2517


## Checking relational constraints

Every `address_id` referenced in the latest business snapshot also exists in the latest address snapshot (the set difference below is empty). Success!

Likewise, every `business1_id` and `business2_id` in the latest trading partnership snapshot exists in the latest business snapshot. Success again!

In [19]:
# Address ids referenced by businesses but missing from the address table
# (an empty set means every reference resolves).
referenced_address_ids = set(business3_df["address_id"].unique())
known_address_ids = set(addr3_df["address_id"].unique())
referenced_address_ids.difference(known_address_ids)

set()

In [20]:
# `business1_id` values referenced by partnerships but missing from the business table
# (an empty set means every reference resolves).
referenced_business1_ids = set(trading_partners3_df["business1_id"].unique())
known_business_ids = set(business3_df["business_id"].unique())
referenced_business1_ids.difference(known_business_ids)

set()

In [21]:
# `business2_id` values referenced by partnerships but missing from the business table
# (an empty set means every reference resolves).
referenced_business2_ids = set(trading_partners3_df["business2_id"].unique())
known_business_ids = set(business3_df["business_id"].unique())
referenced_business2_ids.difference(known_business_ids)

set()