In [19]:
import numpy as np
import pandas as pd
np.random.seed(42)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys


In [6]:
df_counts = pd.read_excel("raw_data.xlsx")
df_counts.head()

Unnamed: 0,neighbourhood,count
0,Oud Scheveningen,2983
1,Vissershaven,4416
2,Scheveningen Badplaats,5670
3,Visserijbuurt,4048
4,v Stolkpark/Schev Bosjes,843


In [8]:
df_expanded = df_counts.loc[df_counts.index.repeat(df_counts["count"])].reset_index(drop=True)
df_expanded.shape


(563122, 2)

In [10]:
seed = 42
rng = np.random.RandomState(seed)
n_agents = len(df_expanded)
print(n_agents)

563122


In [11]:
# create a random permutation of 1…n_agents
random_ids = rng.permutation(n_agents) + 1

df_expanded["Individual_ID"] = random_ids

# ─── Step 4: Select & reorder columns, and write to CSV ────────────────────────
df_agents = df_expanded[["Individual_ID", "neighbourhood"]]
df_agents.to_csv("agents.csv", index=False)


In [12]:
print(f"Generated agents.csv with {n_agents:,} rows.")

Generated agents.csv with 563,122 rows.


In [62]:
# Load the raw distribution data
df = pd.read_excel('raw_data.xlsx')


# Display DataFrame info for column types and non-null counts
print("DataFrame info:")
print(df.info(), "\n")


DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   neighbourhood                           114 non-null    object 
 1   count                                   111 non-null    float64
 2   0 to 4-year-olds [persons]              114 non-null    object 
 3   5 to 14-year-olds [persons]             114 non-null    object 
 4   15 to 19-year-olds [persons]            114 non-null    object 
 5   20 to 44-year-olds [persons]            114 non-null    object 
 6   45 to 64 year-olds [persons]            114 non-null    int64  
 7   65 to 79-year-olds [persons]            114 non-null    object 
 8   80-year-olds and above [persons]        114 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 14.4+ KB
None 



In [63]:
df.head()

Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)]
0,Oud Scheveningen,2958.0,148,272,116,1029,831,436,126,0.05,0.092,0.039,0.348,0.281,0.147,0.043
1,Vissershaven,4699.0,200,458,220,1526,1276,762,257,0.043,0.097,0.047,0.325,0.272,0.162,0.055
2,Scheveningen Badplaats,5682.0,241,528,267,2114,1534,789,209,0.042,0.093,0.047,0.372,0.27,0.139,0.037
3,Visserijbuurt,4086.0,189,443,215,1365,1021,595,258,0.046,0.108,0.053,0.334,0.25,0.146,0.063
4,v Stolkpark/Schev Bosjes,814.0,27,71,55,190,244,169,58,0.033,0.087,0.068,0.233,0.3,0.208,0.071


In [64]:
rows_with_x = df[df.apply(lambda row: row.astype(str).str.contains('x').any(), axis=1)]
rows_with_x.head(25)

Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)]
32,Zuiderpark,99.0,x,x,x,x,20,49,26,x,x,x,x,0.202,0.495,0.263
45,Haagse Bos,460.0,8,x,9,153,92,116,79,0.017,x,0.02,0.333,0.2,0.252,0.172
52,Binckhorst,2058.0,127,67,36,1446,311,70,x,0.062,0.033,0.017,0.703,0.151,0.034,x
73,Bosjes van Pex,386.0,9,35,22,82,134,84,20,0.023,0.091,0.057,0.212,0.347,0.218,0.052
77,Kerketuinen/Zichtenburg,156.0,x,x,x,80,69,5,x,x,x,x,0.513,0.442,0.032,x
102,De Reef,744.0,75,109,34,390,105,28,x,0.101,0.147,0.046,0.524,0.141,0.038,x
108,Vlietzoom-Oost,102.0,x,7,x,22,29,30,7,x,0.069,x,0.216,0.284,0.294,0.069
109,De Rivieren,40.0,x,5,x,12,16,x,x,x,0.125,x,0.3,0.4,x,x


In [65]:
# Convert columns 2 to 8 to numeric, coercing errors to NaN, and then sum them row-wise
rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)

# Subtract column 1 ("count") from the sum to calculate "unaccounted_for_people"
rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]

# Display the updated DataFrame
rows_with_x.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]


Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)],accounted_for_people,unaccounted_for_people
32,Zuiderpark,99.0,x,x,x,x,20,49,26,x,x,x,x,0.202,0.495,0.263,95.0,4.0
45,Haagse Bos,460.0,8,x,9,153,92,116,79,0.017,x,0.02,0.333,0.2,0.252,0.172,457.0,3.0
52,Binckhorst,2058.0,127,67,36,1446,311,70,x,0.062,0.033,0.017,0.703,0.151,0.034,x,2057.0,1.0
73,Bosjes van Pex,386.0,9,35,22,82,134,84,20,0.023,0.091,0.057,0.212,0.347,0.218,0.052,386.0,0.0
77,Kerketuinen/Zichtenburg,156.0,x,x,x,80,69,5,x,x,x,x,0.513,0.442,0.032,x,154.0,2.0
102,De Reef,744.0,75,109,34,390,105,28,x,0.101,0.147,0.046,0.524,0.141,0.038,x,741.0,3.0
108,Vlietzoom-Oost,102.0,x,7,x,22,29,30,7,x,0.069,x,0.216,0.284,0.294,0.069,95.0,7.0
109,De Rivieren,40.0,x,5,x,12,16,x,x,x,0.125,x,0.3,0.4,x,x,33.0,7.0


In [66]:
x_counts = rows_with_x.iloc[:, 2:9].apply(lambda col: col.astype(str).str.count('x')).sum(axis=1)
x_counts_dict = dict(zip(rows_with_x["neighbourhood"], x_counts))
print(x_counts_dict)

{'Zuiderpark': 4, 'Haagse Bos': 1, 'Binckhorst': 1, 'Bosjes van Pex': 0, 'Kerketuinen/Zichtenburg': 4, 'De Reef': 1, 'Vlietzoom-Oost': 2, 'De Rivieren': 4}


In [67]:
rows_to_update = ['Haagse Bos', 'Binckhorst', 'De Reef']

for row in rows_to_update:
    row_index = rows_with_x[rows_with_x['neighbourhood'] == row].index[0]
    unaccounted_value = rows_with_x.loc[row_index, 'unaccounted_for_people']
    rows_with_x.loc[row_index] = rows_with_x.loc[row_index].replace('x', int(unaccounted_value))

rows_with_x.loc[rows_with_x['neighbourhood'].isin(rows_to_update)]

Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)],accounted_for_people,unaccounted_for_people
45,Haagse Bos,460.0,8,3,9,153,92,116,79,0.017,3.0,0.02,0.333,0.2,0.252,0.172,457.0,3.0
52,Binckhorst,2058.0,127,67,36,1446,311,70,1,0.062,0.033,0.017,0.703,0.151,0.034,1.0,2057.0,1.0
102,De Reef,744.0,75,109,34,390,105,28,3,0.101,0.147,0.046,0.524,0.141,0.038,3.0,741.0,3.0


In [68]:
x_counts = rows_with_x.iloc[:, 2:9].apply(lambda col: col.astype(str).str.count('x')).sum(axis=1)
x_counts_dict = dict(zip(rows_with_x["neighbourhood"], x_counts))
print(x_counts_dict)

{'Zuiderpark': 4, 'Haagse Bos': 0, 'Binckhorst': 0, 'Bosjes van Pex': 0, 'Kerketuinen/Zichtenburg': 4, 'De Reef': 0, 'Vlietzoom-Oost': 2, 'De Rivieren': 4}


In [69]:
# Convert columns 2 to 8 to numeric, coercing errors to NaN, and then sum them row-wise
rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)

# Subtract column 1 ("count") from the sum to calculate "unaccounted_for_people"
rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]

# Display the updated DataFrame
rows_with_x.head(50)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]


Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)],accounted_for_people,unaccounted_for_people
32,Zuiderpark,99.0,x,x,x,x,20,49,26,x,x,x,x,0.202,0.495,0.263,95.0,4.0
45,Haagse Bos,460.0,8,3,9,153,92,116,79,0.017,3,0.02,0.333,0.2,0.252,0.172,460.0,0.0
52,Binckhorst,2058.0,127,67,36,1446,311,70,1,0.062,0.033,0.017,0.703,0.151,0.034,1,2058.0,0.0
73,Bosjes van Pex,386.0,9,35,22,82,134,84,20,0.023,0.091,0.057,0.212,0.347,0.218,0.052,386.0,0.0
77,Kerketuinen/Zichtenburg,156.0,x,x,x,80,69,5,x,x,x,x,0.513,0.442,0.032,x,154.0,2.0
102,De Reef,744.0,75,109,34,390,105,28,3,0.101,0.147,0.046,0.524,0.141,0.038,3,744.0,0.0
108,Vlietzoom-Oost,102.0,x,7,x,22,29,30,7,x,0.069,x,0.216,0.284,0.294,0.069,95.0,7.0
109,De Rivieren,40.0,x,5,x,12,16,x,x,x,0.125,x,0.3,0.4,x,x,33.0,7.0


In [70]:
rows_to_update = ['Zuiderpark', 'Kerketuinen/Zichtenburg', 'Vlietzoom-Oost', 'De Rivieren']

for row in rows_to_update:
    row_index = rows_with_x[rows_with_x['neighbourhood'] == row].index[0]
    unaccounted_value = rows_with_x.loc[row_index, 'unaccounted_for_people']
    
    if row == 'Zuiderpark':
        # Split 4 unaccounted people into 2 for 15-year-old and 2 for 20-year-old
        rows_with_x.loc[row_index, '15 to 19-year-olds [persons]'] = 2
        rows_with_x.loc[row_index, '20 to 44-year-olds [persons]'] = 2
    elif row == 'Kerketuinen/Zichtenburg':
        # Split 2 unaccounted people into 1 for 80-year-old and 1 for 15-year-old
        rows_with_x.loc[row_index, '80-year-olds and above [persons]'] = 1
        rows_with_x.loc[row_index, '15 to 19-year-olds [persons]'] = 1
    elif row == 'Vlietzoom-Oost':
        # Split 2 unaccounted people into 1 for 80-year-old and 1 for 15-year-old
        rows_with_x.loc[row_index, '0 to 4-year-olds [persons]'] = 3
        rows_with_x.loc[row_index, '15 to 19-year-olds [persons]'] = 4
    elif row == 'De Rivieren':
        # Split 2 unaccounted people into 1 for 80-year-old and 1 for 15-year-old
        rows_with_x.loc[row_index, '80-year-olds and above [persons]'] = 1
        rows_with_x.loc[row_index, '65 to 79-year-olds [persons]'] = 2
        rows_with_x.loc[row_index, '0 to 4-year-olds [persons]'] = 2
        rows_with_x.loc[row_index, '15 to 19-year-olds [persons]'] = 2
    else:
        # Replace 'x' with the unaccounted value for other rows
        rows_with_x.loc[row_index] = rows_with_x.loc[row_index].replace('x', int(unaccounted_value))

# Filter rows for the updated neighborhoods
updated_rows = rows_with_x.loc[rows_with_x['neighbourhood'].isin(rows_to_update)]

In [71]:
x_counts = rows_with_x.iloc[:, 2:9].apply(lambda col: col.astype(str).str.count('x')).sum(axis=1)
x_counts_dict = dict(zip(rows_with_x["neighbourhood"], x_counts))
print(x_counts_dict)

{'Zuiderpark': 2, 'Haagse Bos': 0, 'Binckhorst': 0, 'Bosjes van Pex': 0, 'Kerketuinen/Zichtenburg': 2, 'De Reef': 0, 'Vlietzoom-Oost': 0, 'De Rivieren': 0}


In [72]:
# Convert columns 2 to 8 to numeric, coercing errors to NaN, and then sum them row-wise
rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)

# Subtract column 1 ("count") from the sum to calculate "unaccounted_for_people"
rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]

# Display the updated DataFrame
rows_with_x.head(50)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["accounted_for_people"] = rows_with_x.iloc[:, 2:9].apply(pd.to_numeric, errors='coerce').sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x["unaccounted_for_people"] = rows_with_x["count"] - rows_with_x["accounted_for_people"]


Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)],accounted_for_people,unaccounted_for_people
32,Zuiderpark,99.0,x,x,2,2,20,49,26,x,x,x,x,0.202,0.495,0.263,99.0,0.0
45,Haagse Bos,460.0,8,3,9,153,92,116,79,0.017,3,0.02,0.333,0.2,0.252,0.172,460.0,0.0
52,Binckhorst,2058.0,127,67,36,1446,311,70,1,0.062,0.033,0.017,0.703,0.151,0.034,1,2058.0,0.0
73,Bosjes van Pex,386.0,9,35,22,82,134,84,20,0.023,0.091,0.057,0.212,0.347,0.218,0.052,386.0,0.0
77,Kerketuinen/Zichtenburg,156.0,x,x,1,80,69,5,1,x,x,x,0.513,0.442,0.032,x,156.0,0.0
102,De Reef,744.0,75,109,34,390,105,28,3,0.101,0.147,0.046,0.524,0.141,0.038,3,744.0,0.0
108,Vlietzoom-Oost,102.0,3,7,4,22,29,30,7,x,0.069,x,0.216,0.284,0.294,0.069,102.0,0.0
109,De Rivieren,40.0,2,5,2,12,16,2,1,x,0.125,x,0.3,0.4,x,x,40.0,0.0


In [73]:
# Replace all instances of 'x' with 0 in columns 2-9 (index 2 to 9 inclusive)
rows_with_x.iloc[:, 2:10] = rows_with_x.iloc[:, 2:10].replace('x', 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_x.iloc[:, 2:10] = rows_with_x.iloc[:, 2:10].replace('x', 0)
  rows_with_x.iloc[:, 2:10] = rows_with_x.iloc[:, 2:10].replace('x', 0)


In [74]:
# Display the updated DataFrame
rows_with_x.head(50)



Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)],accounted_for_people,unaccounted_for_people
32,Zuiderpark,99.0,0,0,2,2,20,49,26,0.0,x,x,x,0.202,0.495,0.263,99.0,0.0
45,Haagse Bos,460.0,8,3,9,153,92,116,79,0.017,3,0.02,0.333,0.2,0.252,0.172,460.0,0.0
52,Binckhorst,2058.0,127,67,36,1446,311,70,1,0.062,0.033,0.017,0.703,0.151,0.034,1,2058.0,0.0
73,Bosjes van Pex,386.0,9,35,22,82,134,84,20,0.023,0.091,0.057,0.212,0.347,0.218,0.052,386.0,0.0
77,Kerketuinen/Zichtenburg,156.0,0,0,1,80,69,5,1,0.0,x,x,0.513,0.442,0.032,x,156.0,0.0
102,De Reef,744.0,75,109,34,390,105,28,3,0.101,0.147,0.046,0.524,0.141,0.038,3,744.0,0.0
108,Vlietzoom-Oost,102.0,3,7,4,22,29,30,7,0.0,0.069,x,0.216,0.284,0.294,0.069,102.0,0.0
109,De Rivieren,40.0,2,5,2,12,16,2,1,0.0,0.125,x,0.3,0.4,x,x,40.0,0.0


In [76]:
# Drop the last two columns
rows_with_x = rows_with_x.iloc[:, :-2]

# Merge the modified rows back into the original DataFrame
df.update(rows_with_x)

In [78]:
df.head()

Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons],% 0 to 4-year-olds [procent (%)],% 5 to 14-year-olds [procent (%)],% 15 to 19-year-olds [procent (%)],% 20 to 44-year-olds [procent (%)],% 45 to 64-year-olds [procent (%)],% 65 to 79-year-olds [procent (%)],% 80-year-olds and above [procent (%)]
0,Oud Scheveningen,2958.0,148,272,116,1029,831.0,436,126,0.05,0.092,0.039,0.348,0.281,0.147,0.043
1,Vissershaven,4699.0,200,458,220,1526,1276.0,762,257,0.043,0.097,0.047,0.325,0.272,0.162,0.055
2,Scheveningen Badplaats,5682.0,241,528,267,2114,1534.0,789,209,0.042,0.093,0.047,0.372,0.27,0.139,0.037
3,Visserijbuurt,4086.0,189,443,215,1365,1021.0,595,258,0.046,0.108,0.053,0.334,0.25,0.146,0.063
4,v Stolkpark/Schev Bosjes,814.0,27,71,55,190,244.0,169,58,0.033,0.087,0.068,0.233,0.3,0.208,0.071


In [79]:

df = df.iloc[:, :-7]
df.head(50)

Unnamed: 0,neighbourhood,count,0 to 4-year-olds [persons],5 to 14-year-olds [persons],15 to 19-year-olds [persons],20 to 44-year-olds [persons],45 to 64 year-olds [persons],65 to 79-year-olds [persons],80-year-olds and above [persons]
0,Oud Scheveningen,2958.0,148.0,272.0,116.0,1029.0,831.0,436.0,126.0
1,Vissershaven,4699.0,200.0,458.0,220.0,1526.0,1276.0,762.0,257.0
2,Scheveningen Badplaats,5682.0,241.0,528.0,267.0,2114.0,1534.0,789.0,209.0
3,Visserijbuurt,4086.0,189.0,443.0,215.0,1365.0,1021.0,595.0,258.0
4,v Stolkpark/Schev Bosjes,814.0,27.0,71.0,55.0,190.0,244.0,169.0,58.0
5,Waldeck-Zuid,2039.0,59.0,196.0,92.0,475.0,505.0,496.0,216.0
6,Statenkwartier,10481.0,397.0,1253.0,712.0,3079.0,3133.0,1448.0,459.0
7,Geuzenkwartier,4474.0,221.0,455.0,230.0,1445.0,1175.0,732.0,216.0
8,Vogelwijk,5489.0,208.0,893.0,479.0,921.0,1828.0,957.0,203.0
9,Rond de Energiecentrale,6768.0,356.0,535.0,265.0,3346.0,1491.0,642.0,133.0


In [91]:
import pandas as pd
import random

# Set the random seed for reproducibility
random.seed(42)

# Define the age ranges for each group
age_ranges = {
    '0 to 4-year-olds [persons]': (0, 4),
    '5 to 14-year-olds [persons]': (5, 14),
    '15 to 19-year-olds [persons]': (15, 19),
    '20 to 44-year-olds [persons]': (20, 44),
    '45 to 64 year-olds [persons]': (45, 64),
    '65 to 79-year-olds [persons]': (65, 79),
    '80-year-olds and above [persons]': (80, 110)  # Assuming 110 as the upper limit for 80+
}

# Create an empty list to store agent data
agents = pd.read_csv('agents.csv')
# Create a list to store the assigned ages
ages = []

# Iterate through each agent in the agents DataFrame
for _, agent in agents.iterrows():
    neighbourhood = agent['neighbourhood']
    
    # Get the row in `df` corresponding to the agent's neighbourhood
    neighbourhood_row = df[df['neighbourhood'] == neighbourhood]
    
    # Flatten the age distribution for this neighbourhood into a list of age ranges
    age_distribution = []
    for age_group, (min_age, max_age) in age_ranges.items():
        if age_group in neighbourhood_row.columns:
            
            count = int(neighbourhood_row[age_group].values[0])
            age_distribution.extend([age_group] * count)
    
    # Randomly select an age group based on the distribution
    selected_age_group = random.choice(age_distribution)
    min_age, max_age = age_ranges[selected_age_group]
    
    # Assign a random age within the selected age group
    random_age = random.randint(min_age, max_age)
    ages.append(random_age)

# Add the "Age" column to the agents DataFrame
agents['Age'] = ages
agents.head()

# Save the updated agents DataFrame back to a CSV file
#agents.to_csv('agents.csv', index=False)

Unnamed: 0,Individual_ID,neighbourhood,Age
0,473913,Oud Scheveningen,66
1,398117,Oud Scheveningen,2
2,42689,Oud Scheveningen,27
3,57218,Oud Scheveningen,43
4,534804,Oud Scheveningen,13


In [82]:
agents_df.head(10)

In [92]:
agents.to_csv('agents.csv', index=False)