# Combine data

This notebook builds the single dataset needed for the model. First run the scripts that generate the POIs and the densities of the cities of interest. Both live in separate scripts because they are quite time-consuming.

In [290]:
import pandas as pd
import geopandas as gpd

# Read data

## Specific information (depending on the cities)

The cities used in this project are listed in the code below. They were deliberately chosen for their number of inhabitants.

In [291]:
# City groups, bucketed by approximate number of inhabitants.

# 500k inhabitants
lst_Antwerpen = [
    "ANTWERPEN",
]

# 270k inhabitants
lst_Gent = [
    "GENT",
]

# 80k inhabitants, split into a training and a test group.
# (An earlier split used MECHELEN, HASSELT and SINT-NIKLAAS for training
# and AALST and KORTRIJK for testing.)
lst_cities_80ktrain = [
    "MECHELEN",
    "SINT-NIKLAAS",
]
lst_cities_80ktest = [
    "HASSELT",
]

# 40k to 50k inhabitants, labelled "50k"; also split into train and test.
lst_cities_50ktrain = [
    "HEIST-OP-DEN-BERG",
    "LOKEREN",
    "GEEL",
]
lst_cities_50ktest = [
    "DENDERMONDE",
    "BEVEREN",
]


Adapt the information in the code below to the cities of interest. Only this part of the code needs to be changed.

In [292]:
# Selection for this run: the cities to process, the label used in the POI
# input and combined-output file names, and the value written to the
# 'city_size' column.
# NOTE(review): the three values are set independently; keep them in sync.
lst_cities = lst_cities_50ktest
name_file = '50ktest'  # used to read the pois data & save the final_data
tot_inhabitants_city = 50000

## Read general data

Read the MOW data on the trips made during the morning peak between 7h and 8h.

In [293]:
# Load the hourly trip table. The first two lines of the CSV are skipped
# (per the original note they contain free text rather than data).
verplaatsingen_tot_per_u = pd.read_csv(
    'Verplaatsingen_Totaal_uur_spmVlaanderenVersie4.2.2_2017.CSV',
    skiprows=2,
)

# Keep only the two zone columns and the 'Tab:8' column, then give them the
# short names 'H', 'B' and 'Tot' that the rest of the notebook uses.
hb_matrix_7u_8u = verplaatsingen_tot_per_u.loc[:, ["'H", "B", 'Tab:8']].rename(
    columns={"'H": "H", 'Tab:8': 'Tot'}
)

## Read density data

Read the density data generated with the file "Determing_density_per_zone".

In [294]:
# Read the density shapefile of every selected city and stack them into one
# GeoDataFrame. The frames are collected first and concatenated once, which
# avoids re-copying the growing table on every iteration and avoids
# concatenating onto an empty GeoDataFrame.
density_frames = []
lst_city_rows = []  # one city name per row, in the same order as the stacked rows

for name_city in lst_cities:
    data_density_specific_city = gpd.read_file(
        f"Data_density/densities_cities/data_density_{name_city}.shp"
    )
    density_frames.append(data_density_specific_city)
    lst_city_rows.extend([name_city] * len(data_density_specific_city))

if density_frames:
    final_data = pd.concat(density_frames, ignore_index=True)
else:
    # No city selected: keep the original behaviour of an empty table.
    final_data = gpd.GeoDataFrame()

final_data['city'] = lst_city_rows
# A scalar is broadcast to every row, so no helper list is needed.
final_data['city_size'] = tot_inhabitants_city

In [295]:
# Display the stacked density table (notebook cell output).
final_data

Unnamed: 0,ZONENUMMER,percentage,inhabitant,density,count,geometry,city,city_size
0,2467,100.0,804.273060,3954.691868,36,"POLYGON ((105184.260 193832.897, 105175.236 19...",GENT,270000
1,2468,100.0,2717.092743,8733.092072,49,"POLYGON ((105410.629 194259.208, 105429.192 19...",GENT,270000
2,2469,100.0,1147.842147,4772.002629,36,"POLYGON ((105094.151 193691.739, 105073.654 19...",GENT,270000
3,2470,100.0,879.090943,8459.200304,21,"POLYGON ((104839.808 194650.326, 104838.648 19...",GENT,270000
4,2471,100.0,2332.238767,6067.689094,45,"POLYGON ((104839.808 194650.326, 104771.356 19...",GENT,270000
...,...,...,...,...,...,...,...,...
111,2578,100.0,185.933613,385.275689,18,"POLYGON ((108920.378 204498.146, 108903.619 20...",GENT,270000
112,2579,100.0,3.788993,5.448700,3,"POLYGON ((106895.175 201032.485, 106837.165 20...",GENT,270000
113,2580,100.0,0.000000,0.000000,1,"POLYGON ((109489.596 205856.732, 107486.280 20...",GENT,270000
114,2581,100.0,2.185371,0.991735,3,"POLYGON ((107486.280 205189.986, 109489.596 20...",GENT,270000


In [296]:
# The 'percentage' and 'count' columns are not used further on, so drop them.
final_data = final_data.drop(['count', 'percentage'], axis='columns')

In [297]:
# Display the table after the unused columns were dropped.
final_data

Unnamed: 0,ZONENUMMER,inhabitant,density,geometry,city,city_size
0,2467,804.273060,3954.691868,"POLYGON ((105184.260 193832.897, 105175.236 19...",GENT,270000
1,2468,2717.092743,8733.092072,"POLYGON ((105410.629 194259.208, 105429.192 19...",GENT,270000
2,2469,1147.842147,4772.002629,"POLYGON ((105094.151 193691.739, 105073.654 19...",GENT,270000
3,2470,879.090943,8459.200304,"POLYGON ((104839.808 194650.326, 104838.648 19...",GENT,270000
4,2471,2332.238767,6067.689094,"POLYGON ((104839.808 194650.326, 104771.356 19...",GENT,270000
...,...,...,...,...,...,...
111,2578,185.933613,385.275689,"POLYGON ((108920.378 204498.146, 108903.619 20...",GENT,270000
112,2579,3.788993,5.448700,"POLYGON ((106895.175 201032.485, 106837.165 20...",GENT,270000
113,2580,0.000000,0.000000,"POLYGON ((109489.596 205856.732, 107486.280 20...",GENT,270000
114,2581,2.185371,0.991735,"POLYGON ((107486.280 205189.986, 109489.596 20...",GENT,270000


# Calculate production and attraction per zone

The production and attraction of each zone are calculated and added to final_data.

In [298]:
# Zone numbers in row order of final_data.
zone_numbers = final_data['ZONENUMMER'].tolist()

In [299]:
# Per-zone totals; filled further down in the same order as zone_numbers.
productions = []
attractions = []

def calculate_production(zone_nr):
    """Return the summed 'Tot' of all trip rows whose 'H' equals ``zone_nr``.

    ``zone_nr`` is cast with ``int`` before the comparison. The function
    reads the module-level ``hb_matrix_7u_8u`` table. When no row matches,
    the sum over the empty selection (0) is returned.
    """
    is_zone_row = hb_matrix_7u_8u["H"] == int(zone_nr)
    return hb_matrix_7u_8u.loc[is_zone_row, 'Tot'].sum()

def calculate_attraction(zone_nr):
    """Return the summed 'Tot' of all trip rows whose 'B' equals ``zone_nr``.

    ``zone_nr`` is cast with ``int`` before the comparison. The function
    reads the module-level ``hb_matrix_7u_8u`` table. When no row matches,
    the sum over the empty selection (0) is returned.
    """
    is_zone_row = hb_matrix_7u_8u["B"] == int(zone_nr)
    return hb_matrix_7u_8u.loc[is_zone_row, 'Tot'].sum()

# Compute both totals for every zone, keeping the order of zone_numbers so
# the lists line up with the rows the zone numbers were taken from.
productions.extend(map(calculate_production, zone_numbers))
attractions.extend(map(calculate_attraction, zone_numbers))

In [300]:
# Attach the per-zone totals as two new columns (production first).
final_data = final_data.assign(production=productions, attraction=attractions)

In [301]:
# Display the table including the production and attraction columns.
final_data

Unnamed: 0,ZONENUMMER,inhabitant,density,geometry,city,city_size,production,attraction
0,2467,804.273060,3954.691868,"POLYGON ((105184.260 193832.897, 105175.236 19...",GENT,270000,252.865,965.871
1,2468,2717.092743,8733.092072,"POLYGON ((105410.629 194259.208, 105429.192 19...",GENT,270000,770.022,1595.021
2,2469,1147.842147,4772.002629,"POLYGON ((105094.151 193691.739, 105073.654 19...",GENT,270000,531.773,2015.771
3,2470,879.090943,8459.200304,"POLYGON ((104839.808 194650.326, 104838.648 19...",GENT,270000,245.520,170.511
4,2471,2332.238767,6067.689094,"POLYGON ((104839.808 194650.326, 104771.356 19...",GENT,270000,500.952,1263.949
...,...,...,...,...,...,...,...,...
111,2578,185.933613,385.275689,"POLYGON ((108920.378 204498.146, 108903.619 20...",GENT,270000,9.974,39.974
112,2579,3.788993,5.448700,"POLYGON ((106895.175 201032.485, 106837.165 20...",GENT,270000,15.816,4.816
113,2580,0.000000,0.000000,"POLYGON ((109489.596 205856.732, 107486.280 20...",GENT,270000,1.000,1.000
114,2581,2.185371,0.991735,"POLYGON ((107486.280 205189.986, 109489.596 20...",GENT,270000,1.000,0.000


# POIs

Read the data of the Points of Interest (POIs).

In [302]:
# The Antwerpen POIs are stored as a shapefile; every other dataset is a CSV
# whose file name ends in name_file.
if name_file == 'Antwerpen':
    data_pois = gpd.read_file("Data_POI/Antwerpen/POIs_categorized.shp")
else:
    data_pois = pd.read_csv(f"Data_POI/pois_categorized_{name_file}.csv")


  data_pois = pd.read_csv("Data_POI/pois_categorized_" + name_file + '.csv')


In [303]:
# Display the POI table as read from disk.
data_pois

Unnamed: 0.1,Unnamed: 0,osmid,geometry,landuse,building,amenity,leisure,shop,office,sport,...,Health,Leisure,Shops,Services,Industry,Catering_industry,Tourism,Others,Leisure_area,classification
0,8831,27889030,POINT (3.746847595091478 51.06193496105809),,yes,school,,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,School
1,8835,27889053,POINT (3.756295270495231 51.06354952109326),,yes,,,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,School
2,8841,34067274,POINT (3.701424719420178 51.03140460288225),,yes,,,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,477.751337,0.0,0.0,0.000000,School
3,8842,40852759,POINT (3.710820822690929 51.03609815014397),,train_station,,,,,,...,1186.432354,0.0000,1186.432354,1186.432354,0.0,118.643235,0.0,0.0,0.000000,Health
4,8844,41335827,POINT (3.682950650728885 51.04696424690124),,sports_centre,,sports_centre,,,tennis,...,0.000000,2649.1928,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,Leisure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90012,155312,7401264,POINT (3.764147543785821 51.09008448244784),,,,sports_centre,,,soccer,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,20985.623481,Leisure_area
90013,155325,8306622,POINT (3.745926045849876 51.06621525465082),,,,park,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,51255.129493,Leisure_area
90014,155385,9874016,POINT (3.699867246477566 51.09288515413256),,,,park,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,165424.474509,Leisure_area
90015,155460,12728994,POINT (3.724431402323066 51.02705502186826),,,,park,,,,...,0.000000,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,26002.570602,Leisure_area


In [304]:
# POIs read from CSV hold their geometry as WKT text, so parse it and wrap
# the table in a GeoDataFrame. The Antwerpen data comes from a shapefile and
# is therefore skipped here.
from shapely.wkt import loads
if name_file != 'Antwerpen':
    parsed_geometries = data_pois['geometry'].map(loads)  # WKT text -> Shapely geometry objects
    data_pois['geometry'] = parsed_geometries
    data_pois = gpd.GeoDataFrame(
        data_pois,
        geometry='geometry',
        crs='EPSG:4326',  # EPSG:4326 is WGS 84
    )

In [305]:
# Show the coordinate reference system of the POI data.
print(data_pois.crs)

EPSG:4326


In [306]:
# Show the coordinate reference system of the zone data, for comparison with the POIs.
print(final_data.crs)

EPSG:31370


In [307]:
# Reproject the POIs into the CRS of the zone polygons so that the spatial
# test compares coordinates in the same system. The target is taken from
# final_data instead of being hard-coded. EPSG:31370 (the value used so far)
# stays as the fallback when final_data carries no CRS.
target_crs = final_data.crs if final_data.crs is not None else 'EPSG:31370'
data_pois = data_pois.to_crs(target_crs)
print(data_pois.crs)

EPSG:31370


In [308]:
# Sum the numeric POI columns per zone and append the sums to final_data.

# One Series of column sums per zone, in row order of final_data.
total_sums = []

# Iterate over the zone polygons directly instead of indexing rows by position.
for zone_geometry in final_data.geometry:
    # Boolean mask over the POIs: True where the POI lies within this zone.
    # "POI within zone" is the same predicate as "zone contains POI".
    mask = data_pois.geometry.within(zone_geometry)
    zone_i_points = data_pois[mask]

    # Column-wise sum of the numeric columns for the POIs in this zone.
    total_sums_POIs = zone_i_points.sum(numeric_only=True)
    total_sums.append(total_sums_POIs)

# Convert total_sums to a DataFrame (one row per zone).
total_sums_df = pd.DataFrame(total_sums)

# pd.concat(axis=1) aligns rows on index labels. Reuse the labels of
# final_data so the sums stay attached to the right zone even when its
# index is not 0..n-1.
total_sums_df.index = final_data.index

# Combine the two dataframes side by side.
final_data = pd.concat([final_data, total_sums_df], axis=1)

In [309]:
# List the columns present after the POI sums were appended.
final_data.columns

Index(['ZONENUMMER', 'inhabitant', 'density', 'geometry', 'city', 'city_size',
       'production', 'attraction', 'Unnamed: 0', 'osmid', 'building:levels',
       'polygon_area', 'office_outer', 'large_residential',
       'small_residential', 'School', 'Health', 'Leisure', 'Shops', 'Services',
       'Industry', 'Catering_industry', 'Tourism', 'Others', 'Leisure_area'],
      dtype='object')

The categorized POIs contain a lot of columns that will not be used, so drop these columns. The columns differ between cities, so only the ones actually present in the dataframe are dropped.


In [310]:
# Helper columns coming from the POI table that are not needed in the final
# dataset. Which of them exist depends on the city, so missing names are
# ignored rather than checked one by one.
columns_to_delete = [
    'Unnamed: 0',
    'osmid',
    'polygon_area',
    'building:levels',
    'shop_outer',
    'office_outer',
    'tourism_inner_poly',
    'office_inner_poly',
    'sport_inner_poly',
]

# errors='ignore' drops only the labels that exist, in a single pass.
final_data = final_data.drop(columns=columns_to_delete, errors='ignore')

In [311]:
# Display the table after the unused POI columns were removed.
final_data

Unnamed: 0,ZONENUMMER,inhabitant,density,geometry,city,city_size,production,attraction,large_residential,small_residential,School,Health,Leisure,Shops,Services,Industry,Catering_industry,Tourism,Others,Leisure_area
0,2467,804.273060,3954.691868,"POLYGON ((105184.260 193832.897, 105175.236 19...",GENT,270000,252.865,965.871,3682.170215,50511.095518,7132.437994,166.118609,3909.543358,16250.966143,14390.828625,0.000000,17049.292734,6532.082039,10895.960539,3465.859923
1,2468,2717.092743,8733.092072,"POLYGON ((105410.629 194259.208, 105429.192 19...",GENT,270000,770.022,1595.021,7514.938419,117408.020203,13887.516715,598.349485,2944.023268,9283.314743,20233.002534,1987.422401,8153.527634,3275.916877,1435.731971,19413.533645
2,2469,1147.842147,4772.002629,"POLYGON ((105094.151 193691.739, 105073.654 19...",GENT,270000,531.773,2015.771,24719.747133,38110.237909,7342.780263,221.602732,4094.486782,35844.719439,20704.937687,0.000000,10505.158643,3981.695294,500.949986,0.000000
3,2470,879.090943,8459.200304,"POLYGON ((104839.808 194650.326, 104838.648 19...",GENT,270000,245.520,170.511,7645.839036,35345.222585,0.000000,346.997186,92.178082,1662.331753,1498.794963,0.000000,7764.661214,4133.462087,0.000000,1879.121492
4,2471,2332.238767,6067.689094,"POLYGON ((104839.808 194650.326, 104771.356 19...",GENT,270000,500.952,1263.949,16358.827533,44831.573405,4927.202098,39243.558808,4388.634171,2738.776977,11399.718562,7477.079249,704.467549,3080.305046,6771.232255,48218.319956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,2578,185.933613,385.275689,"POLYGON ((108920.378 204498.146, 108903.619 20...",GENT,270000,9.974,39.974,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
112,2579,3.788993,5.448700,"POLYGON ((106895.175 201032.485, 106837.165 20...",GENT,270000,15.816,4.816,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
113,2580,0.000000,0.000000,"POLYGON ((109489.596 205856.732, 107486.280 20...",GENT,270000,1.000,1.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
114,2581,2.185371,0.991735,"POLYGON ((107486.280 205189.986, 109489.596 20...",GENT,270000,1.000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Save the file

In [312]:
# Write the combined table as a shapefile named after name_file.
# NOTE(review): the column names read back from these files further down are
# cut to 10 characters (e.g. 'large_resi', 'Catering_i', 'Leisure_ar').
output_path = f'Data_combined/combined_data_{name_file}.shp'
final_data.to_file(output_path)

  final_data.to_file('Data_combined/combined_data_' + name_file + '.shp')


# Combine the data in a dataset ALL_train, ALL_test

In [313]:
def _read_combined(dataset_name):
    """Read the combined shapefile that was saved under ``dataset_name``."""
    return gpd.read_file(f'Data_combined/combined_data_{dataset_name}.shp')


# Load the per-dataset shapefiles written earlier.
final_Antwerpen = _read_combined('Antwerpen')
final_Gent = _read_combined('Gent')
final_50ktrain = _read_combined('50ktrain')
final_50ktest = _read_combined('50ktest')
final_80ktrain = _read_combined('80ktrain')
final_80ktest = _read_combined('80ktest')

# Stack the training datasets and the test datasets, renumbering the rows.
train_parts = [final_Antwerpen, final_50ktrain, final_80ktrain]
test_parts = [final_Gent, final_50ktest, final_80ktest]
combined_train = gpd.GeoDataFrame(pd.concat(train_parts, ignore_index=True))
combined_test = gpd.GeoDataFrame(pd.concat(test_parts, ignore_index=True))

# Save both stacked tables as new shapefiles.
combined_train.to_file('Data_combined/combined_data_ALL_TRAIN.shp')
combined_test.to_file('Data_combined/combined_data_ALL_TEST.shp')

In [325]:
# Display the stacked test set.
combined_test

Unnamed: 0,ZONENUMMER,inhabitant,density,city,city_size,production,attraction,large_resi,small_resi,School,Health,Leisure,Shops,Services,Industry,Catering_i,Tourism,Others,Leisure_ar,geometry
0,2467,804.273060,3954.691868,GENT,270000,252.865,965.871,3682.170215,50511.095518,7132.437994,166.118609,3909.543358,16250.966143,14390.828625,0.000000,17049.292734,6532.082039,10895.960539,3465.859923,"POLYGON ((105184.260 193832.897, 105175.236 19..."
1,2468,2717.092743,8733.092072,GENT,270000,770.022,1595.021,7514.938419,117408.020203,13887.516715,598.349485,2944.023268,9283.314743,20233.002534,1987.422401,8153.527634,3275.916877,1435.731971,19413.533645,"POLYGON ((105410.629 194259.208, 105429.192 19..."
2,2469,1147.842147,4772.002629,GENT,270000,531.773,2015.771,24719.747133,38110.237909,7342.780263,221.602732,4094.486782,35844.719439,20704.937687,0.000000,10505.158643,3981.695294,500.949986,0.000000,"POLYGON ((105094.151 193691.739, 105073.654 19..."
3,2470,879.090943,8459.200304,GENT,270000,245.520,170.511,7645.839036,35345.222585,0.000000,346.997186,92.178082,1662.331753,1498.794963,0.000000,7764.661214,4133.462087,0.000000,1879.121492,"POLYGON ((104839.808 194650.326, 104838.648 19..."
4,2471,2332.238767,6067.689094,GENT,270000,500.952,1263.949,16358.827533,44831.573405,4927.202098,39243.558808,4388.634171,2738.776977,11399.718562,7477.079249,704.467549,3080.305046,6771.232255,48218.319956,"POLYGON ((104839.808 194650.326, 104771.356 19..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,2955,1264.834969,364.727400,HASSELT,80000,227.721,112.721,0.000000,95482.962751,2177.700542,344.657889,0.000000,1881.062980,1993.835545,1036.607526,1450.929891,516.814203,385.950963,88.491147,"POLYGON ((213412.304 185139.481, 213495.452 18..."
213,2956,675.347395,234.058153,HASSELT,80000,136.546,44.547,0.000000,53941.331644,0.000000,113.632006,157.315500,327.571476,0.000000,2199.457617,497.034163,318.288368,0.000000,5719.541643,"POLYGON ((214788.952 184142.221, 214765.361 18..."
214,2957,4355.259424,573.723158,HASSELT,80000,858.434,635.436,404.847767,316112.984379,2498.764049,5661.221221,2258.934691,6533.974771,6560.999464,10259.275258,2713.056891,86.358369,863.211651,38652.221544,"POLYGON ((213486.041 181619.291, 213480.885 18..."
215,2958,1969.782185,412.835340,HASSELT,80000,403.719,207.721,914.923461,162645.965254,2033.345536,203.181213,4035.605943,5619.516937,3299.455574,9596.975704,212.136269,259.695882,555.953031,23288.098803,"POLYGON ((209721.563 182171.548, 209729.427 18..."
