# Imports and loading data

In [25]:
import pandas as pd
import sys
import os

# Make the project root importable so that `src.*` modules resolve.
#
# '..' is resolved against the *current* working directory, and
# ensure_parent_dir() below changes that directory. When this cell is
# re-executed, '..' therefore points one level above the project root.
# Only add the candidate when it really contains `src/` and is not already on
# sys.path; otherwise every re-run would append an unrelated directory (or a
# duplicate entry) to sys.path.
project_root_candidate = os.path.abspath('..')
if (
    os.path.isdir(os.path.join(project_root_candidate, 'src'))
    and project_root_candidate not in sys.path
):
    sys.path.append(project_root_candidate)

# Import the utility function for one-time directory change
from src.utilities.path_utilities import ensure_parent_dir

# Change to parent directory (only happens once even if cell is re-executed).
# NOTE(review): 'notebook_setup' is presumably the key the helper uses to
# remember that the change already happened — confirm in path_utilities.
ensure_parent_dir('notebook_setup')

Directory already changed to: /Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon


'/Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon'

In [26]:
# Project-local helpers: the InPost reader and the function that later merges
# the InPost and Żabka location tables.
from src.data_preparation.read_data import read_inpost_data
from src.feature_engineering.geo_features import combine_location_datasets

# Raw InPost points. The shape / column names / first rows shown in this cell's
# output appear to be printed by this call (pd.read_csv below prints nothing).
inpost_raw = read_inpost_data()
# Raw Żabka shops, read directly from CSV. The path is relative, so it depends
# on the working directory set by the setup cell (presumably the project root).
zabka_raw  = pd.read_csv('data/zabka_shops.csv')

Dataset shape: (32453, 32)

Column names:
['href', 'name', 'type', 'status', 'location', 'location_type', 'location_date', 'location_description', 'location_description_1', 'location_description_2', 'distance', 'opening_hours', 'address', 'address_details', 'phone_number', 'payment_point_descr', 'functions', 'partner_id', 'is_next', 'payment_available', 'payment_type', 'virtual', 'recommended_low_interest_box_machines_list', 'apm_doubled', 'location_247', 'operating_hours_extended', 'agency', 'image_url', 'easy_access_zone', 'air_index_level', 'physical_type_mapped', 'physical_type_description']

First 3 rows:
                                                href     name  \
0  https://api-pl-points.easypack24.net/v1/points...   ADA01M   
1  https://api-pl-points.easypack24.net/v1/points...   ADA01N   
2  https://api-pl-points.easypack24.net/v1/points...  ADAM01N   

              type     status                                       location  \
0  [parcel_locker]  Operating  {'longitud

In [27]:
# Preview the first rows of the raw Żabka table to see which columns are
# available (e.g. openTime/closeTime and lat/lng appear in the output).
zabka_raw.head()

Unnamed: 0,id,slug,openTime,closeTime,city,address,postcode,voivodeship,county,community,region,salesRegion,openTimeSeconds,closeTimeSeconds,lat,lng,services
0,ID06093,"ID06093,gdansk-jabloniowa-29a",06:00,23:00,Gdańsk,Jabłoniowa 29A,80-175,Pomorskie,Gdańsk,GDAŃSK (GMINA MIEJSKA),DS3,PS3.6.3,21600.0,82800.0,54.330567,18.557187,"BIH,DEN,GSM,KPO,LOT,ODP,PAC,RAC,REJ,TER,ZBC"
1,ID03871,"ID03871,gorzow-wielkopolski-obroncow-pokoju-38...",06:00,23:00,Gorzów Wielkopolski,Obrońców Pokoju 38 nr 38 I,66-400,Lubuskie,Gorzów Wielkopolski,GORZÓW WIELKOPOLSKI (GMINA MIE,DS2,PS2.5.3,21600.0,82800.0,52.764806,15.264941,"BIH,DEN,GSM,KPO,LOT,ODP,PAC,RAC,REJ,TER,ZBC"
2,ID06169,"ID06169,ruda-slaska-ul-niedurnego-45-lok-1",06:00,23:00,Ruda Śląska,ul. Niedurnego 45 lok. 1,41-709,Śląskie,Ruda Śląska,RUDA ŚLĄSKA (GMINA MIEJSKA),DS8,PS8.2.2,21600.0,82800.0,50.28414,18.876,"BIH,DEN,GSM,KPO,LOT,ODP,PAC,RAC,REJ,TER,ZBC"
3,ID06264,"ID06264,warszawa-ul-zeganska-18",06:00,23:00,Warszawa,ul. Żegańska 18,04-713,Mazowieckie,Warszawa,"WARSZAWA (GMINA MIEJSKA, MIAST",DS5,PS5.4.1,21600.0,82800.0,52.20562,21.172909,"BIH,DEN,GSM,KPO,LOT,ODP,PAC,RAC,REJ,TER,ZBC"
4,ID05100,"ID05100,wejherowo-dworcowa-2",06:00,23:00,Wejherowo,Dworcowa 2,84-200,Pomorskie,wejherowski,WEJHEROWO (GMINA MIEJSKA),DS3,PS3.2.3,21600.0,82800.0,54.603632,18.228519,"BIH,DEN,GSM,KPO,LOT,ODP,PAC,RAC,REJ,TER,ZBC"


# Combining InPost and Żabka datasets

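We need only the latitude, longitude, and opening hours from each dataset. The utility function `combine_location_datasets` extracts these columns, combines the InPost and Żabka rows into a single table, adds a `source` column marking where each row came from, and saves the result to a CSV file.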

In [28]:
# Options for the merge, kept in one place: tag every row with its origin and
# persist the merged table to CSV as a side effect of the call.
combine_options = dict(
    add_source_column=True,
    save_to_file=True,
    output_path='data/y_raw_before_counting_per_square.csv',
)

# Merge the InPost and Żabka tables into one location dataframe.
# Arguments stay keyword-only because the helper's parameter order is not
# visible from this notebook.
combined_df = combine_location_datasets(
    inpost_raw=inpost_raw,
    zabka_raw=zabka_raw,
    **combine_options,
)

# Preview the merged result
combined_df.head()

Combined dataset saved to 'data/y_raw_before_counting_per_square.csv'


Unnamed: 0,latitude,longitude,opening_hours,source
0,51.73834,22.26405,24/7,inpost
1,51.74455,22.25847,24/7,inpost
2,52.26299,18.08788,24/7,inpost
3,50.25873,22.69906,24/7,inpost
4,52.06923,20.63541,24/7,inpost


In [29]:
# Check for missing values: info() lists the non-null count per column, so any
# column whose count is below the total number of entries contains NaNs.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42212 entries, 0 to 42211
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   latitude       42208 non-null  float64
 1   longitude      42208 non-null  float64
 2   opening_hours  42038 non-null  object 
 3   source         42212 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1.3+ MB


In [30]:
# Headline counts: total rows, plus rows per origin. A boolean mask summed
# gives the number of matching rows without building a filtered dataframe.
source_labels = combined_df['source']
n_total = len(combined_df)
n_inpost = (source_labels == 'inpost').sum()
n_zabka = (source_labels == 'zabka').sum()

print(f"Total number of locations: {n_total}")
print(f"Number of InPost locations: {n_inpost}")
print(f"Number of Żabka locations: {n_zabka}")

# Ten most frequent opening-hours strings. value_counts() skips missing values
# by default, so rows without opening hours are not represented here.
top_opening_hours = combined_df['opening_hours'].value_counts().head(10)
print("\nMost common opening hours:")
print(top_opening_hours)

Total number of locations: 42212
Number of InPost locations: 32453
Number of Żabka locations: 9759

Most common opening hours:
opening_hours
24/7                    27542
06:00-23:00              9219
06:00-22:00               390
PN-PT 10-18               373
PN-SB 08-23               215
6 - 24                    213
PN-PT 09-18               202
PN-SB 10-23               172
PN-PT 10-18 SB 10-14      125
PN-PT 09-17               115
Name: count, dtype: int64
