In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import glob, os, re

In [4]:
# Load the full input table from CSV into `df`; later cells read its
# cell_rk / cell_lon / cell_lat columns and add a `district` column to it.
df = pd.read_csv("./data/hackplay_warszawa_full.csv")

In [5]:
# Reduce the table to one row per cell (the first occurrence of each cell_rk
# is kept), holding only the identifier and its coordinates, re-indexed 0..n-1.
cells_unique = df.loc[:, ["cell_rk", "cell_lon", "cell_lat"]].drop_duplicates(
    subset="cell_rk", keep="first", ignore_index=True
)

# Turn every cell into a point geometry (x = cell_lon, y = cell_lat) and tag
# the resulting GeoDataFrame as EPSG:4326.
gdf_cells = gpd.GeoDataFrame(
    cells_unique,
    crs="EPSG:4326",
    geometry=gpd.points_from_xy(
        x=cells_unique["cell_lon"],
        y=cells_unique["cell_lat"],
    ),
)

In [6]:
# Collect the district polygon files; each file's base name (without the
# extension) is used as the district name.
district_files = sorted(glob.glob("./data/dzielnice/*.geojson"))
if not district_files:
    # Without this guard pd.concat below fails with an opaque
    # "No objects to concatenate" error when the folder is empty or the
    # notebook is started from a different working directory.
    raise FileNotFoundError(
        "No district GeoJSON files found under ./data/dzielnice/ "
        "(check the working directory)."
    )

gdf_districts_list = []
for path in district_files:
    district_name = os.path.splitext(os.path.basename(path))[0]
    district_gdf = gpd.read_file(path)
    # Bring every file to EPSG:4326. A file without CRS metadata is only
    # labelled as EPSG:4326 (its coordinates are not transformed), which
    # assumes it already holds lon/lat values -- TODO confirm for such files.
    if district_gdf.crs is None:
        district_gdf = district_gdf.set_crs("EPSG:4326")
    else:
        district_gdf = district_gdf.to_crs("EPSG:4326")
    district_gdf["district"] = district_name
    # Every row of the file carries the same district value, so dissolving
    # merges all of its features into a single row for that district.
    district_gdf = district_gdf.dissolve(by="district", as_index=False)
    gdf_districts_list.append(district_gdf)

# Stack the per-district rows into one GeoDataFrame in EPSG:4326.
gdf_districts = pd.concat(gdf_districts_list, ignore_index=True)
gdf_districts = gpd.GeoDataFrame(gdf_districts, geometry="geometry", crs="EPSG:4326")

In [7]:
# Left spatial join: every cell point is kept and is paired with the district
# polygon it lies within; cells matching no polygon get a missing district.
joined = gdf_cells.sjoin(
    gdf_districts[["district", "geometry"]],
    how="left",
    predicate="within",
)

# Lookup table cell_rk -> district. If a cell_rk appears more than once in the
# join result, the last occurrence is the one that ends up in the dict.
cell_to_district = joined.set_index("cell_rk")["district"].to_dict()

In [8]:
# Attach a district to every row by looking up its cell_rk in the
# cell_to_district mapping; a cell_rk that is not a key of the mapping
# yields NaN in the new column.
df["district"] = df["cell_rk"].map(cell_to_district)


In [11]:
# Sanity check: compare the non-null count of `district` with the total number
# of entries to see whether any row was left without a district.
df['district'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3080 entries, 0 to 3079
Series name: district
Non-Null Count  Dtype 
--------------  ----- 
3080 non-null   object
dtypes: object(1)
memory usage: 24.2+ KB


In [14]:
# Write the table, now including the `district` column, to a new CSV file;
# index=False keeps the row index out of the file.
df.to_csv("./data/hackplay_warszawa_with_districts.csv", index=False)

In [15]:
# Final overview of the frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080 entries, 0 to 3079
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   start_dttm     3079 non-null   object 
 1   user_id        3080 non-null   object 
 2   cell_rk        3080 non-null   int64  
 3   lac            3000 non-null   float64
 4   cid            3080 non-null   float64
 5   technology     3080 non-null   object 
 6   frequency      2986 non-null   float64
 7   cell_lon       3080 non-null   float64
 8   cell_lat       3080 non-null   float64
 9   cos_rk         3080 non-null   int64  
 10  cos_nm         3080 non-null   object 
 11  cos_family_nm  3080 non-null   object 
 12  district       3080 non-null   object 
dtypes: float64(5), int64(2), object(6)
memory usage: 312.9+ KB
