# Preprocess Data

This notebook preprocesses the raw data: it fixes the encoding and the data structure, and enriches the data with additional properties such as the coordinates of each city.

In [1]:
from pathlib import Path
import park_and_ride_analysis

# Resolve the data directory relative to the imported package: the project
# root is taken to be two levels above the package directory.
module_dir = Path(park_and_ride_analysis.__file__).parent
project_dir = module_dir.parent.parent
data_dir = project_dir.joinpath("data")

# The raw input and the processed output use the same file name and differ
# only in their sub-directory ("raw" vs. "processed").
file_name = "einpendelverflechtungen_darmstadt_2021.csv"
raw_data = data_dir.joinpath("raw", file_name)
processed_data = data_dir.joinpath("processed", file_name)

# Reference point for Darmstadt -- presumably (latitude, longitude); confirm
# against the consumers of this constant.
darmstadt_coords = (49.8728, 8.6512)

In [2]:
import pandas as pd

# Load the raw export. The first 9 and the last 4 lines of the file are
# skipped, and explicit column names are assigned instead.
df = pd.read_csv(
    raw_data,
    sep=';',
    decimal=",",
    encoding='iso-8859-1',
    # `skipfooter` is only supported by the python parser engine.
    engine="python",
    skiprows=9,
    skipfooter=4,
    names=[
        'Wohnort (Code)',
        'Wohnort',
        'Gesamt (Anzahl)',
        'Männlich (Anzahl)',
        'Weiblich (Anzahl)',
        'Anteil (Prozent)',
        'Luftlinienentfernung (km)',
    ],
    # These placeholder strings are read as missing values.
    na_values=["-", ".", "x"],
    # Keep the place code as text so it is not parsed as a number.
    dtype={"Wohnort (Code)": str},
)

In [11]:
import geopy.geocoders
from tqdm import tqdm
import time

# Register tqdm's pandas integration; the `progress_apply` call further down
# depends on it.
tqdm.pandas()

# Nominatim geocoding client shared by all lookups in this notebook; requests
# are identified by the user agent "tpte_project".
geolocator = geopy.geocoders.Nominatim(user_agent="tpte_project")

def get_coordinates(place: str) -> "tuple[float | None, float | None]":
    """Look up the coordinates of a place with the module-level ``geolocator``.

    Only the part of ``place`` before the first comma is used for the query,
    and ", Deutschland" is appended to it.

    Args:
        place: Place name; anything after the first comma is ignored.

    Returns:
        ``(latitude, longitude)`` of the location returned by the geocoder, or
        ``(None, None)`` when the name is missing or blank, nothing was found,
        or the lookup raised an error.
    """
    # Missing values (NaN/None) cannot be geocoded; return before the delay so
    # they do not cost a second each.
    if not isinstance(place, str):
        return (None, None)

    name = place.split(',')[0]
    # A blank name would be sent as the bare query ", Deutschland", which
    # contains no place name at all -- skip it instead of storing its result.
    if not name.strip():
        return (None, None)

    # delay for quota limits
    time.sleep(1)
    try:
        location = geolocator.geocode(f"{name}, Deutschland")
        return (location.latitude, location.longitude) if location else (None, None)
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not abort the whole
        # run, so the error is reported and the row is left without coordinates.
        print(f"Error getting coordinates for {place}: {e}")
        return (None, None)

# Geocoding is subject to quota limits, so only rows whose
# 'Anteil (Prozent)' value is greater than zero are looked up.
share_is_positive = df['Anteil (Prozent)'] > 0.0
filtered_df = df.loc[share_is_positive]

# Look up each selected place (with a progress bar) and write the results back
# to the matching rows; all other rows get no value in 'coordinates'.
coordinates = filtered_df['Wohnort'].progress_apply(get_coordinates)
df.loc[coordinates.index, 'coordinates'] = coordinates

100%|██████████| 167/167 [03:03<00:00,  1.10s/it]


In [12]:
# Create the output directory first: `to_csv` does not create missing
# directories, so the export would fail wherever data/processed does not
# exist yet.
processed_data.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_data, index=False)