In [38]:
from geopy.geocoders import Nominatim
from geopy.adapters import AdapterHTTPError
import requests
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_lat_long_for_cityname(city: str):
    """Converts a string of a city name provided into
    lat/long coordinates."""

    geolocator = Nominatim(user_agent="MyApp")

    try:
        location = geolocator.geocode(city)
        lat = location.latitude
        long = location.longitude

        # log the coordinates retrieved
        logger.info(f"Coordinates for {city}: {lat}/{long}")

    # if the coordinates cannot be retrieved log a warning
    except (AttributeError, KeyError, ValueError, AdapterHTTPError) as err:
        logger.warning(
            f"""Coordinates for {city}: could not be retrieved.
            Error: {err}"""
        )
        lat = "NA"
        long = "NA"

    city_coordinates = {"city": city, "lat": lat, "long": long}

    return city_coordinates


def get_zipcode_for_lat_long(lat: float, long: float):
    """Converts lat/long coordinates into a zipcode."""

    geolocator = Nominatim(user_agent="MyApp")

    try:
        location = geolocator.reverse((lat, long), exactly_one=True)
        zipcode = location.raw['address']['postcode']

        # log the zipcode retrieved
        logger.info(f"Zipcode for {lat}/{long}: {zipcode}")

    # if the zipcode cannot be retrieved log a warning
    except (AttributeError, KeyError, ValueError, AdapterHTTPError) as err:
        logger.warning(
            f"""Zipcode for {lat}/{long}: could not be retrieved.
            Error: {err}"""
        )
        zipcode = "NA"

    return  zipcode


In [20]:
import pandas as pd
import numpy as np


data = pd.read_parquet('merged_data.parquet')

In [39]:
import re

# Function to clean stationcode and handle errors
def clean_stationcode(stationcode):
    cleaned_code = re.sub(r"\D", "", stationcode)
    if cleaned_code:  # Ensure the cleaned string is not empty
        return int(cleaned_code)
    else:
        return None  # or any default value you want to use

df =  data.drop_duplicates(subset='stationcode')
df = df.dropna(subset='stationcode')

df['stationcode'] = df['stationcode'].apply(lambda x: clean_stationcode(x))
print(df.info())


station_to_info = {}
for index, row in df.iterrows():
    stationcode = row['stationcode']
    lat = row['latitude']
    long = row['longitude']
    zipcode = get_zipcode_for_lat_long(lat, long)
    station_to_info[stationcode] = {'lat': lat, 'long': long, 'zipcode': zipcode}

<class 'pandas.core.frame.DataFrame'>
Index: 1502 entries, 0 to 42536118
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype                  
---  ------             --------------  -----                  
 0   id                 1470 non-null   float64                
 1   record_timestamp   1502 non-null   datetime64[us, Etc/UTC]
 2   stationcode        1501 non-null   float64                
 3   ebike              1502 non-null   int64                  
 4   mechanical         1502 non-null   int64                  
 5   duedate            1502 non-null   object                 
 6   numbikesavailable  1502 non-null   int64                  
 7   numdocksavailable  1502 non-null   int64                  
 8   capacity           1502 non-null   int64                  
 9   is_renting         1502 non-null   object                 
 10  is_installed       1502 non-null   object                 
 11  is_returning       1502 non-null   object                

INFO:__main__:Zipcode for 48.8685433/2.3600032: 75010
INFO:__main__:Zipcode for 48.870893/2.353521: 75010
INFO:__main__:Zipcode for 48.87397217237368/2.348388757504776: 75010
INFO:__main__:Zipcode for 48.87287120589831/2.3542150855064397: 75010
INFO:__main__:Zipcode for 48.870702423554306/2.358739863900845: 75010
INFO:__main__:Zipcode for 48.872089449907/2.3575825989246: 75010
INFO:__main__:Zipcode for 48.857058739111/2.3417982839439: 75001
INFO:__main__:Zipcode for 48.870948218595/2.3612049221992: 75010
INFO:__main__:Zipcode for 48.868976638826/2.3623438552022: 75010
INFO:__main__:Zipcode for 48.8731167/2.3593052: 75010
INFO:__main__:Zipcode for 48.871044051984/2.3661044619878: 75010
INFO:__main__:Zipcode for 48.871624306962985/2.3639265082738348: 75010
INFO:__main__:Zipcode for 48.874330377560476/2.3621229082345963: 75010
INFO:__main__:Zipcode for 48.875034655883944/2.359801238597928: 75010
INFO:__main__:Zipcode for 48.874575/2.356796: 75010
INFO:__main__:Zipcode for 48.875388227727/

In [11]:
print(df.iloc[0]['latitude'])

get_zipcode_for_lat_long(lat=df.iloc[0]['latitude'], long=df.iloc[0]['longitude'])


48.8685433


INFO:__main__:Zipcode for 48.8685433/2.3600032: 75010


{'lat': np.float64(48.8685433),
 'long': np.float64(2.3600032),
 'zipcode': '75010'}

In [13]:
station_to_info = {}


station_to_info = {}
for index, row in df.iterrows():
    stationcode = row['stationcode']
    lat = row['latitude']
    long = row['longitude']
    zipcode = get_zipcode_for_lat_long(lat, long)
    station_to_info[stationcode] = {'lat': lat, 'long': long, 'zipcode': zipcode}

INFO:__main__:Zipcode for 48.8685433/2.3600032: 75010
INFO:__main__:Zipcode for 48.870893/2.353521: 75010
INFO:__main__:Zipcode for 48.87397217237368/2.348388757504776: 75010
INFO:__main__:Zipcode for 48.87287120589831/2.3542150855064397: 75010
INFO:__main__:Zipcode for 48.870702423554306/2.358739863900845: 75010
INFO:__main__:Zipcode for 48.872089449907/2.3575825989246: 75010
INFO:__main__:Zipcode for 48.857058739111/2.3417982839439: 75001
INFO:__main__:Zipcode for 48.870948218595/2.3612049221992: 75010
INFO:__main__:Zipcode for 48.868976638826/2.3623438552022: 75010
INFO:__main__:Zipcode for 48.8731167/2.3593052: 75010
INFO:__main__:Zipcode for 48.871044051984/2.3661044619878: 75010
INFO:__main__:Zipcode for 48.871624306962985/2.3639265082738348: 75010
INFO:__main__:Zipcode for 48.874330377560476/2.3621229082345963: 75010
INFO:__main__:Zipcode for 48.875034655883944/2.359801238597928: 75010
INFO:__main__:Zipcode for 48.874575/2.356796: 75010
INFO:__main__:Zipcode for 48.875388227727/

In [37]:
station_to_info

{10001.0: {'lat': 48.8685433,
  'long': 2.3600032,
  'zipcode': {'lat': 48.8685433, 'long': 2.3600032, 'zipcode': '75010'}},
 10004.0: {'lat': 48.870893,
  'long': 2.353521,
  'zipcode': {'lat': 48.870893, 'long': 2.353521, 'zipcode': '75010'}},
 10005.0: {'lat': 48.87397217237368,
  'long': 2.348388757504776,
  'zipcode': {'lat': 48.87397217237368,
   'long': 2.348388757504776,
   'zipcode': '75010'}},
 10006.0: {'lat': 48.87287120589831,
  'long': 2.3542150855064397,
  'zipcode': {'lat': 48.87287120589831,
   'long': 2.3542150855064397,
   'zipcode': '75010'}},
 10008.0: {'lat': 48.870702423554306,
  'long': 2.358739863900845,
  'zipcode': {'lat': 48.870702423554306,
   'long': 2.358739863900845,
   'zipcode': '75010'}},
 10009.0: {'lat': 48.872089449907,
  'long': 2.3575825989246,
  'zipcode': {'lat': 48.872089449907,
   'long': 2.3575825989246,
   'zipcode': '75010'}},
 1001.0: {'lat': 48.857058739111,
  'long': 2.3417982839439,
  'zipcode': {'lat': 48.857058739111,
   'long': 2.34

In [34]:
station_to_info_clean = {
    key: {
        'lat': value['lat'], 
        'long': value['long'], 
        'zipcode': value['zipcode']['zipcode']  # Access the nested 'zipcode' value
    }
    for key, value in station_to_info.items()
}

print(station_to_info_clean)

TypeError: tuple indices must be integers or slices, not str

In [26]:


station_to_info_clean = {clean_stationcode(k): v for k, v in station_to_info.items()}

info_df = pd.DataFrame.from_dict(
    station_to_info_clean, orient="index", columns=["latitude", "longitude", "zipcode"])

info_df.head()

Unnamed: 0,latitude,longitude,zipcode
10001.0,48.868543,2.360003,"{'lat': 48.8685433, 'long': 2.3600032, 'zipcod..."
10004.0,48.870893,2.353521,"{'lat': 48.870893, 'long': 2.353521, 'zipcode'..."
10005.0,48.873972,2.348389,"{'lat': 48.87397217237368, 'long': 2.348388757..."
10006.0,48.872871,2.354215,"{'lat': 48.87287120589831, 'long': 2.354215085..."
10008.0,48.870702,2.35874,"{'lat': 48.870702423554306, 'long': 2.35873986..."
