In [2]:
import requests
import json
import os
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import split, col
from traitlets.config import Config

import ConnectionConfig as cc  # Your custom module

Dynamically set JAVA_HOME: /Users/user/Library/Java/JavaVirtualMachines/temurin-21.0.2/Contents/Home


In [3]:
# Set connection profile
cc.set_connectionProfile("default")
cc.setupEnvironment()
cc.listEnvironment()
# Initialize Spark session



HOMEBREW_PREFIX: /opt/homebrew
COMMAND_MODE: unix2003
INFOPATH: /opt/homebrew/share/info:
SHELL: /bin/zsh
PYTHONPATH: /Users/user/Desktop/data4_project_group5
__CFBundleIdentifier: com.jetbrains.pycharm
TMPDIR: /var/folders/k_/tkt88xx94n17f7_nvrzjrkwc0000gn/T/
LC_ALL: en_US.UTF-8
HOME: /Users/user
HOMEBREW_REPOSITORY: /opt/homebrew
PATH: /Users/user/Desktop/data4_project_group5/myenv/bin:/Users/user/Library/Java/JavaVirtualMachines/temurin-21.0.2/Contents/Home/bin:/opt/homebrew/opt/python@3.11/bin:/opt/homebrew/opt/python@3.11/bin:/opt/homebrew/bin:/opt/homebrew/opt/python@3.11/bin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Applications/VMware Fusion.app/Contents/Public:/usr/local/go/bin:/Users/us

In [35]:
spark = cc.startLocalCluster("DIM_WEATHER",4)
spark.getActiveSession()
# Folder to store weather responses
WEATHER_FOLDER = "weather"
if not os.path.exists(WEATHER_FOLDER):
    os.makedirs(WEATHER_FOLDER)

In [5]:

stations_df = spark.read \
    .format("jdbc") \
    .option("driver" , cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "stations") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "stationid") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 20) \
    .load()

In [6]:
print("Raw stations_df schema:")
stations_df.printSchema()

# Process stations data (cast lat and lon to float)
print("Processing stations data")
stations_df = stations_df.select(
    col("stationid"),
    col("objectid"),
    col("stationnr"),
    col("type"),
    col("street"),
    col("number"),
    col("zipcode"),
    col("district"),
    col("gpscoord"),
    col("additionalinfo"),
    col("labelid"),
    col("cityid")
).distinct()


Raw stations_df schema:
root
 |-- stationid: integer (nullable = true)
 |-- objectid: string (nullable = true)
 |-- stationnr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- district: string (nullable = true)
 |-- gpscoord: string (nullable = true)
 |-- additionalinfo: string (nullable = true)
 |-- labelid: integer (nullable = true)
 |-- cityid: integer (nullable = true)

Processing stations data


In [32]:
print("Displaying first 5 rows of stations_df")
stations_df.show(5)

print("Converting stations DataFrame to dictionary")
# Convert DataFrame to dictionary, NEEDS TO BE DICTIONARY FOR THE JSON 
# extracting lat & lon from gpscoord,
stations = {}
for row in stations_df.collect():
    gpscoord = row["gpscoord"] 
    lat, lon = map(float, gpscoord.strip("()").split(","))
    
    # Build dictionary entry
    stations[row["zipcode"]] = {
        "lat": lat,
        "lon": lon,
        "name": row["street"],
        "city": row["district"],
        "country": "BE"
    }

print(f"Loaded {len(stations)} unique zip codes with valid coordinates")


Displaying first 5 rows of stations_df
+---------+--------+---------+------------+--------------------+------+-------+---------+-----------------+--------------------+-------+------+
|stationid|objectid|stationnr|        type|              street|number|zipcode| district|         gpscoord|      additionalinfo|labelid|cityid|
+---------+--------+---------+------------+--------------------+------+-------+---------+-----------------+--------------------+-------+------+
|        2|   33203|      019| ENKELZIJDIG|          ONTBREKEND|    12|   2000|ANTWERPEN| (51.219,4.40405)|                    |   NULL|  NULL|
|        3|   33204|      020| ENKELZIJDIG|Groenkerkhofstraa...|     2|   2000|ANTWERPEN|(51.2187,4.40066)| thv Nationalestraat|   NULL|  NULL|
|        4|   33205|      035| ENKELZIJDIG|Cockerillkaai (2000)|      |   2000|ANTWERPEN|(51.2104,4.38772)|                    |   NULL|  NULL|
|        1|   33202|      026|DUBBELZIJDIG|         Meir (2000)|    84|   2000|ANTWERPEN|(51.2182

                                                                                

In [33]:

dim_weather_df = spark.read.format("delta").load("spark-warehouse/dimweather")
dim_weather = {row["weather_id"]: row["weather_condition"] for row in dim_weather_df.collect()}
print(f"Loaded dimWeather with conditions: {dim_weather}")

                                                                                

Loaded dimWeather with conditions: {1: 'Unpleasant', 2: 'Pleasant', 3: 'Neutral', 0: 'Unknown'}


In [9]:

# Weather condition mapping based on dimWeather
def map_weather_condition(weathercode):
    if weathercode >= 50:  # Rain, snow, thunderstorms -> Unpleasant
        return {"id": 1, "main": "Unpleasant", "description": "unpleasant weather", "icon": "10d"}
    elif weathercode <= 1:  # Clear, sunny -> Pleasant
        return {"id": 2, "main": "Pleasant", "description": "pleasant weather", "icon": "01d"}
    elif weathercode in [2, 3]:  # Partly cloudy, cloudy -> Neutral
        return {"id": 3, "main": "Neutral", "description": "neutral weather", "icon": "03d"}
    else:  # Unknown
        return {"id": 0, "main": "Unknown", "description": "unknown weather", "icon": "00d"}

In [43]:

def get_weather_data(lat, lon, timestamp, zipcode):
    url = f"https://api.open-meteo.com/v1/forecast?latitude={lat}&longitude={lon}&current_weather=true&timezone=auto"
    print(f"Fetching weather for ZIP: {zipcode}, Lat: {lat}, Lon: {lon}")
    
    response = requests.get(url)
    
    if response.status_code == 200:
        try:
            data = response.json()
            
            current_data = data.get("current_weather", None)
            
            if current_data:
                # Extracting available data from Open-Meteo API response
                temperature = current_data.get("temperature")
                feels_like = current_data.get("temperature")  # No separate "feels_like" available in Open-Meteo API
                temp_min = temperature - 1  # Assumed
                temp_max = temperature + 1.5  # Assumed
                wind_speed = current_data.get("windspeed")
                wind_deg = current_data.get("winddirection")
                wind_gust = wind_speed + 0.5  # Estimating gusts based on wind speed
                weather_code = current_data.get("weathercode")
                
                # Map weather code to description (basic example)
                weather_description = map_weather_condition(weather_code)

                # Prepare the weather data in the required format
                weather_data = {
                    "zipCode": zipcode,
                    "coord": {"lon": lon, "lat": lat},
                    "weather": [{
                        "id": weather_code,  # Assuming weather code matches your example
                        "main": weather_description["main"],  # Like "Rain"
                        "description": weather_description["description"],  # Like "moderate rain"
                        "icon": weather_description["icon"]  # Placeholder icon
                    }],
                    "base": "stations",
                    "main": {
                        "temp": temperature,  # Celsius
                        "feels_like": feels_like,
                        "temp_min": temp_min,
                        "temp_max": temp_max,
                        "pressure": 1015,  # Placeholder pressure
                        "humidity": 50,  # Placeholder humidity
                        "sea_level": 1015,  # Placeholder
                        "grnd_level": 933  # Placeholder
                    },
                    "visibility": 10000,  # Assuming visibility is 10 km
                    "wind": {
                        "speed": wind_speed,  # Wind speed in km/h
                        "deg": wind_deg,  # Wind direction in degrees
                        "gust": wind_gust  # Estimating gusts based on wind speed
                    },
                    "rain": {"1h": 0},  # Placeholder for rain (no data available)
                    "clouds": {"all": 0},  # Placeholder for clouds (no data available)
                    "dt": int(timestamp.timestamp()),  # Unix timestamp
                    "sys": {
                        "type": 2,
                        "id": 2075663,  # Placeholder
                        "country": "BE",  # Country code placeholder
                        "sunrise": int((timestamp - timedelta(hours=3)).timestamp()),  # Placeholder for sunrise
                        "sunset": int((timestamp + timedelta(hours=9)).timestamp())  # Placeholder for sunset
                    },
                    "timezone": 3600,  # Assuming timezone is UTC+1 for Belgium
                    "id": int(zipcode + "00"),  # Placeholder for unique ID
                    "name": "Zocca",  # Placeholder for location name
                    "cod": 200  # Status code for success
                }
                return weather_data
            else:
                print(f"No 'current_weather' data found in response for {zipcode}.")
                return None
        except KeyError as e:
            print(f"Error accessing data for {zipcode}: {e}")
            return None
    else:
        print(f"Error fetching weather for {zipcode}: {response.status_code}")
        return None


In [44]:


for zipcode, data in stations.items():
    print(f"ZIP: {zipcode}, Data: {data}")

# Generate 3 responses per unique zipcode
base_time = datetime(2025, 3, 18, 12, 0, 0)  # Example timestamp
for zipcode, data in stations.items():
    lat, lon = data.get("lat"), data.get("lon")

    if lat is None or lon is None:
        print(f"Skipping ZIP: {zipcode}, missing or invalid coordinates.")
        continue  # Skip invalid ZIP codes

    lat, lon = data.get("lat"), data.get("lon")  # Extract lat and lon from tuple

    for i in range(3):
        trip_time = base_time + timedelta(hours=i)
        weather_data = get_weather_data(
            lat=lat,
            lon=lon,
            timestamp=trip_time,
            zipcode=zipcode
        )

        if weather_data:
            filename = f"{WEATHER_FOLDER}/weather_{zipcode}_{trip_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(filename, "w") as f:
                json.dump(weather_data, f, indent=4)
            print(f"Saved weather data for {zipcode} at {trip_time} to {filename}")

print(f"All weather responses saved in '{WEATHER_FOLDER}' folder.")

ZIP: 2000, Data: {'lat': 51.2183, 'lon': 4.4029, 'name': 'Schoenmarkt (2000)', 'city': 'ANTWERPEN', 'country': 'BE'}
ZIP: 2018, Data: {'lat': 51.2037, 'lon': 4.42545, 'name': 'Lange Leemstraat (2018)', 'city': 'ANTWERPEN', 'country': 'BE'}
ZIP: 2060, Data: {'lat': 51.23, 'lon': 4.41506, 'name': 'Ellermanstraat (2060)', 'city': 'ANTWERPEN', 'country': 'BE'}
ZIP: 2050, Data: {'lat': 51.2204, 'lon': 4.37858, 'name': 'Blancefloerlaan (2050)', 'city': 'ANTWERPEN', 'country': 'BE'}
ZIP: 2140, Data: {'lat': 51.2103, 'lon': 4.45265, 'name': 'Luitenant Lippenslaan (2140)', 'city': 'BORGERHOUT', 'country': 'BE'}
ZIP: 2600, Data: {'lat': 51.1845, 'lon': 4.44086, 'name': 'Fruithoflaan (2600)', 'city': 'BERCHEM', 'country': 'BE'}
ZIP: 2030, Data: {'lat': 51.2402, 'lon': 4.40841, 'name': 'Mexicostraat (2030)', 'city': 'ANTWERPEN', 'country': 'BE'}
ZIP: 2170, Data: {'lat': 51.2545, 'lon': 4.44178, 'name': 'Frans Adriaenssensstraat (2170)', 'city': 'MERKSEM', 'country': 'BE'}
ZIP: 2100, Data: {'lat': 

In [45]:

def get_station_from_db(zipcode):
    """Fetches latitude and longitude from the database based on the ZIP code."""
    # Query the stations DataFrame to get lat and lon based on the ZIP code
    station_row = stations_df.filter(col("zipcode") == zipcode).first()  # Get the first match
    
    if station_row:
        try:
            # Assuming gpscoord is a string in the format "(lat, lon)"
            gpscoord = station_row["gpscoord"]
            coords = gpscoord.strip("()").split(",")  # Remove parentheses and split by comma
            lat = float(coords[0].strip())  # Convert to float for latitude
            lon = float(coords[1].strip())  # Convert to float for longitude
            return {"lat": lat, "lon": lon}
        except AttributeError:
            print(f"GPS coordinates not found in the correct format for ZIP code {zipcode}.")
            return None
    else:
        print(f"Station with ZIP code {zipcode} not found.")
        return None


In [46]:
def get_weather_for_trip_start(zipcode, trip_start_time):
    """ Retrieves weather data closest to the trip start time for a given postal code """
    
    # Fetch station information (lat/lon) from the database
    station = get_station_from_db(zipcode)
    
    if not station:
        print(f"Station with ZIP code {zipcode} not found in the database.")
        return "unknown weather type"
    
    lat, lon = station["lat"], station["lon"]
    
    # Fetch the weather data for the current time slot closest to trip start time
    weather_data = get_weather_data(lat, lon, trip_start_time, zipcode)

    if weather_data:
        return weather_data["weather"][0]["description"]
    else:
        return "unknown weather type"


In [47]:

# Example usage: linking a trip to the weather condition at the start time
trip_start_time = datetime(2025, 3, 18, 14, 26)  # Example trip start time
zipcode = "2018"  # ZIP code of the station
weather_condition = get_weather_for_trip_start(zipcode, trip_start_time)

print(f"Weather condition at trip start (ZIP: {zipcode}): {weather_condition}")

Fetching weather for ZIP: 2018, Lat: 51.2047, Lon: 4.39625
Weather condition at trip start (ZIP: 2018): pleasant weather
