# Getting Weather Data for Paris 
The API we are using (https://www.infoclimat.fr/opendata/) only allows 7 consecutive days of data to be requested per call. The data has a time step of 10 minutes; to be consistent with our other data, we need to resample it to a time step of 1 hour.

In [43]:
from datetime import datetime, timedelta
import requests
import pandas as pd
from io import StringIO

In [44]:
# Date range to download
start_date, end_date = datetime(2020, 1, 1), datetime(2024, 9, 24)

# Whole days spanned by the range
num_days = (end_date - start_date).days

# The API serves at most 7 days per request, so days / 7 indicates how many
# requests the download will take
print(num_days / 7, "api calls need to be done")

246.85714285714286 api calls need to be done


In [45]:
# Get the public IP address of this machine
def get_public_ip(timeout=10):
    """Return the caller's public IP address as reported by api.ipify.org.

    Args:
        timeout: Seconds to wait for the service before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body of the service as text.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the service answers with an HTTP error
            status (instead of returning an error page as if it were an IP).
    """
    response = requests.get('https://api.ipify.org', timeout=timeout)
    response.raise_for_status()
    return response.text

print(get_public_ip())

193.54.23.143


In [46]:
import os

# Define the column names
column_names = ['station_id', 'dh_utc', 'temperature', 'pression', 'humidite', 'point_de_rosee', 'vent_moyen', 'vent_rafales', 'vent_direction', 'pluie_3h', 'pluie_1h']

# Initialize an empty DataFrame to store all the data
weather_data = pd.DataFrame()

# Define the API key.
# NOTE(review): a credential is committed in this notebook. It can now be
# overridden through the INFOCLIMAT_API_KEY environment variable; the literal
# is kept only as a fallback so existing runs keep working, and should be
# rotated and removed.
api_key = os.environ.get('INFOCLIMAT_API_KEY', 'DSpLixyOur5fOZ3c8RQVZu6GJFCKZL4tAF6HWv6NenMJDUuFAiFg')

# Collect each 7-day chunk here and concatenate once after the loop:
# concatenating inside the loop re-copies all accumulated rows every time.
frames = []

# Loop over the date range in 7-day increments
# NOTE(review): consecutive ranges share their boundary date; if the API
# treats `end` as inclusive, that day is fetched twice — confirm.
current_date = start_date
while current_date < end_date:
    # Calculate the end date for the current 7-day range
    range_end_date = min(current_date + timedelta(days=7), end_date)

    # Format the dates as strings
    start_str = current_date.strftime('%Y-%m-%d')
    end_str = range_end_date.strftime('%Y-%m-%d')

    # Insert the dates into the API URL
    url = f'https://www.infoclimat.fr/opendata/?method=get&format=csv&stations[]=ME099&stations[]=000BV&start={start_str}&end={end_str}&token={api_key}'

    # Make the API request; the timeout prevents one stalled call from
    # blocking the whole download
    response = requests.get(url, timeout=60)

    # Check if the request was successful
    if response.status_code == 200:
        # Read the response content into a DataFrame
        data = pd.read_csv(StringIO(response.text), sep=';', names=column_names, skiprows=6, na_values=';')
        frames.append(data)
    else:
        # Report the gap instead of dropping the range silently
        print(f'Request for {start_str} to {end_str} failed with status {response.status_code}; range skipped')

    # Move to the next date range
    current_date = range_end_date

# Build the main DataFrame in a single pass (stays empty if nothing was fetched)
if frames:
    weather_data = pd.concat(frames)

In [47]:
# Preview the first rows of the downloaded data
weather_data.head()

Unnamed: 0,station_id,dh_utc,temperature,pression,humidite,point_de_rosee,vent_moyen,vent_rafales,vent_direction,pluie_3h,pluie_1h
0,station_id,dh_utc,degC,hPa,%,degC,km/h,km/h,deg,mm,mm
1,000BV,2020-01-01 00:00:00,2,1030,88,0,6.4,,122,,
2,000BV,2020-01-01 00:10:00,2.3,1029.9,89,0.6,6.4,,58,,
3,000BV,2020-01-01 00:20:00,2.1,1029.6,89,0.6,3.2,,23,,
4,000BV,2020-01-01 00:40:00,1.3,1029.6,90,0,6.4,,149,,


In [48]:
# Columns that are not used in the rest of the notebook
unused_columns = ['station_id', 'pression', 'point_de_rosee', 'pluie_3h', 'vent_rafales']
weather_data = weather_data.drop(columns=unused_columns)

# Header lines show up as data rows whose timestamp cell holds the column
# name itself; filter them out
is_header_row = weather_data['dh_utc'] == 'dh_utc'
weather_data = weather_data[~is_header_row]

# The timestamp column is called 'date' from here on
weather_data = weather_data.rename(columns={'dh_utc': 'date'})

# Missing hourly rainfall is treated as no rain
rain = weather_data['pluie_1h'].fillna(0)

# Parse temperature, rainfall and timestamps (unparseable numbers become NaN),
# then index the frame by timestamp so it can be resampled
weather_data = weather_data.assign(
    temperature=pd.to_numeric(weather_data['temperature'], errors='coerce'),
    pluie_1h=pd.to_numeric(rain, errors='coerce'),
    date=pd.to_datetime(weather_data['date']),
).set_index('date')

In [49]:
# Work on a copy so the cleaned 10-minute data stays untouched
df = weather_data.copy()

# Coerce every measurement column to numeric; non-numeric values become NaN
for column in ['temperature', 'humidite', 'vent_moyen', 'vent_direction', 'pluie_1h']:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Resampling needs a DatetimeIndex, so convert the index if it is not one yet
index_is_datetime = isinstance(df.index, pd.DatetimeIndex)
if not index_is_datetime:
    df.index = pd.to_datetime(df.index)

# Average all readings that fall within each hour
# NOTE(review): vent_direction is averaged arithmetically; if it is an angle
# in degrees this is misleading around the 0/360 wrap — confirm.
hourly_data = df.resample('H').mean()

# Round each column to its reporting precision (number of decimals)
decimals_by_column = {
    'temperature': 1,
    'humidite': 1,
    'vent_moyen': 1,
    'vent_direction': 1,
    'pluie_1h': 2,
}
for column, decimals in decimals_by_column.items():
    hourly_data[column] = hourly_data[column].round(decimals)

In [50]:
# Preview the first rows of the hourly resampled data
hourly_data.head()

Unnamed: 0_level_0,temperature,humidite,vent_moyen,vent_direction,pluie_1h
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00,1.8,89.2,7.1,98.0,0.0
2020-01-01 01:00:00,0.8,90.7,6.4,163.3,0.0
2020-01-01 02:00:00,0.1,93.0,7.2,189.5,0.0
2020-01-01 03:00:00,0.4,94.2,3.8,201.6,0.0
2020-01-01 04:00:00,0.9,94.2,2.2,203.0,0.0


In [51]:
# Handle missing values with more sophisticated interpolation

# Use different interpolation methods based on the nature of the data
# For temperature and humidity, use cubic interpolation as they tend to change smoothly
hourly_data[['temperature', 'humidite']] = hourly_data[['temperature', 'humidite']].interpolate(method='cubic')

# For wind-related features, use spline interpolation with a higher order
# NOTE(review): a cubic spline can overshoot, so filled values may fall outside
# the physically valid range (negative speed, direction outside 0-360) — confirm
# this is acceptable downstream.
hourly_data[['vent_moyen', 'vent_direction']] = hourly_data[['vent_moyen', 'vent_direction']].interpolate(method='spline', order=3)

# For rainfall, forward fill first then backfill as it's more discrete.
# ffill()/bfill() replace the deprecated fillna(method='pad'/'bfill') form,
# which raises a FutureWarning; the result is the same.
hourly_data['pluie_1h'] = hourly_data['pluie_1h'].ffill().bfill()

# Reset index to keep date as a column
hourly_data = hourly_data.reset_index()

  hourly_data['pluie_1h'] = hourly_data['pluie_1h'].fillna(method='pad').fillna(method='bfill')


In [52]:
# Preview the first rows after gap filling and index reset
hourly_data.head()

Unnamed: 0,date,temperature,humidite,vent_moyen,vent_direction,pluie_1h
0,2020-01-01 00:00:00,1.8,89.2,7.1,98.0,0.0
1,2020-01-01 01:00:00,0.8,90.7,6.4,163.3,0.0
2,2020-01-01 02:00:00,0.1,93.0,7.2,189.5,0.0
3,2020-01-01 03:00:00,0.4,94.2,3.8,201.6,0.0
4,2020-01-01 04:00:00,0.9,94.2,2.2,203.0,0.0


In [53]:
# Write the hourly data to a CSV file at a path relative to the working directory
# NOTE(review): index=False is not passed, so the integer row index is written
# as an extra unnamed first column — confirm downstream readers expect it.
# NOTE(review): presumably the 'data' directory must already exist — verify.
hourly_data.to_csv('data/hourly_weather_data.csv')