In [1]:
import requests
import pandas as pd

# Fetch hourly historical weather for Dublin from the World Weather Online
# past-weather endpoint and flatten the JSON into one record per (date, hour).
#
# On HTTP 200 this cell defines:
#   weather_list - list of dicts, one per hourly observation; the daily fields
#                  (avgtempC, maxtempC, mintempC, sunHour, uvIndex) are repeated
#                  on every hourly row of that day
#   weather_df   - pandas DataFrame built from weather_list
# On any other status it only prints the status code and defines neither name.

# API endpoint
# NOTE(review): this is plain http, so the API key in the query string travels
# unencrypted; switch to https if the provider supports it - confirm.
url = "http://api.worldweatheronline.com/premium/v1/past-weather.ashx"

# Load the API key from a local text file (keeps the secret out of the notebook).
# All whitespace is removed, not only '\n', so a key file saved with Windows
# line endings or a trailing space does not corrupt the key.
with open('api_key_weather.txt', 'r') as file:
    api_key = "".join(file.read().split())

# Query parameters
# NOTE(review): the table displayed later in this notebook ends on 2021-06-04
# with 24 rows per day, so the response apparently covered far less than the
# date..enddate range requested here. Check the provider's per-request range
# limit and issue one request per period if the full range is needed.
params = {
    "key": api_key,
    "q": "Dublin",  # Query location
    "format": "json",
    "date": "2021-05-01",  # Start date for historical data
    "enddate": "2022-08-31",  # End date for historical data
    "includelocation": "yes",
    "tp": "1"  # Time period: 1 hour
}

# Without an explicit timeout, requests waits indefinitely on a stalled
# connection and the kernel appears hung.
REQUEST_TIMEOUT_SECONDS = 60

# Making the GET request
response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response; everything of interest sits under the 'data' key
    data = response.json()
    payload = data.get('data', {})

    # Location of the matched area. `or [{}]` also covers a 'nearest_area' key
    # that is present but holds an empty list, which would otherwise raise
    # IndexError on [0].
    location_data = (payload.get('nearest_area') or [{}])[0]
    latitude = location_data.get('latitude', 'Unknown')
    longitude = location_data.get('longitude', 'Unknown')

    # Per-day entries, each carrying its own 'hourly' list
    weather_data = payload.get('weather', [])
    if not weather_data:
        # Make an empty result visible here rather than in a later cell
        print("Warning: the response contained no weather records")

    # One flat record per hourly observation
    weather_list = []
    for day in weather_data:
        for hourly_data in day.get('hourly', []):
            hour_value = hourly_data.get('time', 'Unknown')
            if hour_value != 'Unknown':
                # The value is treated as an HHMM-style number: '1300' // 100
                # gives 13 and '0' gives 0, so midnight needs no special case.
                hour_formatted = f"{int(hour_value) // 100:02d}:00"
            else:
                # Left as 'Unknown'; any later datetime parsing of this row
                # will fail on it.
                hour_formatted = hour_value
            weather_list.append({
                'date': day['date'],
                'hour': hour_formatted,
                # Daily aggregates, repeated on each hourly row
                'avgtempC': day['avgtempC'],
                'maxtempC': day['maxtempC'],
                'mintempC': day['mintempC'],
                'sunHour': day['sunHour'],
                'uvIndex': day['uvIndex'],
                # Hourly measurements
                'humidity': hourly_data['humidity'],
                'winddirDegree': hourly_data['winddirDegree'],
                'windspeedKmph': hourly_data['windspeedKmph'],
                'cloudcover': hourly_data['cloudcover'],
                'precipMM': hourly_data['precipMM'],
                'pressure': hourly_data['pressure'],
                # Location is the same for every row of this request
                'latitude': latitude,
                'longitude': longitude,
            })

    # Convert the list of dictionaries to a pandas DataFrame
    weather_df = pd.DataFrame(weather_list)

    # Optionally, save the DataFrame to a CSV file
    # weather_df.to_csv('dublin_weather_may2021_aug2022_extended.csv', index=False)
else:
    print("Failed to fetch data. Status Code:", response.status_code)

In [6]:
from datetime import datetime

# Give every hourly record a 'datetime' value parsed from its 'date'
# ('YYYY-MM-DD') and 'hour' ('HH:00') fields. The records in weather_list are
# modified in place. No validation is done here: a record whose hour is not in
# 'HH:MM' form (e.g. 'Unknown') makes strptime raise ValueError.
for record in weather_list:
    date_str, hour_str = record['date'], record['hour']
    record['datetime'] = datetime.strptime(f"{date_str} {hour_str}", "%Y-%m-%d %H:%M")

# Rebuild the DataFrame from the enriched records and index it by 'datetime'.
weather_df = pd.DataFrame(weather_list)
weather_df.set_index('datetime', inplace=True)


In [7]:
# Optional CSV export (disabled). Note: with index=False the 'datetime' index
# is not written to the file; only the regular columns are saved.
#weather_df.to_csv('dublin_weather_may2021_aug2022_extended.csv', index=False)
# Show the frame as the cell's output.
weather_df


Unnamed: 0_level_0,date,hour,avgtempC,maxtempC,mintempC,sunHour,uvIndex,humidity,winddirDegree,windspeedKmph,cloudcover,precipMM,pressure,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-05-01 00:00:00,2021-05-01,00:00,6,8,3,13.0,3,80,305,9,5,0.0,1017,53.333,-6.249
2021-05-01 01:00:00,2021-05-01,01:00,6,8,3,13.0,3,82,306,10,15,0.0,1017,53.333,-6.249
2021-05-01 02:00:00,2021-05-01,02:00,6,8,3,13.0,3,83,306,11,25,0.0,1017,53.333,-6.249
2021-05-01 03:00:00,2021-05-01,03:00,6,8,3,13.0,3,85,307,12,35,0.0,1017,53.333,-6.249
2021-05-01 04:00:00,2021-05-01,04:00,6,8,3,13.0,3,84,308,12,41,0.0,1017,53.333,-6.249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-04 19:00:00,2021-06-04,19:00,13,16,9,12.0,4,78,174,19,57,0.0,1023,53.333,-6.249
2021-06-04 20:00:00,2021-06-04,20:00,13,16,9,12.0,4,83,176,18,39,0.0,1024,53.333,-6.249
2021-06-04 21:00:00,2021-06-04,21:00,13,16,9,12.0,4,88,178,16,20,0.0,1024,53.333,-6.249
2021-06-04 22:00:00,2021-06-04,22:00,13,16,9,12.0,4,88,178,16,43,0.0,1024,53.333,-6.249


In [None]:
# Optional: drop the 'date' and 'hour' columns, now that we have 'datetime'
#weather_df.drop(['date', 'hour'], axis=1, inplace=True)

In [4]:
# Row-count sanity check: the data has 840 rows where 816 (34 days * 24 hours)
# were expected. 840 is exactly 35 * 24, so this cell looks for repeated
# (date, hour) pairs to tell duplicated rows apart from an additional day.
from collections import Counter

# Assumes weather_list is a list of dictionaries with 'date' and 'hour' keys
slot_keys = [(record['date'], record['hour']) for record in weather_list]
hours_counter = Counter(slot_keys)

# Keep only the (date, hour) pairs seen more than once
duplicates = {}
for slot, occurrences in hours_counter.items():
    if occurrences > 1:
        duplicates[slot] = occurrences

print("Duplicates or extra hours:", duplicates)


Duplicates or extra hours: {}


In [5]:
from collections import Counter

# Tally how many hourly records exist for each calendar date.
# Assumes every dictionary in weather_list has a 'date' key.
observed_dates = [record['date'] for record in weather_list]
date_counts = Counter(observed_dates)

# Print one line per date, in the order the dates were first seen
for date in date_counts:
    count = date_counts[date]
    print(f"Date: {date} has {count} observations")


Date: 2021-05-01 has 24 observations
Date: 2021-05-02 has 24 observations
Date: 2021-05-03 has 24 observations
Date: 2021-05-04 has 24 observations
Date: 2021-05-05 has 24 observations
Date: 2021-05-06 has 24 observations
Date: 2021-05-07 has 24 observations
Date: 2021-05-08 has 24 observations
Date: 2021-05-09 has 24 observations
Date: 2021-05-10 has 24 observations
Date: 2021-05-11 has 24 observations
Date: 2021-05-12 has 24 observations
Date: 2021-05-13 has 24 observations
Date: 2021-05-14 has 24 observations
Date: 2021-05-15 has 24 observations
Date: 2021-05-16 has 24 observations
Date: 2021-05-17 has 24 observations
Date: 2021-05-18 has 24 observations
Date: 2021-05-19 has 24 observations
Date: 2021-05-20 has 24 observations
Date: 2021-05-21 has 24 observations
Date: 2021-05-22 has 24 observations
Date: 2021-05-23 has 24 observations
Date: 2021-05-24 has 24 observations
Date: 2021-05-25 has 24 observations
Date: 2021-05-26 has 24 observations
Date: 2021-05-27 has 24 observations
D