In [64]:
import os
import json
import requests
from dotenv import load_dotenv
from tqdm import tqdm
from geopy.geocoders import ArcGIS
from datetime import datetime, timedelta
import pandas as pd

In [8]:
load_dotenv('.env')

weatherbit_key = os.getenv('WEATHERBIT_KEY')

In [9]:
folder_path = 't20s_male_json'
json_files = os.listdir(folder_path)

countries = ['England', 'Australia', 'Afghanistan', 'India', 'New Zealand', 'South Africa', 'West Indies', 'Bangladesh']
top_matches = []
weather_list = []

In [10]:
for match_file in tqdm(json_files):
    if '.json' in match_file:
        with open(os.path.join(folder_path, match_file), 'r') as f:
            content = f.read()
            data = json.loads(content)
            if set(data['info']['teams']).issubset(countries):
                top_matches.append(match_file) 
                weather = {}
                if 'city' in data['info'].keys():
                    weather['city'] = data['info']['city']
                if 'venue' in data['info'].keys():
                    weather['stadium'] = data['info']['venue']
                else:
                    print("Venue not found")
                if 'dates' in data['info'].keys():
                    weather['date'] = data['info']['dates'][0]
                weather_list.append(weather)


100%|██████████| 2604/2604 [00:01<00:00, 1849.13it/s]


In [52]:
def get_next_day(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    next_day = date_obj + timedelta(days=1)
    return next_day.strftime('%Y-%m-%d')

In [55]:
geolocator = ArcGIS(timeout=5)

weather_reports = []

url = f'https://api.weatherbit.io/v2.0/history/daily?key={weatherbit_key}'

for match_info in tqdm(weather_list):
    stadium = match_info['stadium']
    loc = geolocator.geocode(stadium)
    date = match_info['date']
    next_date = get_next_day(date)
    match = f'&lat={loc.latitude}&lon={loc.longitude}&start_date={date}&end_date={next_date}'
    weather_info = requests.get(url+match)
    weather_reports.append(weather_info)

100%|██████████| 468/468 [08:11<00:00,  1.05s/it]


In [60]:
weather_reports = [report.json() for report in weather_reports]

In [65]:
flattened_data = []
for report in weather_reports:
    for data_entry in report['data']:
        # Combine the top-level keys with the nested "data" keys
        combined_entry = {**report, **data_entry}
        # Remove the original "data" key to avoid duplication
        combined_entry.pop('data', None)
        flattened_data.append(combined_entry)

df = pd.DataFrame(flattened_data)

In [68]:
df.to_csv('weather_reports.csv')

In [None]:
df = pd.read_csv('weather_reports.csv')
print(df.head(1))

   Unnamed: 0  lat   lon            valid_date  max_temp  min_temp  app_max_temp  app_min_temp  max_wind_spd  rh  clouds  weather_reports  
0           0  37.78   -122.4  2023-03-26T00:00:00    15.4     10.4        15.4          10.4           7.7  78      75   {'data': [...}  

