In [1]:
import pandas as pd
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

In [2]:
# Load the match table from the preprocessing data directory.
matches = pd.read_csv(
    '../../preprocessing/data/matches.csv'
)

### First, we need to get the coordinates of the tournament cities

In [3]:
# Shared Nominatim geocoder client; get_lat_long below sends its lookups through it.
# NOTE(review): Nominatim's usage policy asks for a user agent that identifies the
# application; "geoapi" is generic, so consider a project-specific value.
geolocator = Nominatim(user_agent="geoapi")

In [4]:
def get_lat_long(location: str):
    """Geocode a tournament location name to a ``(latitude, longitude)`` pair.

    A name ending in a digit is treated as carrying a numbering suffix
    (e.g. ``"Cologne 1"``), so its last two characters are dropped before the
    lookup. ``"Queens Club"`` is looked up as ``"West Kensington London"``.

    Args:
        location: Location name as stored in ``matches['tournament_location']``.

    Returns:
        ``(latitude, longitude)`` reported by the geocoder, or ``(None, None)``
        when the name is missing or blank, nothing is found, or the request
        times out.
    """
    # Missing values (NaN from the CSV) and blank names cannot be geocoded;
    # indexing them below would raise instead of reporting "no coordinates".
    if not isinstance(location, str) or not location.strip():
        return None, None
    if location[-1].isdigit():
        location = location[:-2]
        # A name that was nothing but the suffix leaves no query to send.
        if not location.strip():
            return None, None
    if location == 'Queens Club':
        location = 'West Kensington London'
    try:
        loc = geolocator.geocode(location, timeout=10)
    except GeocoderTimedOut:
        return None, None
    if loc:
        return loc.latitude, loc.longitude
    return None, None

In [5]:
# Distinct tournament locations, in order of first appearance.
unique_locations = pd.unique(matches['tournament_location'])

In [6]:
# Nominatim's usage policy allows at most one request per second, so the lookups
# are throttled. RateLimiter also retries geocoder service errors; if those
# retries are exhausted it yields (None, None), the same "no coordinates" value
# get_lat_long itself returns.
throttled_get_lat_long = RateLimiter(
    get_lat_long,
    min_delay_seconds=1,
    return_value_on_exception=(None, None),
)
location_dict = {location: throttled_get_lat_long(location) for location in unique_locations}

In [7]:
# Hand-entered (latitude, longitude) pairs that replace whatever the geocoder
# stored for these keys.
# NOTE(review): presumably the automatic lookup failed or resolved to the wrong
# place for these names; confirm.
location_dict.update({
    'Cordoba': (-31.4, -64.183),
    'Santiago': (-33.45, -70.67),
    'Los Cabos': (22.883, -109.917),
    'Cologne 1': (50.933, 6.95),
    'Cologne 2': (50.933, 6.95),
})

In [8]:
# Attach each match's coordinates by looking its location up in location_dict;
# element 0 of the stored pair is the latitude, element 1 the longitude.
for column_name, position in (('latitude', 0), ('longitude', 1)):
    matches[column_name] = matches['tournament_location'].map(
        lambda place, position=position: location_dict[place][position]
    )

### Now we can fetch the weather from the API

In [9]:
import requests

def get_weather(latitude, longitude, date, time, variables, session=None):
    """Fetch hourly weather values for one place and one hour from the Open-Meteo archive.

    Args:
        latitude: Latitude of the place.
        longitude: Longitude of the place.
        date: Day to query, formatted ``YYYY-MM-DD``; used as both start and end date.
        time: Hour to extract, formatted ``HH:MM``. It is matched against the
            timestamps in the response, which are requested in UTC.
        variables: Names of the hourly variables to request.
        session: Optional object with a ``requests``-compatible ``get`` method
            (e.g. a ``requests.Session`` for connection reuse). Defaults to the
            ``requests`` module itself.

    Returns:
        A dict mapping each requested variable to its value at the requested
        hour, or ``None`` when the request fails, the response carries no
        hourly data, or the hour is not present in the response.
    """
    archive_url = "https://archive-api.open-meteo.com/v1/archive"
    http = session if session is not None else requests
    # Let the HTTP client build and escape the query string.
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": ",".join(variables),
        "start_date": date,
        "end_date": date,
        "timezone": "UTC",
    }
    try:
        # Without a timeout a stalled connection would block the whole chunk.
        response = http.get(archive_url, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Same best-effort contract as the "no hourly data" case below: report
        # the problem and let the caller continue with the remaining rows.
        print(archive_url, params, exc)
        return None

    hourly = data.get("hourly")
    if not hourly:
        print(getattr(response, "url", archive_url), data)
        return None

    datetime_str = f"{date}T{time}"
    try:
        position = hourly["time"].index(datetime_str)
    except ValueError:
        # The requested hour is not among the returned timestamps.
        return None
    return {var: hourly[var][position] for var in variables}

# Example request. `variables` is also the list that fetch_weather_data passes
# to get_weather for every match, so it must stay defined under this name.
variables = [
    "temperature_2m",
    "relative_humidity_2m",
    "windspeed_10m",
    "apparent_temperature",
]
latitude, longitude = -34.92, 138.6
date, time = "2017-01-01", "06:00"

weather_data = get_weather(
    latitude=latitude,
    longitude=longitude,
    date=date,
    time=time,
    variables=variables,
)
print(weather_data)

{'temperature_2m': 20.1, 'relative_humidity_2m': 62, 'windspeed_10m': 28.7, 'apparent_temperature': 16.8}


In [12]:
from datetime import datetime, timedelta

def convert_utc_plus1_to_utc(date, time):
    """Shift a date/time pair back by one hour.

    Args:
        date: Date formatted ``YYYY-MM-DD``.
        time: Clock time formatted ``HH:MM``.

    Returns:
        ``(date, time)`` strings in the same formats, one hour earlier; the
        date moves to the previous day when the shift crosses midnight.
    """
    stamp_format = "%Y-%m-%d %H:%M"
    local_moment = datetime.strptime("{} {}".format(date, time), stamp_format)
    utc_moment = local_moment - timedelta(hours=1)
    date_part, time_part = utc_moment.strftime(stamp_format).split(" ")
    return date_part, time_part

def round_time_to_nearest_hour(time_str):
    """Round an ``HH:MM`` clock time to the nearest full hour.

    Minutes 30 and above round up, anything lower rounds down. Only a clock
    time is returned, so ``23:30`` and later come back as ``"00:00"`` with no
    indication that the day changed.

    Args:
        time_str: Clock time formatted ``HH:MM``.

    Returns:
        The rounded time formatted ``HH:00``.
    """
    parsed = datetime.strptime(time_str, "%H:%M")
    on_the_hour = parsed.replace(minute=0)
    rounded = on_the_hour + timedelta(hours=1) if parsed.minute >= 30 else on_the_hour
    return rounded.strftime("%H:00")

In [13]:
def fetch_weather_data(row):
    """Look up the weather for one match row.

    The row's ``Date`` and ``time`` are shifted from UTC+1 to UTC, rounded to
    the nearest full hour, and passed to ``get_weather`` together with the
    row's ``latitude``/``longitude`` and the module-level ``variables`` list.

    Args:
        row: A row of ``matches`` providing ``Date``, ``time``, ``latitude``
            and ``longitude``.

    Returns:
        Whatever ``get_weather`` returns for that place and hour.
    """
    date_utc, time_utc = convert_utc_plus1_to_utc(row["Date"], row["time"])
    rounded_time_utc = round_time_to_nearest_hour(time_utc)
    # round_time_to_nearest_hour returns only a clock time, so 23:30-23:59 comes
    # back as "00:00" without the date moving on. Advance the date here;
    # otherwise the lookup would hit the start of the same day, almost 24 hours
    # too early.
    if rounded_time_utc == "00:00" and time_utc.startswith("23"):
        next_day = datetime.strptime(date_utc, "%Y-%m-%d") + timedelta(days=1)
        date_utc = next_day.strftime("%Y-%m-%d")
    return get_weather(row["latitude"], row["longitude"], date_utc, rounded_time_utc, variables)

In [14]:
def process_partial_chunk(df, func, start_index, chunk_size, output_dir="../data/weather_chunks"):
    """Apply ``func`` to a positional slice of ``df`` and save the results as CSV.

    Args:
        df: Source DataFrame.
        func: Callable applied to every row of the slice (``axis=1``).
        start_index: Position of the first row of the slice.
        chunk_size: Maximum number of rows in the slice; the slice is cut short
            at the end of ``df``.
        output_dir: Directory for the ``<start>_<end>.csv`` file. It is created
            when missing.
            NOTE(review): the merge step reads chunks from
            "../../data/weather_chunks/", which differs from this default;
            confirm which location is intended.

    Returns:
        A DataFrame indexed like the slice, built from the per-row results.
        Rows for which ``func`` returned ``None`` are left empty. An empty
        slice returns an empty DataFrame and writes no file.
    """
    import os  # local import: this cell runs before the notebook's own `import os`

    end_index = min(start_index + chunk_size, len(df))
    chunk = df.iloc[start_index:end_index]
    if chunk.empty:
        # Nothing to fetch; writing a file here would add a bogus chunk for the
        # merge step to pick up.
        print(f"No rows between {start_index} and {end_index}; nothing to process.")
        return pd.DataFrame(index=chunk.index)

    output_path = f"{output_dir}/{start_index}_{end_index}.csv"
    # Create the target directory before the slow part so results are not lost
    # to a missing folder after every row has been processed.
    os.makedirs(output_dir, exist_ok=True)
    print(f"Processing rows {start_index} to {end_index}...")

    chunk_results = chunk.apply(func, axis=1)
    chunk_results.to_csv(output_path)
    # func may return None for a row (e.g. a failed lookup); stand in an empty
    # record so the frame can still be built.
    records = [{} if record is None else record for record in chunk_results.tolist()]
    result_df = pd.DataFrame(records, index=chunk.index)
    print(f"results saved to {output_path}")
    return result_df

### To odpaliłem u siebie

In [None]:
# Fetch and save weather for the `matches` slice starting at row 0 (at most 500 rows).
weather_data_chunk_1 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=0,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 500 (at most 500 rows).
weather_data_chunk_2 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 1000 (at most 500 rows).
weather_data_chunk_3 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=1000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 1500 (at most 500 rows).
weather_data_chunk_4 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=1500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 2000 (at most 500 rows).
weather_data_chunk_5 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=2000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 2500 (at most 500 rows).
weather_data_chunk_6 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=2500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 3000 (at most 500 rows).
weather_data_chunk_7 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=3000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 3500 (at most 500 rows).
weather_data_chunk_8 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=3500,
    chunk_size=500,
)

In [None]:
# NOTE(review): no other cell in this notebook fetches rows 4000-4499, although
# the merge step asserts that every row of `matches` has weather data. The call
# for that slice is restored here so the notebook can be re-run from scratch.
weather_data_chunk_9 = process_partial_chunk(matches, fetch_weather_data, start_index=4000, chunk_size=500)

# Fetch and save weather for the `matches` slice starting at row 4500 (at most 500 rows).
weather_data_chunk_10 = process_partial_chunk(matches, fetch_weather_data, start_index=4500, chunk_size=500)

### To odpal u siebie

In [None]:
# Fetch and save weather for the `matches` slice starting at row 5000 (at most 500 rows).
weather_data_chunk_11 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=5000,
    chunk_size=500,
)

### I tak dalej

In [None]:
# Fetch and save weather for the `matches` slice starting at row 5500 (at most 500 rows).
weather_data_chunk_12 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=5500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 6000 (at most 500 rows).
weather_data_chunk_13 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=6000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 6500 (at most 500 rows).
weather_data_chunk_14 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=6500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 7000 (at most 500 rows).
weather_data_chunk_15 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=7000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 7500 (at most 500 rows).
weather_data_chunk_16 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=7500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 8000 (at most 500 rows).
weather_data_chunk_17 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=8000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 10000 (at most 500 rows).
weather_data_chunk_21 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=10000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 10500 (at most 500 rows).
weather_data_chunk_22 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=10500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 11000 (at most 500 rows).
weather_data_chunk_23 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=11000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 11500 (at most 500 rows).
weather_data_chunk_24 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=11500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 12000 (at most 500 rows).
weather_data_chunk_25 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=12000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 12500 (at most 500 rows).
weather_data_chunk_26 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=12500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 13000 (at most 500 rows).
weather_data_chunk_27 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=13000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 13500 (at most 500 rows).
weather_data_chunk_28 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=13500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 14000 (at most
# 500 rows; the slice is cut short at the end of `matches`).
weather_data_chunk_29 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=14000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 8500 (at most 500 rows).
weather_data_chunk_18 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=8500,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 9000 (at most 500 rows).
weather_data_chunk_19 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=9000,
    chunk_size=500,
)

In [None]:
# Fetch and save weather for the `matches` slice starting at row 9500 (at most 500 rows).
weather_data_chunk_20 = process_partial_chunk(
    df=matches,
    func=fetch_weather_data,
    start_index=9500,
    chunk_size=500,
)

## Można mergeować 

In [16]:
import os
weather_files_path = "../../data/weather_chunks/"

# Chunk files are named "<start>_<end>.csv"; order them numerically by their
# start row so the concatenated rows follow the order of `matches`.
weather_files = sorted(
    (name for name in os.listdir(weather_files_path) if name.endswith('.csv')),
    key=lambda name: int(name.split('_')[0]),
)

weather_dfs = [
    pd.read_csv(os.path.join(weather_files_path, name))
    for name in weather_files
]

# Stack all chunks into one frame with a fresh 0..n-1 index.
weather_df = pd.concat(weather_dfs, ignore_index=True)

print(f"Concatenated Weather Data Shape: {weather_df.shape}")

Concatenated Weather Data Shape: (14111, 2)


In [17]:
# Display the concatenated chunk data; at this point column '0' still holds each
# row's weather values as the text of a dict.
weather_df

Unnamed: 0.1,Unnamed: 0,0
0,0,"{'temperature_2m': 28.9, 'relative_humidity_2m..."
1,1,"{'temperature_2m': 27.0, 'relative_humidity_2m..."
2,2,"{'temperature_2m': 28.0, 'relative_humidity_2m..."
3,3,"{'temperature_2m': 31.2, 'relative_humidity_2m..."
4,4,"{'temperature_2m': 29.1, 'relative_humidity_2m..."
...,...,...
14106,14106,"{'temperature_2m': 7.8, 'relative_humidity_2m'..."
14107,14107,"{'temperature_2m': 8.2, 'relative_humidity_2m'..."
14108,14108,"{'temperature_2m': 10.1, 'relative_humidity_2m..."
14109,14109,"{'temperature_2m': 6.6, 'relative_humidity_2m'..."


In [18]:
import ast
# Column '0' holds the text of the dict stored for each row. A row whose weather
# lookup returned None was written as an empty field and is read back as NaN,
# which ast.literal_eval cannot parse; such rows become an empty record and end
# up with missing values in every weather column.
weather_df['parsed_data'] = weather_df['0'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else {}
)

# One column per weather variable.
weather_data_expanded = pd.json_normalize(weather_df['parsed_data'])

weather_df = pd.concat([weather_df, weather_data_expanded], axis=1)

# Drop the raw text, the saved row labels and the intermediate parsed column.
weather_df = weather_df.drop(columns=['0', 'Unnamed: 0', 'parsed_data'])

In [19]:
# Display the expanded frame: one column per weather variable.
weather_df

Unnamed: 0,temperature_2m,relative_humidity_2m,windspeed_10m,apparent_temperature
0,28.9,68,15.2,31.8
1,27.0,77,9.6,30.9
2,28.0,28,5.8,27.9
3,31.2,66,6.1,39.2
4,29.1,72,1.5,35.1
...,...,...,...,...
14106,7.8,82,22.2,3.3
14107,8.2,79,12.7,5.1
14108,10.1,67,14.8,6.6
14109,6.6,88,13.7,3.3


In [20]:
# The two frames are paired purely by row position, so they must have the same
# number of rows.
assert len(weather_df) == len(matches), "Row mismatch between matches and weather data"

matches_with_weather = pd.concat(
    [
        matches.reset_index(drop=True),
        weather_df.reset_index(drop=True),
    ],
    axis=1,
)

matches_with_weather

Unnamed: 0,tournament_location,tournament_name,Date,indoor_or_outdoor,Surface,Round,W1,L1,W2,L2,...,loser_entry_Q,loser_entry_WC,winner_is_seeded,loser_is_seeded,latitude,longitude,temperature_2m,relative_humidity_2m,windspeed_10m,apparent_temperature
0,Brisbane,Brisbane International,2017-12-31,Outdoor,Hard,1st Round,6.0,4.0,7.0,6.0,...,False,False,0,1,-27.468968,153.023499,28.9,68,15.2,31.8
1,Brisbane,Brisbane International,2017-12-31,Outdoor,Hard,1st Round,7.0,6.0,6.0,4.0,...,False,False,0,0,-27.468968,153.023499,27.0,77,9.6,30.9
2,Brisbane,Brisbane International,2018-01-01,Outdoor,Hard,1st Round,6.0,4.0,3.0,6.0,...,False,False,0,0,-27.468968,153.023499,28.0,28,5.8,27.9
3,Brisbane,Brisbane International,2018-01-01,Outdoor,Hard,1st Round,6.0,3.0,6.0,2.0,...,False,False,0,0,-27.468968,153.023499,31.2,66,6.1,39.2
4,Brisbane,Brisbane International,2018-01-01,Outdoor,Hard,1st Round,6.0,4.0,7.0,5.0,...,True,False,1,0,-27.468968,153.023499,29.1,72,1.5,35.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Sofia,Sofia Open,2023-11-09,Indoor,Hard,Quarterfinals,6.0,7.0,6.0,4.0,...,False,False,1,1,42.697703,23.321736,7.8,82,22.2,3.3
14107,Sofia,Sofia Open,2023-11-09,Indoor,Hard,Quarterfinals,6.0,3.0,7.0,6.0,...,False,False,0,1,42.697703,23.321736,8.2,79,12.7,5.1
14108,Sofia,Sofia Open,2023-11-10,Indoor,Hard,Semifinals,6.0,3.0,6.0,4.0,...,False,False,0,1,42.697703,23.321736,10.1,67,14.8,6.6
14109,Sofia,Sofia Open,2023-11-10,Indoor,Hard,Semifinals,6.0,2.0,7.0,6.0,...,False,False,1,0,42.697703,23.321736,6.6,88,13.7,3.3


In [22]:
# Export only the match key and the four weather features.
weather_feature_columns = [
    "match_id",
    "temperature_2m",
    "relative_humidity_2m",
    "windspeed_10m",
    "apparent_temperature",
]
matches_with_weather[weather_feature_columns].to_csv(
    "../../data/created_features_separate/weather.csv",
    index=False,
)