In [21]:
import pandas as pd
import numpy as np

In [22]:
# Country names; each one selects a raw weather CSV and keys the loaded frame.
names = [
    "Brazil",
    "Colombia",
    "Vietnam",
    "Indonesia",
    "Honduras",
]

# One raw DataFrame per entry of `names`, read from ../raw_data/.
data_dict = {
    name: pd.read_csv(f"../raw_data/weather_data_{name}.csv")
    for name in names
}

In [27]:
# Duration columns that are rescaled to hours when a row looks like seconds.
cols = ["daylight_duration", "sunshine_duration"]

# Columns written to every cleaned file; all except "Date" are prefixed with
# the country name just before export.
desired_columns = ["Date", "temperature_2m_mean", "daylight_duration",
                   "precipitation_sum", "wind_speed_10m_max",
                   "et0_fao_evapotranspiration"]

for name in names:
    # NOTE: df is the same object as data_dict[name], so the column updates
    # below (durations, date, month, year, Date) are visible through data_dict.
    df = data_dict[name]

    # A row whose largest duration exceeds 24 is assumed to be in seconds and
    # is divided by 3600; rows already <= 24 are divided by 1 (unchanged), so
    # re-running the cell does not rescale a second time.
    in_seconds = df[cols].max(axis=1) > 24
    df[cols] = df[cols].div(in_seconds.map({True: 3600.0, False: 1.0}), axis=0)

    df["date"] = pd.to_datetime(df["date"])
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year

    # "MM/YYYY" grouping key, built with the vectorised formatter rather than
    # a per-row apply.
    df["Date"] = df["date"].dt.strftime("%m/%Y")

    # year/month are numeric helpers; drop them so they are not averaged.
    df = df.drop(columns=["year", "month"])

    # Monthly mean of every numeric column.
    numeric_cols = df.select_dtypes(include=np.number).columns
    grouped_mean = df.groupby("Date")[numeric_cols].mean().reset_index()

    # No all-zero-column pruning here: the output schema is fixed by
    # desired_columns, and pruning one of those columns would only make the
    # selection further down fail with a KeyError.

    # Exclude 2025 (presumably an incomplete year in the raw data -- TODO confirm).
    grouped_mean = grouped_mean[grouped_mean["Date"].str[-4:] != '2025']

    # "MM/YYYY" strings sort month-first, so order by the parsed date to get
    # a chronological result.
    grouped_mean = grouped_mean.sort_values(
        by="Date",
        key=lambda labels: pd.to_datetime(labels, format="%m/%Y"),
    ).reset_index(drop=True)

    # Keep the export columns and prefix everything except "Date" with the
    # country name.
    grouped_mean = grouped_mean[desired_columns].rename(
        columns=lambda col: col if col == "Date" else f"{name}_{col}"
    )

    output_file = f"../clean_data/{name}_weather_cleaned.csv"
    grouped_mean.to_csv(output_file, index=False)
