# In [None]:
import pandas as pd
import re
import time

# Locations whose weather CSVs are combined. Each name is interpolated into the
# file path below, so the spellings are kept exactly as they are.
place_list = ['japan', 'korea', 'kyoto', 'liestal', 'usa', 'vancouver', 'washintondc', 'nyc']
df_list = []

# Read weather data for each location and store in a list
for place in place_list:
    df = pd.read_csv(rf'D:\JupyterNotebooks\Cherrybloom\final_weather\{place}_weather.csv')
    # read_csv leaves DATE as text; the later steps call .diff().dt.days,
    # .year/.month and .dt.month on it, which all require real datetimes.
    # Parsing per file lets each file's date format be inferred on its own.
    df['DATE'] = pd.to_datetime(df['DATE'])
    df_list.append(df)

# Concatenate all data into a single DataFrame with a fresh, unique row index
df = pd.concat(df_list, ignore_index=True)

# Coerce the measurement columns up front: missing-run detection and rolling
# means below need numeric data, and unparseable entries must count as missing.
for _measure in ('TMAX', 'TMIN', 'PRCP'):
    df[_measure] = pd.to_numeric(df[_measure], errors='coerce')

# Function to remove long runs of consecutive rows with missing TMAX or TMIN
def remove_long_missing_periods(group, min_run_days=15):
    """Drop every row that sits inside a long run of missing TMAX or TMIN.

    The frame is sorted by DATE first. For each of TMAX and TMIN, consecutive
    rows whose value is NaN form a "run"; if a run is at least
    ``min_run_days`` rows long, the *whole* run is removed (not just its
    tail), so no leading part of a long gap is left behind to be imputed.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'DATE', 'TMAX' and 'TMIN' columns.
    min_run_days : int, default 15
        Minimum run length (in rows) that is considered too long to keep.
        The default keeps the threshold this script already used.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``group`` without the rows of long missing runs. The
        columns are unchanged; the index is the 0..n-1 position after sorting,
        with the dropped positions missing.
    """
    group = group.sort_values(by='DATE').reset_index(drop=True)
    drop_mask = pd.Series(False, index=group.index)

    for col in ('TMAX', 'TMIN'):
        missing = group[col].isna()
        # A new run starts wherever the missing flag differs from the previous row
        run_id = missing.ne(missing.shift()).cumsum()
        # Number of missing rows in the run each row belongs to (0 for runs with data)
        run_length = missing.groupby(run_id).transform('sum')
        drop_mask |= missing & (run_length >= min_run_days)

    return group[~drop_mask]

# Run the long-gap removal separately for every (lat, long) pair;
# group_keys=False keeps the group keys out of the resulting index.
df = (
    df.groupby(['lat', 'long'], group_keys=False)
    .apply(remove_long_missing_periods)
)

# Function to fill missing values from a trailing rolling mean
def rolling_mean_cudf(group, col, window=30):
    """Fill NaNs in ``col`` with a trailing rolling mean of the column.

    The frame is sorted by DATE. A helper series is built by forward-filling
    ``col`` and averaging it over the last ``window`` rows (at least one row
    is enough to produce a value). Only positions where ``col`` is NaN take
    the helper value; existing observations are left untouched. NaNs at the
    very start of the frame stay NaN because there is nothing to forward-fill
    from.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'DATE' and ``col``.
    col : str
        Name of the column to fill.
    window : int, default 30
        Number of trailing rows in the rolling mean.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``group`` with a fresh 0..n-1 index and ``col`` filled.
    """
    group = group.sort_values(by='DATE').reset_index(drop=True)

    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling
    rolling_means = group[col].ffill().rolling(window=window, min_periods=1).mean()

    # Fill missing values with rolling mean
    group[col] = group[col].fillna(rolling_means)

    return group

# Fill TMAX first, then TMIN, one (lat, long) group at a time.
# Extra keyword arguments given to apply() are forwarded to the function.
df = (
    df.groupby(['lat', 'long'], group_keys=False)
    .apply(rolling_mean_cudf, col='TMAX')
)
df = (
    df.groupby(['lat', 'long'], group_keys=False)
    .apply(rolling_mean_cudf, col='TMIN')
)

# Function to keep only the rows after the most recent break in the date sequence
def remove_non_continuous_dates(group):
    """Return the rows dated strictly after the last gap in DATE.

    Rows are sorted by DATE and re-indexed 0..n-1. A "gap" is any row whose
    DATE is more than one day after the previous row's DATE. If at least one
    gap exists, everything up to and including the row that follows the last
    gap is discarded; otherwise the sorted frame is returned whole.
    """
    ordered = group.sort_values("DATE").reset_index(drop=True)

    # Whole days elapsed since the previous row (NaN for the first row)
    days_since_previous = ordered["DATE"].diff().dt.days

    # Dates of the rows that come right after a gap
    dates_after_gap = ordered.loc[days_since_previous > 1, "DATE"]
    if dates_after_gap.empty:
        return ordered

    cutoff = dates_after_gap.iloc[-1]
    # Strict comparison: the row dated exactly at the cutoff is dropped as well
    return ordered[ordered["DATE"] > cutoff]

# Trim every (lat, long) group down to its final unbroken stretch of dates
df = (
    df.groupby(["lat", "long"], group_keys=False)
    .apply(remove_non_continuous_dates)
)

# Function to determine "seasonal year" (March-February as one year)
def get_seasonal_year(date):
    """Return the seasonal year of ``date``.

    January and February are counted with the previous calendar year; March
    through December keep their own calendar year. ``date`` only needs
    ``.year`` and ``.month`` attributes.
    """
    if date.month < 3:
        return date.year - 1
    return date.year

# Label each row with its March-to-February seasonal year
df['seasonal_year'] = df['DATE'].map(get_seasonal_year)

# Function to categorize seasons based on months
def assign_season(date):
    """Map ``date.month`` to 'Winter', 'Spring', 'Summer' or 'Fall'.

    December-February is Winter, March-May is Spring, June-August is Summer;
    any other month value falls through to 'Fall'.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    return season_by_month.get(date.month, 'Fall')

# Tag every row with its meteorological season
df['season'] = df['DATE'].map(assign_season)

# Force the three measurement columns to numeric dtype in one pass;
# anything that cannot be parsed as a number becomes NaN.
df[['TMAX', 'TMIN', 'PRCP']] = df[['TMAX', 'TMIN', 'PRCP']].apply(pd.to_numeric, errors='coerce')

# Function to compute seasonal averages and sums
def compute_seasonal_features(group):
    """Summarise TMAX, TMIN and PRCP per season for one group of rows.

    For each of Winter, Spring, Summer and Fall, rows of that season that
    have all three of TMAX, TMIN and PRCP present are used to compute the
    mean TMAX, mean TMIN and total PRCP. A season with no usable rows yields
    NaN for all three features.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'season', 'TMAX', 'TMIN' and 'PRCP' columns.

    Returns
    -------
    pandas.Series
        Twelve values keyed 'Tmax_<season>', 'Tmin_<season>' and
        'Prcp_<season>', in Winter, Spring, Summer, Fall order.
    """
    # Built-in NaN: this file does not import numpy, so np.nan would raise NameError
    missing = float('nan')

    seasonal_features = {}
    for season in ('Winter', 'Spring', 'Summer', 'Fall'):
        season_data = group[group['season'] == season]

        # A row only counts if all three measurements are present
        season_data = season_data.dropna(subset=['TMAX', 'TMIN', 'PRCP'])
        has_data = not season_data.empty

        seasonal_features[f'Tmax_{season}'] = season_data['TMAX'].mean() if has_data else missing
        seasonal_features[f'Tmin_{season}'] = season_data['TMIN'].mean() if has_data else missing
        # Explicit NaN matters here: the sum of an empty column would be 0, not NaN
        seasonal_features[f'Prcp_{season}'] = season_data['PRCP'].sum() if has_data else missing

    return pd.Series(seasonal_features)

# One row of seasonal summaries per (lat, long, seasonal_year).
# NOTE(review): seasonal_features_df is not referenced again in the visible
# part of this file - confirm whether it should be joined into the output.
seasonal_features_df = (
    df.groupby(['lat', 'long', 'seasonal_year'])
    .apply(compute_seasonal_features)
    .reset_index()
)

# Function to calculate Growing Degree Days (GDD)
def calculate_gdd(df, base_temp=5):
    """Add growing-degree-day columns to ``df`` in place and return it.

    Columns written:

    * ``GDD``: ``TMAX - base_temp`` for each row.
    * ``GDD_cumsum``: running total of GDD within each
      (lat, long, seasonal_year) group.
    * ``GDD_30d`` / ``GDD_60d`` / ``GDD_120d``: trailing sums of GDD over the
      last 30 / 60 / 120 rows of each (lat, long) group (a single row is
      enough to produce a value).
    * ``GDD_rate_change_30_60`` and ``GDD_rate_change_60_120``: differences
      between the shorter and the longer trailing sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'lat', 'long', 'seasonal_year' and 'TMAX'. Rows are used
        in their existing order, so they should already be date-ordered
        within each location.
    base_temp : float, default 5
        Value subtracted from TMAX.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added.
    """
    # NOTE(review): daily GDD uses TMAX alone and is not floored at zero, so
    # cold days contribute negative values - confirm this is the intended definition.
    df['GDD'] = df['TMAX'] - base_temp
    df['GDD_cumsum'] = df.groupby(['lat', 'long', 'seasonal_year'])['GDD'].cumsum()

    # transform() hands results back in the frame's own row order, so this
    # works even when index labels repeat across locations; re-attaching a
    # groupby-rolling result by label would fail or depend on row order there.
    for window in (30, 60, 120):
        df[f'GDD_{window}d'] = df.groupby(['lat', 'long'])['GDD'].transform(
            lambda s, w=window: s.rolling(w, min_periods=1).sum()
        )

    df['GDD_rate_change_30_60'] = df['GDD_30d'] - df['GDD_60d']
    df['GDD_rate_change_60_120'] = df['GDD_60d'] - df['GDD_120d']

    return df

# Add the growing-degree-day columns
df = calculate_gdd(df)

# Compute moving average temperatures.
# The trailing windows below count rows, not calendar days. All of them use
# groupby(...).transform(...), which returns values in the frame's own row
# order; that stays correct even when index labels repeat across locations.
df['TAVG'] = (df['TMAX'] + df['TMIN']) / 2
df['TAVG_7d'] = df.groupby(['lat', 'long'])['TAVG'].transform(lambda x: x.rolling(7, min_periods=1).mean())
df['TAVG_30d'] = df.groupby(['lat', 'long'])['TAVG'].transform(lambda x: x.rolling(30, min_periods=1).mean())

# Compute precipitation-related features: running total within the seasonal
# year, and a trailing 7-row total per location
df['PRCP_cumsum'] = df.groupby(['lat', 'long', 'seasonal_year'])['PRCP'].cumsum()
df['PRCP_7d_cumsum'] = df.groupby(['lat', 'long'])['PRCP'].transform(lambda x: x.rolling(7, min_periods=1).sum())

# Compute frost and heat day counts: rows in the trailing 30 with TMIN below 0
# or TMAX above 25 (a missing temperature compares False and is not counted)
df['Frost_days_30d'] = df.groupby(['lat', 'long'])['TMIN'].transform(lambda x: (x < 0).rolling(30, min_periods=1).sum())
df['Heat_days_30d'] = df.groupby(['lat', 'long'])['TMAX'].transform(lambda x: (x > 25).rolling(30, min_periods=1).sum())

# Keep February rows only, then collapse each (lat, long, seasonal_year) group
# to one row. GroupBy.last() takes the last non-null entry of each column
# independently, so a result row can combine values from different dates.
df = (
    df.loc[df['DATE'].dt.month == 2]
    .groupby(['lat', 'long', 'seasonal_year'])
    .last()
    .reset_index()
)

# Write the table without the row index
df.to_csv('training_data.csv', index=False)