In [None]:
import json
import math

import numpy as np
import pandas as pd
import requests
from sklearn.preprocessing import LabelEncoder

## 1. Population Density Feature

In [None]:
# Processed population tables, one CSV per year.
population_csv_2021 = "data/processed/df_population_2021.csv"
population_csv_2022 = "data/processed/df_population_2022.csv"

df_population_2021 = pd.read_csv(population_csv_2021)
df_population_2022 = pd.read_csv(population_csv_2022)

In [None]:
# One population_density value per county for 2021: when a fips_code occurs
# more than once, only its first row is kept.
density_columns = ['fips_code', 'population_density']
df_pop_unique = (
    df_population_2021
    .loc[:, density_columns]
    .drop_duplicates(subset='fips_code')
)

# Left join on fips_code: every outage record is kept, and counties without a
# population row get NaN in population_density.
# NOTE(review): df_outages_2021 must already exist in the kernel — it is not
# loaded by any cell visible above this one.
df_outages_2021 = df_outages_2021.merge(df_pop_unique, how='left', on='fips_code')

print(df_outages_2021.head())
print(df_outages_2021.info())

In [None]:

# One population_density value per county for 2022: when a fips_code occurs
# more than once, only its first row is kept.
density_columns = ['fips_code', 'population_density']
df_pop_unique = (
    df_population_2022
    .loc[:, density_columns]
    .drop_duplicates(subset='fips_code')
)

# Left join on fips_code: every outage record is kept, and counties without a
# population row get NaN in population_density.
# NOTE(review): df_outages_2022 must already exist in the kernel — it is not
# loaded by any cell visible above this one.
df_outages_2022 = df_outages_2022.merge(df_pop_unique, how='left', on='fips_code')

print(df_outages_2022.head())
print(df_outages_2022.info())

## 2. Outage Proxy Features (Rolling Stats)

In [None]:
# Trailing 12-row mean and max of the outage count ('sum'), evaluated
# separately for every county. min_periods=1 means the first rows of each
# county are computed on the shorter window that is available.
# NOTE(review): the window counts rows, not clock time — it covers 12 hours
# only if each county has one row per hour in chronological order; confirm.
window_12h = dict(window=12, min_periods=1)
outage_sum_by_county = df_outages_2021.groupby('fips_code')['sum']

df_outages_2021['rolling_mean_12h'] = outage_sum_by_county.transform(
    lambda counts: counts.rolling(**window_12h).mean()
)
df_outages_2021['rolling_max_12h'] = outage_sum_by_county.transform(
    lambda counts: counts.rolling(**window_12h).max()
)

In [None]:
# Trailing 12-row mean and max of the outage count ('sum'), evaluated
# separately for every county. min_periods=1 means the first rows of each
# county are computed on the shorter window that is available.
# NOTE(review): the window counts rows, not clock time — it covers 12 hours
# only if each county has one row per hour in chronological order; confirm.
window_12h = dict(window=12, min_periods=1)
outage_sum_by_county = df_outages_2022.groupby('fips_code')['sum']

df_outages_2022['rolling_mean_12h'] = outage_sum_by_county.transform(
    lambda counts: counts.rolling(**window_12h).mean()
)
df_outages_2022['rolling_max_12h'] = outage_sum_by_county.transform(
    lambda counts: counts.rolling(**window_12h).max()
)

## 3. Day of Week

In [None]:
# Day of week of every outage record as an integer (Monday=0 ... Sunday=6).
#
# run_start_time is only converted to a datetime column further down, in the
# weather-merge cells. Calling .dt on a column that is still plain strings
# raises AttributeError, so the values are parsed here first. pd.to_datetime
# leaves an already-datetime column unchanged, and the stored run_start_time
# column itself is not modified by this cell.
df_outages_2021['day_of_week_num'] = pd.to_datetime(
    df_outages_2021['run_start_time']
).dt.dayofweek

df_outages_2022['day_of_week_num'] = pd.to_datetime(
    df_outages_2022['run_start_time']
).dt.dayofweek

## 4. Weather Features

In [None]:
# Processed weather tables, one CSV per year.
weather_csv_2021 = "data/processed/merged_weather_data_2021.csv"
weather_csv_2022 = "data/processed/merged_weather_data_2022.csv"

df_weather_data_2021 = pd.read_csv(weather_csv_2021)
df_weather_data_2022 = pd.read_csv(weather_csv_2022)

In [None]:
# Parse the weather timestamp (values that cannot be parsed become NaT), then
# order the table by county name and time.
valid_2021 = pd.to_datetime(df_weather_data_2021['valid'], errors='coerce')
df_weather_data_2021 = (
    df_weather_data_2021
    .assign(valid=valid_2021)
    .sort_values(by=['Name', 'valid'])
)

In [None]:
# Parse the weather timestamp (values that cannot be parsed become NaT), then
# order the table by county name and time.
valid_2022 = pd.to_datetime(df_weather_data_2022['valid'], errors='coerce')
df_weather_data_2022 = (
    df_weather_data_2022
    .assign(valid=valid_2022)
    .sort_values(by=['Name', 'valid'])
)

In [None]:
# Rename the 2021 weather columns to the short names that the
# feature-engineering cells further down refer to.
weather_column_names = {
    'pred_dwpf': 'dwpf',
    'pred_tmpf': 'tmpf',
    'pred_relh': 'relh',
    'gradient_value': 'relh_grad',
    'pred_alti': 'alti',
    'max_gust': 'gust',
    'pred_sknt': 'sknt',
    'pred_u': 'drct_u',
    'pred_v': 'drct_v',
    'pred_mslp': 'mslp',
}
df_weather_data_2021 = df_weather_data_2021.rename(columns=weather_column_names)

# x_c / y_c are not used afterwards. Dropping raises KeyError if they are
# already gone, i.e. this cell is not safe to run twice.
df_weather_data_2021 = df_weather_data_2021.drop(columns=['x_c', 'y_c'])

print(df_weather_data_2021.info())

In [None]:
# Rename the 2022 weather columns to the short names that the
# feature-engineering cells further down refer to.
weather_column_names = {
    'pred_dwpf': 'dwpf',
    'pred_tmpf': 'tmpf',
    'pred_relh': 'relh',
    'gradient_value': 'relh_grad',
    'pred_alti': 'alti',
    'max_gust': 'gust',
    'pred_sknt': 'sknt',
    'pred_u': 'drct_u',
    'pred_v': 'drct_v',
    'pred_mslp': 'mslp',
}
df_weather_data_2022 = df_weather_data_2022.rename(columns=weather_column_names)

# x_c / y_c are not used afterwards. Dropping raises KeyError if they are
# already gone, i.e. this cell is not safe to run twice.
df_weather_data_2022 = df_weather_data_2022.drop(columns=['x_c', 'y_c'])

print(df_weather_data_2022.info())

In [None]:
# Both join keys must be real timestamps so that equal moments compare equal.
for frame, time_column in (
    (df_outages_2021, 'run_start_time'),
    (df_weather_data_2021, 'valid'),
):
    frame[time_column] = pd.to_datetime(frame[time_column])

# Left join: every row of df_outages_2021 is kept; outage rows without a
# weather record for the same (timestamp, county) get NaN weather columns.
# The right-hand key 'valid' and the left-hand 'county' are removed afterwards;
# the weather-side county column 'Name' stays in the result.
df_final_2021 = (
    df_outages_2021
    .merge(
        df_weather_data_2021,
        how='left',
        left_on=['run_start_time', 'county'],
        right_on=['valid', 'Name'],
    )
    .drop(columns=['valid', 'county'])
)

print(df_final_2021.info())
print(df_final_2021.head())

In [None]:
# Both join keys must be real timestamps so that equal moments compare equal.
for frame, time_column in (
    (df_outages_2022, 'run_start_time'),
    (df_weather_data_2022, 'valid'),
):
    frame[time_column] = pd.to_datetime(frame[time_column])

# Left join: every row of df_outages_2022 is kept; outage rows without a
# weather record for the same (timestamp, county) get NaN weather columns.
# The right-hand key 'valid' and the left-hand 'county' are removed afterwards;
# the weather-side county column 'Name' stays in the result.
df_final_2022 = (
    df_outages_2022
    .merge(
        df_weather_data_2022,
        how='left',
        left_on=['run_start_time', 'county'],
        right_on=['valid', 'Name'],
    )
    .drop(columns=['valid', 'county'])
)

print(df_final_2022.info())
print(df_final_2022.head())

## 5. Weather Temporal Aggregates

In [None]:
# Chronological order inside every county (fips_code) is required: the lags
# and rolling windows below count rows, so "6h" means "6 rows".
# NOTE(review): rows equal hours only if each county has exactly one row per
# hour — confirm against the outage data.
df_final_2021 = df_final_2021.sort_values(by=['fips_code', 'run_start_time'])

# --- lags: parameters where sudden changes matter -------------------------
weather_cols_lag = ['dwpf', 'tmpf', 'drct_u', 'drct_v']
lag_hours = [6, 12, 24, 48]

for col in weather_cols_lag:
    for lag in lag_hours:
        lagged = df_final_2021.groupby(['fips_code'])[col].shift(lag)
        df_final_2021[f'{col}_lag_{lag}h'] = lagged

# --- rolling aggregates ---------------------------------------------------
# accumulated metrics -> rolling sum
weather_cols_rolling_sum = ['p0li']
rolling_windows_sum = [6, 12, 24, 48]

# trends -> rolling mean
weather_cols_rolling_mean = ['alti', 'mslp', 'relh']
rolling_windows_mean = [6, 12, 24, 48]

# peak values -> rolling max
weather_cols_rolling_max = ['gust', 'sknt', 'relh_grad']
rolling_windows_max = [6, 12, 24, 48]

# binary flags -> rolling sum (number of flagged rows inside the window)
binary_flags = ['ts_flag', 'hr_flag', 'sq_flag']
rolling_windows_flag = [6, 12, 24, 48]

# (statistic, source columns, window sizes, output column-name template),
# processed in this order so the new columns are appended in a fixed order.
rolling_specs = [
    ('sum', weather_cols_rolling_sum, rolling_windows_sum, '{col}_rolling_sum_{window}h'),
    ('mean', weather_cols_rolling_mean, rolling_windows_mean, '{col}_rolling_mean_{window}h'),
    ('max', weather_cols_rolling_max, rolling_windows_max, '{col}_rolling_max_{window}h'),
    ('sum', binary_flags, rolling_windows_flag, '{col}_rolling_sum_{window}h'),
]

for stat, columns, windows, name_template in rolling_specs:
    for col in columns:
        for window in windows:
            rolled = (
                df_final_2021.groupby(['fips_code'])[col]
                .rolling(window=window, min_periods=1)
            )
            # The grouped result is indexed by (fips_code, original row label);
            # dropping the fips_code level lets the assignment align on the
            # original row labels again.
            per_row = getattr(rolled, stat)().reset_index(level=['fips_code'], drop=True)
            df_final_2021[name_template.format(col=col, window=window)] = per_row

df_final_2021 = df_final_2021.reset_index(drop=True)
print(df_final_2021.info())
print(df_final_2021.head())

In [None]:
# Chronological order inside every county (fips_code) is required: the lags
# and rolling windows below count rows, so "6h" means "6 rows".
# NOTE(review): rows equal hours only if each county has exactly one row per
# hour — confirm against the outage data.
df_final_2022 = df_final_2022.sort_values(by=['fips_code', 'run_start_time'])

# --- lags: parameters where sudden changes matter -------------------------
weather_cols_lag = ['dwpf', 'tmpf', 'drct_u', 'drct_v']
lag_hours = [6, 12, 24, 48]

for col in weather_cols_lag:
    for lag in lag_hours:
        lagged = df_final_2022.groupby(['fips_code'])[col].shift(lag)
        df_final_2022[f'{col}_lag_{lag}h'] = lagged

# --- rolling aggregates ---------------------------------------------------
# accumulated metrics -> rolling sum
weather_cols_rolling_sum = ['p0li']
rolling_windows_sum = [6, 12, 24, 48]

# trends -> rolling mean
weather_cols_rolling_mean = ['alti', 'mslp', 'relh']
rolling_windows_mean = [6, 12, 24, 48]

# peak values -> rolling max
weather_cols_rolling_max = ['gust', 'sknt', 'relh_grad']
rolling_windows_max = [6, 12, 24, 48]

# binary flags -> rolling sum (number of flagged rows inside the window)
binary_flags = ['ts_flag', 'hr_flag', 'sq_flag']
rolling_windows_flag = [6, 12, 24, 48]

# (statistic, source columns, window sizes, output column-name template),
# processed in this order so the new columns are appended in a fixed order.
rolling_specs = [
    ('sum', weather_cols_rolling_sum, rolling_windows_sum, '{col}_rolling_sum_{window}h'),
    ('mean', weather_cols_rolling_mean, rolling_windows_mean, '{col}_rolling_mean_{window}h'),
    ('max', weather_cols_rolling_max, rolling_windows_max, '{col}_rolling_max_{window}h'),
    ('sum', binary_flags, rolling_windows_flag, '{col}_rolling_sum_{window}h'),
]

for stat, columns, windows, name_template in rolling_specs:
    for col in columns:
        for window in windows:
            rolled = (
                df_final_2022.groupby(['fips_code'])[col]
                .rolling(window=window, min_periods=1)
            )
            # The grouped result is indexed by (fips_code, original row label);
            # dropping the fips_code level lets the assignment align on the
            # original row labels again.
            per_row = getattr(rolled, stat)().reset_index(level=['fips_code'], drop=True)
            df_final_2022[name_template.format(col=col, window=window)] = per_row

df_final_2022 = df_final_2022.reset_index(drop=True)
print(df_final_2022.info())
print(df_final_2022.head())

## 6. Target Definition

In [None]:
# Target definition:
# 1. anomaly_flag: per county, 1 when the outage count ('sum') is at or above
#    the county's own 90th percentile (evaluated on the min-max normalised
#    count), else 0
# 2. target_anomaly_48h: the anomaly_flag of the same county 48 hours later
# 3. sum_48: the outage count of the same county 48 hours later

df_2021 = df_final_2021.copy()

# 1) sort
df_2021.sort_values(['fips_code', 'run_start_time'], inplace=True)

# 2) min and max of 'sum' for each fips_code
df_2021['sum_min'] = df_2021.groupby('fips_code')['sum'].transform('min')
df_2021['sum_max'] = df_2021.groupby('fips_code')['sum'].transform('max')

# 3) min-max scaling per county; a county whose count never changes
#    (min == max) gets 0.0 everywhere instead of a division by zero
df_2021['sum_norm'] = np.where(
    df_2021['sum_min'] != df_2021['sum_max'],
    (df_2021['sum'] - df_2021['sum_min']) / (df_2021['sum_max'] - df_2021['sum_min']),
    0.0
)

# 4) q-quantile (q = 0.90) of sum_norm for each fips_code
q = 0.90
quantile_stats_2021 = (
    df_2021.groupby('fips_code')['sum_norm']
           .quantile(q)
           .reset_index(name=f'quantile_{int(q*100)}')
)

# 5) merge quantile stats back
df_2021 = pd.merge(df_2021, quantile_stats_2021, on='fips_code', how='left')

# 6) anomaly_flag: 1 if sum_norm >= quantile
# NOTE(review): because the comparison is >=, a county whose quantile equals
# its minimum (e.g. a constant series, where sum_norm is 0.0 everywhere) has
# every row flagged as an anomaly — confirm this is intended.
df_2021['anomaly_flag'] = (
    df_2021['sum_norm'] >= df_2021[f'quantile_{int(q*100)}']
).astype(int)

# 7) drop the helper columns
cols_drop = [
    'sum_min', 'sum_max', 'sum_norm',
    f'quantile_{int(q*100)}'
]
df_2021.drop(columns=cols_drop, inplace=True, errors='ignore')

# 8) second stage: attach the values observed 48 hours later
df_2021_binary = df_2021.copy()

# 9) time_48h = run_start_time + 48 hours
df_2021_binary['time_48h'] = df_2021_binary['run_start_time'] + pd.Timedelta(hours=48)

# 10) lookup table (county, timestamp) -> (sum, anomaly_flag).
#     The county key is fips_code, the same key every groupby in this
#     notebook uses. 'Name' comes from the weather side of a left join and is
#     NaN for outage rows without a weather match; pandas matches NaN keys to
#     each other, so joining on 'Name' would pair unrelated counties and
#     duplicate rows.
df_shifted_2021 = df_2021_binary[['fips_code', 'run_start_time', 'sum', 'anomaly_flag']].copy()
df_shifted_2021.rename(columns={
    'run_start_time': 'time_48h_ahead',
    'sum': 'sum_48',
    'anomaly_flag': 'target_anomaly_48h'
}, inplace=True)

# 11) self-join: each row picks up the record of the same county at +48h
df_2021_binary = pd.merge(
    df_2021_binary,
    df_shifted_2021,
    left_on=['fips_code', 'time_48h'],
    right_on=['fips_code', 'time_48h_ahead'],
    how='left'
)

# 12) rows without a record at +48h get 0 for both targets
# NOTE(review): this labels rows whose future is unknown (no row 48 hours
# later in this frame) as "no anomaly" — consider dropping them instead.
df_2021_binary['sum_48'] = df_2021_binary['sum_48'].fillna(0)
df_2021_binary['target_anomaly_48h'] = df_2021_binary['target_anomaly_48h'].fillna(0).astype(int)

df_2021_binary.drop(columns=['time_48h', 'time_48h_ahead'], inplace=True, errors='ignore')

print("=== 2021 final shape:", df_2021_binary.shape)
print(df_2021_binary[['Name','run_start_time','anomaly_flag','target_anomaly_48h','sum_48']].head(15))

In [None]:

# Target definition (same procedure as for 2021):
# 1. anomaly_flag: per county, 1 when the outage count ('sum') is at or above
#    the county's own 90th percentile (evaluated on the min-max normalised
#    count), else 0
# 2. target_anomaly_48h: the anomaly_flag of the same county 48 hours later
# 3. sum_48: the outage count of the same county 48 hours later

df_2022 = df_final_2022.copy()

# 1) sort
df_2022.sort_values(['fips_code', 'run_start_time'], inplace=True)

# 2) min and max of 'sum' for each fips_code
df_2022['sum_min'] = df_2022.groupby('fips_code')['sum'].transform('min')
df_2022['sum_max'] = df_2022.groupby('fips_code')['sum'].transform('max')

# 3) min-max scaling per county; a county whose count never changes
#    (min == max) gets 0.0 everywhere instead of a division by zero
df_2022['sum_norm'] = np.where(
    df_2022['sum_min'] != df_2022['sum_max'],
    (df_2022['sum'] - df_2022['sum_min']) / (df_2022['sum_max'] - df_2022['sum_min']),
    0.0
)

# 4) q-quantile (q = 0.90) of sum_norm for each fips_code
q = 0.90
quantile_stats_2022 = (
    df_2022.groupby('fips_code')['sum_norm']
           .quantile(q)
           .reset_index(name=f'quantile_{int(q*100)}')
)

# 5) merge quantile stats back
df_2022 = pd.merge(df_2022, quantile_stats_2022, on='fips_code', how='left')

# 6) anomaly_flag: 1 if sum_norm >= quantile
# NOTE(review): because the comparison is >=, a county whose quantile equals
# its minimum (e.g. a constant series, where sum_norm is 0.0 everywhere) has
# every row flagged as an anomaly — confirm this is intended.
df_2022['anomaly_flag'] = (
    df_2022['sum_norm'] >= df_2022[f'quantile_{int(q*100)}']
).astype(int)

# 7) drop the helper columns
cols_drop = [
    'sum_min', 'sum_max', 'sum_norm',
    f'quantile_{int(q*100)}'
]
df_2022.drop(columns=cols_drop, inplace=True, errors='ignore')

# 8) second stage: attach the values observed 48 hours later
df_2022_binary = df_2022.copy()

# 9) time_48h = run_start_time + 48 hours
df_2022_binary['time_48h'] = df_2022_binary['run_start_time'] + pd.Timedelta(hours=48)

# 10) lookup table (county, timestamp) -> (sum, anomaly_flag).
#     The county key is fips_code, the same key every groupby in this
#     notebook uses. 'Name' comes from the weather side of a left join and is
#     NaN for outage rows without a weather match; pandas matches NaN keys to
#     each other, so joining on 'Name' would pair unrelated counties and
#     duplicate rows.
df_shifted_2022 = df_2022_binary[['fips_code', 'run_start_time', 'sum', 'anomaly_flag']].copy()
df_shifted_2022.rename(columns={
    'run_start_time': 'time_48h_ahead',
    'sum': 'sum_48',
    'anomaly_flag': 'target_anomaly_48h'
}, inplace=True)

# 11) self-join: each row picks up the record of the same county at +48h
df_2022_binary = pd.merge(
    df_2022_binary,
    df_shifted_2022,
    left_on=['fips_code', 'time_48h'],
    right_on=['fips_code', 'time_48h_ahead'],
    how='left'
)

# 12) rows without a record at +48h get 0 for both targets
# NOTE(review): this labels rows whose future is unknown (no row 48 hours
# later in this frame) as "no anomaly" — consider dropping them instead.
df_2022_binary['sum_48'] = df_2022_binary['sum_48'].fillna(0)
df_2022_binary['target_anomaly_48h'] = df_2022_binary['target_anomaly_48h'].fillna(0).astype(int)

df_2022_binary.drop(columns=['time_48h', 'time_48h_ahead'], inplace=True, errors='ignore')

print("=== 2022 final shape:", df_2022_binary.shape)
print(df_2022_binary[['Name','run_start_time','anomaly_flag','target_anomaly_48h','sum_48']].head(15))

## 6. Vertical Concatenate

In [None]:

# Order each yearly frame by time, stack 2021 on top of 2022 with a fresh
# 0..n-1 index, then order the combined frame by time once more. The final
# sort keeps the index labels assigned by concat (the index is not reset).
df_2021_binary = df_2021_binary.sort_values(by='run_start_time')
df_2022_binary = df_2022_binary.sort_values(by='run_start_time')

yearly_frames = [df_2021_binary, df_2022_binary]
df = pd.concat(yearly_frames, ignore_index=True).sort_values(by='run_start_time')

df.info()
print(df.head())
print(df.tail())

## 7. IDW Aggregates

In [None]:
def build_distance_matrix(df):
    """
    Build a nested dict of pairwise straight-line distances between counties.

    Parameters:
      - df (DataFrame): must contain the columns 'fips_code', 'x' and 'y'.
        Rows with a missing value in any of the three are ignored.

    Returns:
      dict of dict: dist_matrix[fips_i][fips_j] is the Euclidean distance
      between the (x, y) points of the two counties, expressed in the units
      of x / y, and 0.0 when fips_i == fips_j.
    """
    located = (
        df.loc[:, ['fips_code', 'x', 'y']]
        .drop_duplicates()
        .dropna(subset=['fips_code', 'x', 'y'])
    )

    # fips_code -> (x, y). If a county appears with several distinct
    # coordinate pairs, the last one wins.
    coords = dict(
        zip(located['fips_code'].tolist(), zip(located['x'], located['y']))
    )

    # Full (symmetric) matrix: every county against every county.
    return {
        code_i: {
            code_j: 0.0 if code_i == code_j else math.hypot(x_i - x_j, y_i - y_j)
            for code_j, (x_j, y_j) in coords.items()
        }
        for code_i, (x_i, y_i) in coords.items()
    }

In [None]:
def compute_idw_feature(df, dist_matrix, feature, p=2):
    """
    Add an inverse-distance-weighted (IDW) average of the neighbouring
    counties' values of `feature`, evaluated separately for every timestamp.

    For a row of county c at time t the new value is

        sum_j(w_cj * value_j) / sum_j(w_cj),   with   w_cj = 1 / d(c, j) ** p

    where j runs over the other counties that have a row at time t, a
    non-missing value of `feature` and a positive distance to c in
    `dist_matrix`. When no such neighbour exists the value is NaN.

    Parameters:
      - df (DataFrame): must contain 'run_start_time', 'fips_code' and
        `feature`. It is not modified.
      - dist_matrix (dict of dict): distances as returned by
        build_distance_matrix, dist_matrix[fips_i][fips_j].
      - feature (str): name of the column to average.
      - p (number): exponent of the distance in the weight (default 2, i.e.
        weight = 1 / d^2).

    Returns:
      DataFrame: the rows of `df` ordered by ('run_start_time', 'fips_code'),
      with the original index labels and column dtypes, plus a new column
      "IDW_{feature}". Rows whose run_start_time is missing are not part of
      any time slice and are left out.

    Notes:
      - A neighbour whose value is missing is skipped. Otherwise a single NaN
        would turn the IDW value of every other county in that time slice
        into NaN.
      - A county that is absent from `dist_matrix` gets NaN instead of raising
        KeyError.
      - If a county occurs more than once in a time slice, its last value is
        the one its neighbours see.
    """
    idw_column = f"IDW_{feature}"
    ordered = df.sort_values(['run_start_time', 'fips_code']).copy()

    # 1 / d**p depends only on the pair of counties, not on the timestamp, so
    # it is computed once per county and reused for every time slice.
    weight_cache = {}

    def neighbour_weights(code):
        if code not in weight_cache:
            weight_cache[code] = {
                other: 1.0 / (d ** p)
                for other, d in dist_matrix.get(code, {}).items()
                if other != code and d is not None and d > 0
            }
        return weight_cache[code]

    pieces = []
    for _, group in ordered.groupby('run_start_time'):
        # county -> value for the current time slice, without missing values
        slice_values = dict(zip(group['fips_code'], group[feature]))
        usable_values = {
            code: value
            for code, value in slice_values.items()
            if not pd.isna(value)
        }

        idw_values = []
        for current_code in group['fips_code']:
            weights = neighbour_weights(current_code)
            sum_w = 0.0         # sum of weights
            sum_weighted = 0.0  # sum of weight * neighbour value
            for other_code, other_val in usable_values.items():
                w = weights.get(other_code)
                if w is None:
                    continue  # the county itself, or no usable distance
                sum_w += w
                sum_weighted += w * other_val
            idw_values.append(sum_weighted / sum_w if sum_w > 0 else np.nan)

        pieces.append(group.assign(**{idw_column: idw_values}))

    if not pieces:
        # no time slice at all: keep the schema, return zero rows
        return ordered.iloc[0:0].assign(**{idw_column: np.nan})
    return pd.concat(pieces)

## 8. County Encoding

In [None]:
# Encode the county name ('Name') as an integer id.
# NOTE(review): df_result is not created by any cell visible above —
# presumably it is the output of compute_idw_feature; confirm.
le = LabelEncoder()
county_codes = le.fit_transform(df_result['Name'])
df_result['county_encoded'] = county_codes

# county name -> encoded id, following the order of the encoder's classes
county_mapping = {}
for idx, county in enumerate(le.classes_):
    county_mapping[county] = int(idx)

# The raw 'Name' column is removed now that it is encoded
del df_result['Name']

# Persist the mapping as JSON so the ids can be translated back later
with open('county_mapping.json', 'w') as f:
    f.write(json.dumps(county_mapping))

## 9. NaN Handling

In [None]:
# Remove, in place, every row that has a missing value in any column.
df_result.dropna(axis=0, how='any', inplace=True)