In [None]:
import pandas as pd
import numpy as np
import datetime
import os
import json
import matplotlib.pyplot as plt
import geopandas as gpd
import datetime
from shapely import geometry

In [None]:
# Build a shapely polygon from the state-boundary JSON file.
state_boundary_file_path = r"data/input/us-state-boundaries.json"

with open(state_boundary_file_path) as boundary_file:
    data = json.load(boundary_file)

# Record 0 -> geometry -> first polygon -> first (outer) ring of coordinates.
# NOTE(review): assumes record 0 of the file is California — confirm.
ca_outer_ring = data[0]['st_asgeojson']['geometry']['coordinates'][0][0]
CA = geometry.Polygon(ca_outer_ring)
CA

In [None]:
# Read static_variables.csv, using its first column as the index, and show it.
static_variables_path = "data/input/static_variables.csv"
out_df = pd.read_csv(static_variables_path, index_col=[0])
out_df

### Temperature and precipitation

In [None]:
# Merge every per-station weather CSV found in the top level of `path` into
# one wide table per variable (rows: DATE, columns: STATION / LATITUDE /
# LONGITUDE) and save each table as data/input/WEATHER/<variable>.csv.
variable_dict = {"PRCP": None, "TAVG": None}

path = "data/input/WEATHER"

pivot_columns = ["STATION", "LATITUDE", "LONGITUDE"]

# The merged tables are written into `path` itself. Without this exclusion a
# second run would read PRCP.csv / TAVG.csv as if they were station files and
# fail in `pivot` because they have no STATION column.
generated_files = {f"{variable}.csv" for variable in variable_dict}

for root, dirs, files in os.walk(path):
    for file in files:
        if not file.endswith(".csv") or file in generated_files:
            continue
        df = pd.read_csv(os.path.join(root, file))

        for variable in variable_dict:
            sub_df = df.pivot(index="DATE", columns=pivot_columns, values=variable)
            if variable_dict[variable] is None:
                variable_dict[variable] = sub_df
            else:
                # Kept as a pairwise concat so the resulting row order is
                # unchanged; a later cell slices these rows by position.
                variable_dict[variable] = pd.concat([variable_dict[variable], sub_df], axis=1)
    # Only the top-level directory is scanned; sub-directories are ignored.
    break

for variable, val in variable_dict.items():
    if val is None:
        raise FileNotFoundError(f"no station CSV files found in {path!r}")
    val.to_csv(f"data/input/WEATHER/{variable}.csv")

In [None]:
# Split each merged variable table into one CSV per date row, listing the
# stations that reported a value on that row as LONGITUDE / LATITUDE / VALUES.
variable_dict = {'PRCP': None, 'TAVG': None}

# Positional offset of the first row that gets exported.
# NOTE(review): 240 is a magic number; the leading rows of the file also hold
# the LATITUDE / LONGITUDE header rows. Confirm which date this corresponds to.
FIRST_EXPORTED_ROW = 240

for variable in variable_dict:
    df = pd.read_csv(f"data/input/WEATHER/{variable}.csv", index_col=[0])
    # Drop every column whose name contains a dot. Presumably these are the
    # "<name>.1"-style names pandas gives to repeated headers — confirm that
    # no genuine station id contains a dot.
    df = df[[col for col in df.columns if '.' not in col]]

    for i, row in df[FIRST_EXPORTED_ROW:].iterrows():
        # Keep only the stations that have a value on this row.
        sub_df = df.loc[["LATITUDE", "LONGITUDE", i], row[~pd.isna(row)].index]
        vals = sub_df.values.T
        month_df = pd.DataFrame({"LONGITUDE": vals[:, 1], "LATITUDE": vals[:, 0], "VALUES": vals[:, 2]})

        # One record per coordinate pair (first occurrence wins). Comparing the
        # two columns directly replaces the row-wise string-join helper column
        # and also works when the row has no observations at all.
        month_df = month_df.drop_duplicates(subset=["LONGITUDE", "LATITUDE"])

        month_df.to_csv(f"data/input/WEATHER/{variable}/station_data/{i}.csv", index=False)

### Drought index

In [None]:
import netCDF4
from datetime import datetime

In [None]:
# OPeNDAP subset request: lat / lon / drought-index / day slices of the
# aggregated PDSI dataset.
# NOTE(review): the drought-index time slice starts at 2230 while the `day`
# slice starts at 2530 — confirm the two are meant to differ.
url = (
    'http://thredds.northwestknowledge.net/thredds/dodsC/'
    'agg_met_pdsi_1979_CurrentYear_CONUS.nc'
    '?lat[170:1:425]'
    ',lon[0:1:250]'
    ',daily_mean_palmer_drought_severity_index[2230:1:3100][170:1:425][0:1:250]'
    ',day[2530:1:3100]'
)

In [None]:
# Open the dataset addressed by `url` (a remote http endpoint, so this needs network access).
dataset = netCDF4.Dataset(url)

In [None]:
# Show the names of the variables the opened dataset exposes.
dataset.variables.keys()

In [None]:
# Coordinate and time vectors delivered by the subset request.
lats = dataset.variables['lat'][:].data.flatten()
lons = dataset.variables['lon'][:].data.flatten()
days = dataset.variables['day'][:].data

# Thin the grid to every 6th point in both directions, then flatten the
# lon/lat mesh into two parallel 1-D arrays.
lon_mesh, lat_mesh = np.meshgrid(lons[::6], lats[::6])
xx = lon_mesh.flatten()
yy = lat_mesh.flatten()

In [None]:
# Export the drought index on the thinned grid, one CSV per time step,
# keeping only the grid points that fall inside the polygon `CA`.

# The point-in-polygon test is evaluated once on the full thinned grid.
# Applying it before `dropna` keeps it valid for every time step, even if the
# set of missing cells differs between steps.
mask = np.array(
    [CA.contains(geometry.Point(lon, lat)) for lon, lat in zip(xx, yy)],
    dtype=bool,
)

for i, day in enumerate(days[::2]):
    # `day` is treated as a day count relative to 1900-01-01.
    # NOTE(review): confirm the "- 2" offset against the dataset's time units.
    dt = datetime.fromordinal(datetime(1900, 1, 1).toordinal() + int(day) - 2).strftime("%Y-%m-%d")

    # NOTE(review): the date comes from days[2 * i] but the grid is read at
    # time index i, and the URL requests different time ranges for the two
    # variables — confirm the date labels line up with the data.
    data = dataset.variables['daily_mean_palmer_drought_severity_index'][i][::6, ::6]

    df = pd.DataFrame({"lon": xx, "lat": yy, "values": data.flatten()})
    df = df.loc[mask].dropna().reset_index(drop=True)
    df.to_csv(f"data/input/DROUGHT/grid_data/{dt}.csv")

## After kriging

In [None]:
# For each variable and calendar month, stack the kriged CSVs whose file name
# ("<year>-<month>-<suffix>.csv") matches that month into a single file.
for variable in ["PRCP", "TAVG"]:
    kriged_dir = f"data/input/WEATHER/{variable}/kriged_data"
    for year in range(2015, 2022):
        for month in range(1, 13):
            dfs = []
            for root, dirs, files in os.walk(kriged_dir):
                for file in files:
                    # Strip ".csv" and parse the three integer name parts.
                    y, m, _ = map(int, file[:-4].split("-"))
                    if (y, m) == (year, month):
                        dfs.append(pd.read_csv(os.path.join(root, file), index_col=[0]))
                    # Stop scanning this directory once 9 files are collected.
                    if len(dfs) == 9:
                        break
            monthly = pd.concat(dfs)
            monthly.to_csv(f'data/input/WEATHER/{variable}/monthly_data/{year}-{month}.csv')

In [None]:
# Group the kriged drought files by the first two dash-separated parts of
# their file name: (year, month) -> list of full paths.
month_dict = {}

for root, _, files in os.walk("data/input/DROUGHT/kriged_data"):
    for file in files:
        name_split = file.split('-')
        month_key = (name_split[0], name_split[1])
        month_dict.setdefault(month_key, []).append(os.path.join(root, file))
month_dict

In [None]:
# For every (year, month) group, average `kriged_val` over the days of the
# month and write the result to monthly_data/<year>-<month>.csv.
for key, paths in month_dict.items():
    print(key)

    # Group this month's files by day: the key is (last 4 chars before the
    # first dash, second part, third part minus its final 6 characters).
    daily_dict = {}
    for path in paths:
        daily_split = path.split('-')
        day_key = (daily_split[0][-4:], daily_split[1], daily_split[2][:-6])
        daily_dict.setdefault(day_key, []).append(path)

    df = None
    for day, day_paths in daily_dict.items():
        # Stack all files belonging to the same day, in listing order.
        day_frames = (pd.read_csv(day_path, index_col=[0]) for day_path in day_paths)
        sub_df = next(day_frames)
        for frame in day_frames:
            sub_df = pd.concat([sub_df, frame])

        # The first day seeds the accumulator; later days are added to it.
        if df is None:
            df = sub_df.copy()
        else:
            df['kriged_val'] += sub_df['kriged_val']

    # Turn the running sum into a mean over the number of days found.
    n_days = len(daily_dict)
    df['kriged_val'] = df['kriged_val'] / n_days

    df.to_csv(f"data/input/DROUGHT/monthly_data/{key[0]}-{key[1]}.csv")

In [None]:
# Lookup tables for the three months preceding each calendar month.
# year_map[m][k]  -> year offset (0 or -1) of the (k+1)-th previous month.
# month_map[m][k] -> calendar month (1..12) of the (k+1)-th previous month.
_lags = (1, 2, 3)

year_map = {
    m: [0 if m - lag >= 1 else -1 for lag in _lags]
    for m in range(1, 13)
}

month_map = {
    m: [(m - lag - 1) % 12 + 1 for lag in _lags]
    for m in range(1, 13)
}

In [None]:
# Build one feature table per (year, month): for PRCP, TAVG and PDSI, the
# kriged values of each of the three preceding months become the columns
# <VAR>_prev1 .. <VAR>_prev3.
# The whole body has to run inside the month loop (the original cell left it
# un-indented, which is a syntax error).
for year in range(2015, 2022):
    for month in range(1, 13):
        # (month, year) pairs of the three preceding months. For January to
        # March this reaches back into the previous year, so monthly files
        # for late 2014 must exist.
        previous_months = list(zip(month_map[int(month)],
                                   [int(year) + z for z in year_map[int(month)]]))

        month_df = pd.DataFrame()
        for variable in ["PRCP", "TAVG"]:
            for i, (m, y) in enumerate(previous_months):
                vals = pd.read_csv(f"data/input/WEATHER/{variable}/monthly_data/{y}-{m}.csv",
                                   index_col=[0])["kriged_val"].values
                print(variable, m, y, vals.shape)
                month_df[f"{variable}_prev{i+1}"] = vals

        for i, (m, y) in enumerate(previous_months):
            # Drought monthly files use a zero-padded month in their name.
            vals = pd.read_csv(f"data/input/DROUGHT/monthly_data/{y}-{m:02d}.csv",
                               index_col=[0])["kriged_val"].values
            month_df[f"PDSI_prev{i+1}"] = vals

        month_df.to_csv(f"data/datasets/raw_datasets/{year}-{month}.csv")

### Target variable

In [None]:
# Load the fire-perimeter geodatabase and preview its first rows.
fire_perimeter_path = "data/input/FIRE_DATABASE/S_USA.FinalFirePerimeter.gdb"
gdf = gpd.read_file(fire_perimeter_path)

gdf.head()

In [None]:
# Collect the index labels of every fire whose geometry has at least one
# component polygon intersecting the polygon `CA`.
isin_CA = set()

for fire_index, fire_geometry in gdf["geometry"].items():
    # Rows without a geometry cannot be tested.
    if fire_geometry is None:
        continue
    if any(CA.intersects(poly) for poly in fire_geometry.geoms):
        isin_CA.add(fire_index)

In [None]:
# Restrict to the fires flagged in `isin_CA`, keep those with FIREYEAR >= 2015
# and a non-missing discovery timestamp, then reduce the timestamp to a date.
df = gdf.loc[list(isin_CA)]

is_recent = df["FIREYEAR"] >= 2015
has_discovery_time = df["DISCOVERYDATETIME"].notna()
indices = df.index[is_recent & has_discovery_time]

df = df.loc[indices]
discovery_timestamps = pd.to_datetime(df["DISCOVERYDATETIME"])
df["DISCOVERYDATETIME"] = discovery_timestamps.dt.date

In [None]:
# Bounding box of every component polygon of every fire, keyed by
# (fire index label, polygon position) and stored as
# (lon_max, lon_min, lat_max, lat_min).
fire_boundaries = {}

for i, fire in df["geometry"].items():
    for j, fire_poly in enumerate(fire.geoms):
        # Exterior ring coordinates: x values first, y values second.
        ring_x, ring_y = fire_poly.exterior.xy
        fire_boundaries[(i, j)] = (
            np.max(ring_x), np.min(ring_x),
            np.max(ring_y), np.min(ring_y),
        )

In [None]:
# Add the binary TARGET column to every monthly dataset: 1 when the grid point
# lies inside the perimeter of a fire discovered in that month, otherwise 0.

# Imported here because an earlier cell rebinds the name `datetime` to the
# `datetime.datetime` class, which makes `datetime.date(y, m, d)` fail.
from datetime import date

# NOTE(review): an earlier cell reads data/input/static_variables.csv —
# confirm which of the two locations is the right one.
static_df = pd.read_csv("data/datasets/static_variables.csv", index_col=[0])

for year in range(2015, 2022):
    for month in range(1, 13):
        # Use the loop's own year/month. The original used leftover `y` / `m`
        # globals from an earlier cell and so hit the same file every time.
        dataset_path = f"data/datasets/raw_datasets/{year}-{month}.csv"
        month_df = pd.read_csv(dataset_path, index_col=[0])

        # Half-open interval [first day of month, first day of next month).
        start = date(year, month, 1)
        end = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
        sub_df = df[(df["DISCOVERYDATETIME"] >= start) & (df["DISCOVERYDATETIME"] < end)]

        # Component polygons of this month's fires, keyed like `fire_boundaries`.
        monthly_fires = {}
        for i, fire in sub_df["geometry"].to_dict().items():
            for j, fire_poly in enumerate(fire.geoms):
                monthly_fires[(i, j)] = fire_poly

        # One 0/1 flag per row of `static_df`.
        # NOTE(review): assumes `month_df` rows are in the same order as
        # `static_df` rows — confirm.
        month_target_var = []
        for lon, lat in static_df[["lon", "lat"]].values:
            burned = 0
            for fire_key, fire_poly in monthly_fires.items():
                lon_max, lon_min, lat_max, lat_min = fire_boundaries[fire_key]
                # Cheap bounding-box rejection before the exact geometric test.
                if not (lon_min < lon < lon_max and lat_min < lat < lat_max):
                    continue
                if fire_poly.contains(geometry.Point(lon, lat)):
                    burned = 1
                    break
            month_target_var.append(burned)

        month_df['TARGET'] = month_target_var

        month_df.to_csv(dataset_path)