In [1]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import zipfile

import warnings
warnings.filterwarnings('ignore')

## Fire Data

In [2]:
def list_unzip_files(folder_path, unzip_dbf=False):
    """List every file below ``folder_path`` and optionally extract zip archives.

    Parameters
    ----------
    folder_path : str
        Directory that is walked recursively.
    unzip_dbf : bool, default False
        When True, every ``*.zip`` found anywhere below ``folder_path`` is
        extracted into ``folder_path`` unless a path named like the archive
        without its ``.zip`` suffix already exists.

    Returns
    -------
    list of str
        File paths relative to ``folder_path``. If archives were extracted,
        the listing is refreshed so the extracted files are included.
        A missing folder yields an empty list.
    """
    def _walk():
        # Collect the path of each file relative to the folder root
        return [
            os.path.relpath(os.path.join(root, name), folder_path)
            for root, _dirs, names in os.walk(folder_path)
            for name in names
        ]

    all_files = _walk()
    if unzip_dbf:
        extracted_any = False
        # Iterate over the full listing, not only the last directory visited by os.walk
        for file_path in all_files:
            if not file_path.endswith('.zip'):
                continue
            # Skip archives whose target (archive name without ".zip") already exists
            if os.path.exists(f"{folder_path}/{file_path[:-4]}"):
                continue
            with zipfile.ZipFile(f"{folder_path}/{file_path}", 'r') as zip_ref:
                zip_ref.extractall(folder_path)
            extracted_any = True
        if extracted_any:
            # Refresh the listing so callers also see the extracted files
            all_files = _walk()
    return all_files

In [3]:
def load_data(files, folder_path):
    """Read every shapefile and csv in ``files`` and stack them into one frame.

    ``files`` holds paths relative to ``folder_path``. Entries that end in
    neither ``.shp`` nor ``.csv`` are skipped. Column names are upper-cased
    before the frames are concatenated with a fresh index.
    """
    frames = []
    for name in files:
        full_path = f"{folder_path}/{name}"
        # Pick the reader from the file extension; ignore everything else
        if name.endswith('.shp'):
            frame = gpd.read_file(full_path)
        elif name.endswith('.csv'):
            frame = pd.read_csv(full_path)
        else:
            continue
        # Harmonise the column names across sources
        frame.columns = [str.upper(col) for col in frame.columns]
        frames.append(frame)
    # Stack all sources on top of each other
    return pd.concat(frames, ignore_index=True)

In [4]:
def preprocess_data(data):
    """Clean the raw fire detections and derive the per-fire helper columns.

    The input frame is left untouched; a new frame is returned.

    Steps:
      1. drop the sensor/metadata columns that are not used downstream
         (all of them must be present, otherwise ``KeyError`` is raised),
      2. drop rows that still contain missing values,
      3. convert ``ACQ_DATE`` to ``datetime.date`` and derive ``DAY_OF_YEAR``,
      4. add ``FIRE_ID`` (the surviving index label) as the first column,
      5. add ``GRID_CELL`` as ``"<LATITUDE>_<LONGITUDE>"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw fire data with upper-case column names.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy with the derived columns.
    """
    unused_columns = ['COUNTRY_ID', 'SCAN', 'TRACK', 'SATELLITE', 'CONFIDENCE', 'VERSION', 'FRP', 'ACQ_TIME',
                      'DAYNIGHT', 'BRIGHTNESS', 'BRIGHT_T31', 'TYPE', 'GEOMETRY', 'INSTRUMENT']
    # drop()/dropna() without inplace return new frames, so the caller's data is not modified
    cleaned = data.drop(columns=unused_columns).dropna()
    # Parse value by value: the column may mix timestamps and strings from different sources
    acq_date = cleaned['ACQ_DATE'].apply(lambda x: pd.to_datetime(x).date())
    day_of_year = acq_date.apply(lambda x: x.timetuple().tm_yday)
    cleaned = cleaned.assign(ACQ_DATE=acq_date, DAY_OF_YEAR=day_of_year)
    # Add an ID column (index label after dropping rows) and make it the first column
    cleaned = cleaned.assign(FIRE_ID=cleaned.index)
    cleaned = cleaned[['FIRE_ID'] + [col for col in cleaned.columns if col != 'FIRE_ID']]
    # Generate a grid cell for each datapoint; assign() avoids writing into a slice
    grid_cell = cleaned['LATITUDE'].astype(str) + '_' + cleaned['LONGITUDE'].astype(str)
    return cleaned.assign(GRID_CELL=grid_cell)

In [5]:
def filter_date(data, start_date, end_date):
    """Keep the rows whose ``ACQ_DATE`` lies in ``[start_date, end_date]``.

    Both bounds are inclusive. The result gets a fresh 0..n-1 index.
    """
    on_or_after_start = data['ACQ_DATE'] >= start_date
    on_or_before_end = data['ACQ_DATE'] <= end_date
    # Select the window and renumber the surviving rows
    return data[on_or_after_start & on_or_before_end].reset_index(drop=True)

In [6]:
def filter_border(data, lat, lng, path_to_border, add_oblast_id=False):
    """Keep only the datapoints that fall inside the polygons of a border shapefile.

    Parameters
    ----------
    data : pandas.DataFrame
        Point data with one latitude and one longitude column (WGS84 degrees
        are assumed, since the points are tagged as EPSG:4326).
    lat, lng : str
        Names of the latitude / longitude columns in ``data``.
    path_to_border : str
        Path to the shapefile with the administrative borders. The layer is
        expected to provide the columns ``id``, ``name`` and ``source``.
    add_oblast_id : bool, default False
        If True the polygon ``id`` is kept as ``OBLAST_ID``; otherwise it is dropped.

    Returns
    -------
    The joined rows with upper-case column names, without the geometry column
    and with a fresh index.
    """
    # Load the shapefile containing the administrative borders
    borders = gpd.read_file(path_to_border)
    # Ensure the data is a GeoDataFrame
    data_gdf = gpd.GeoDataFrame(data, geometry=gpd.points_from_xy(data[lng], data[lat]))
    # The points are built from raw lon/lat values, so they only need to be labelled
    data_gdf.set_crs(epsg=4326, inplace=True)
    if borders.crs is None:
        # No CRS stored with the shapefile: label it as WGS84, as before
        borders.set_crs(epsg=4326, inplace=True)
    else:
        # set_crs() only relabels and raises if a different CRS is already declared;
        # reproject instead so the polygons really are in the same CRS as the points
        borders = borders.to_crs(epsg=4326)
    # Perform a spatial join to filter datapoints within the borders
    data_inside = gpd.sjoin(data_gdf, borders, how='inner')
    # Drop the geometry column and the border attributes that are not needed
    data_inside.drop(columns=['geometry', 'source', 'name', 'index_right'], inplace=True)
    # Reset the index
    data_inside.reset_index(drop=True, inplace=True)
    # Make sure all columns are in uppercase
    data_inside.columns = map(str.upper, data_inside.columns)
    if add_oblast_id:
        # Rename the ID column to OBLAST_ID
        data_inside.rename(columns={'ID': 'OBLAST_ID'}, inplace=True)
    else:
        # Drop the ID column
        data_inside.drop(columns=['ID'], inplace=True)
    return data_inside

In [7]:
def generate_cell(data, new_size, round_function):
    """Assign every datapoint to a coarser grid cell and label the cell with an oblast.

    Adds ``LATITUDE_<new_size>``, ``LONGITUDE_<new_size>`` and
    ``GRID_CELL_<new_size>`` to ``data`` (in place), then returns a merged
    frame that additionally carries ``OBLAST_ID_<new_size>``: the most
    frequent ``OBLAST_ID`` among the points of each cell.
    """
    lat_col = f'LATITUDE_{new_size}'
    lon_col = f'LONGITUDE_{new_size}'
    cell_col = f'GRID_CELL_{new_size}'

    # Snap both coordinates to the requested grid
    data[lat_col] = data['LATITUDE'].apply(round_function)
    data[lon_col] = data['LONGITUDE'].apply(round_function)
    # The cell key is "<lat>_<lon>" of the snapped coordinates
    data[cell_col] = data[lat_col].astype(str) + '_' + data[lon_col].astype(str)

    # Majority vote: most frequent OBLAST_ID within each cell
    majority_oblast = (
        data.groupby(cell_col)['OBLAST_ID']
        .agg(lambda x: x.value_counts().idxmax())
        .rename(f'OBLAST_ID_{new_size}')
        .reset_index()
    )
    # Attach the cell-level oblast to every datapoint
    return data.merge(majority_oblast, on=cell_col, how='left')

# Reduce the precision of the coordinates using two decimal places, which is approximately 800m
# (https://support.oxts.com/hc/en-us/articles/115002885125-Level-of-Resolution-of-Longitude-and-Latitude-Measurements)
def round_to_1km(value):
    """Round a coordinate to two decimal places (the grid labelled "1KM")."""
    decimals = 2
    return round(value, decimals)

def round_to_50km(value):
    """Snap a coordinate to the nearest multiple of 0.5 (the grid labelled "50KM")."""
    step = 0.5
    # Count whole steps first, then scale back to coordinate units
    steps = round(value / step)
    return steps * step

In [8]:
def generate_counts(data, resolutions):
    """Count the fires per grid cell and day for every requested resolution.

    ``data`` is sorted by ``ACQ_DATE`` in place, and for each entry ``res`` of
    ``resolutions`` a column ``FIRE_COUNT_CELL_<res>`` is added holding the
    number of rows that share the same ``GRID_CELL_<res>`` and ``ACQ_DATE``.
    The same (modified) frame is returned.
    """
    # Chronological order; the original index labels are kept
    data.sort_values(by='ACQ_DATE', inplace=True)
    for res in resolutions:
        group_keys = [f'GRID_CELL_{res}', 'ACQ_DATE']
        fires_per_cell_day = data.groupby(group_keys)['ACQ_DATE'].transform('count')
        data[f'FIRE_COUNT_CELL_{res}'] = fires_per_cell_day
    return data

In [9]:
# Define the folder paths for shapefiles and csv files
# (all raw fire files are expected below this folder)
folder_path = 'input_data/raw/fires'
# unzip_dbf=True asks list_unzip_files to extract zip archives before loading
files = list_unzip_files(folder_path, unzip_dbf=True)

# Load the data: every .shp/.csv in `files` is read and stacked into one frame
fire_data = load_data(files, folder_path)
fire_data.head()

Unnamed: 0,LATITUDE,LONGITUDE,BRIGHTNESS,SCAN,TRACK,ACQ_DATE,ACQ_TIME,SATELLITE,INSTRUMENT,CONFIDENCE,VERSION,BRIGHT_T31,FRP,DAYNIGHT,TYPE,GEOMETRY,COUNTRY_ID
0,48.2113,38.2567,302.8,2.0,1.4,2000-11-01 00:00:00,812,Terra,MODIS,47,6.03,286.0,16.8,D,2.0,POINT (38.2567 48.2113),
1,48.4833,38.7947,300.4,1.9,1.3,2000-11-01 00:00:00,812,Terra,MODIS,31,6.03,287.0,11.9,D,2.0,POINT (38.7947 48.4833),
2,47.1553,37.5356,309.4,2.1,1.4,2000-11-01 00:00:00,813,Terra,MODIS,69,6.03,287.0,28.8,D,2.0,POINT (37.5356 47.1553),
3,51.0532,25.4886,353.8,1.0,1.0,2000-11-01 00:00:00,950,Terra,MODIS,97,6.03,290.0,76.4,D,0.0,POINT (25.4886 51.0532),
4,50.5311,25.5214,304.6,1.1,1.0,2000-11-01 00:00:00,950,Terra,MODIS,59,6.03,288.1,6.7,D,0.0,POINT (25.5214 50.5311),


In [10]:
# Preprocess the fire data
# NOTE: `fire_data` is rebound to the processed frame, so this cell is not re-runnable
# on its own result (preprocess_data drops columns that are no longer present afterwards).
fire_data = preprocess_data(fire_data)
fire_data.head()

Unnamed: 0,FIRE_ID,LATITUDE,LONGITUDE,ACQ_DATE,DAY_OF_YEAR,GRID_CELL
0,0,48.2113,38.2567,2000-11-01,306,48.2113_38.2567
1,1,48.4833,38.7947,2000-11-01,306,48.4833_38.7947
2,2,47.1553,37.5356,2000-11-01,306,47.1553_37.5356
3,3,51.0532,25.4886,2000-11-01,306,51.0532_25.4886
4,4,50.5311,25.5214,2000-11-01,306,50.5311_25.5214


In [11]:
# Filter the fire data based on a specific date range
# Bounds are datetime.date objects, matching the type preprocess_data stores in ACQ_DATE
start_date = pd.to_datetime('2015-01-01').date()
# NOTE: the upper bound is the day the cell is executed, so the result depends on the run date
end_date = pd.to_datetime('today').date()
fire_data = filter_date(fire_data, start_date, end_date)
fire_data.head()

Unnamed: 0,FIRE_ID,LATITUDE,LONGITUDE,ACQ_DATE,DAY_OF_YEAR,GRID_CELL
0,423607,48.1676,30.6211,2015-01-12,12,48.1676_30.6211
1,423608,48.1637,30.6265,2015-01-13,13,48.1637_30.6265
2,423609,48.1672,30.609,2015-01-13,13,48.1672_30.609
3,423610,49.854,24.2832,2015-01-13,13,49.854_24.2832
4,423611,49.3536,24.0909,2015-01-13,13,49.3536_24.0909


In [12]:
# Filter the fire data based on the administrative borders of Ukraine
# Border shapefile source: https://simplemaps.com/gis/country/ua
# add_oblast_id=True keeps the polygon id of the matching border as OBLAST_ID
fire_data = filter_border(fire_data, 'LATITUDE', 'LONGITUDE', 'input_data/raw/ukr_borders/ua.shp', add_oblast_id=True)
fire_data.head()

Unnamed: 0,FIRE_ID,LATITUDE,LONGITUDE,ACQ_DATE,DAY_OF_YEAR,GRID_CELL,OBLAST_ID
0,423607,48.1676,30.6211,2015-01-12,12,48.1676_30.6211,UA35
1,423608,48.1637,30.6265,2015-01-13,13,48.1637_30.6265,UA48
2,423609,48.1672,30.609,2015-01-13,13,48.1672_30.609,UA35
3,423610,49.854,24.2832,2015-01-13,13,49.854_24.2832,UA46
4,423611,49.3536,24.0909,2015-01-13,13,49.3536_24.0909,UA46


In [13]:
# Assign each datapoint to a grid cell with a size of 1 km
# ("1KM" is a label: round_to_1km rounds the coordinates to two decimal places)
fire_data = generate_cell(fire_data, '1KM', round_to_1km)
# Assign each datapoint to a grid cell with a size of 50 km
# ("50KM" is a label: round_to_50km snaps the coordinates to multiples of 0.5)
fire_data = generate_cell(fire_data, '50KM', round_to_50km)
fire_data.head()

Unnamed: 0,FIRE_ID,LATITUDE,LONGITUDE,ACQ_DATE,DAY_OF_YEAR,GRID_CELL,OBLAST_ID,LATITUDE_1KM,LONGITUDE_1KM,GRID_CELL_1KM,OBLAST_ID_1KM,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM
0,423607,48.1676,30.6211,2015-01-12,12,48.1676_30.6211,UA35,48.17,30.62,48.17_30.62,UA35,48.0,30.5,48.0_30.5,UA35
1,423608,48.1637,30.6265,2015-01-13,13,48.1637_30.6265,UA48,48.16,30.63,48.16_30.63,UA48,48.0,30.5,48.0_30.5,UA35
2,423609,48.1672,30.609,2015-01-13,13,48.1672_30.609,UA35,48.17,30.61,48.17_30.61,UA35,48.0,30.5,48.0_30.5,UA35
3,423610,49.854,24.2832,2015-01-13,13,49.854_24.2832,UA46,49.85,24.28,49.85_24.28,UA46,50.0,24.5,50.0_24.5,UA46
4,423611,49.3536,24.0909,2015-01-13,13,49.3536_24.0909,UA46,49.35,24.09,49.35_24.09,UA46,49.5,24.0,49.5_24.0,UA46


In [14]:
# Generate the number of fires per grid cell for the specific day
# Adds FIRE_COUNT_CELL_1KM and FIRE_COUNT_CELL_50KM; the frame is also sorted by ACQ_DATE
# without renumbering, which is why the index shown below is no longer 0..n-1.
fire_data = generate_counts(fire_data, ['1KM', '50KM'])
fire_data.head()

Unnamed: 0,FIRE_ID,LATITUDE,LONGITUDE,ACQ_DATE,DAY_OF_YEAR,GRID_CELL,OBLAST_ID,LATITUDE_1KM,LONGITUDE_1KM,GRID_CELL_1KM,OBLAST_ID_1KM,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_1KM,FIRE_COUNT_CELL_50KM
193230,921181,47.153061,37.532589,2015-01-01,1,47.153061_37.532589,UA14,47.15,37.53,47.15_37.53,UA14,47.0,37.5,47.0_37.5,UA14,1,4
193233,921184,47.091522,37.612076,2015-01-01,1,47.091522_37.612076,UA14,47.09,37.61,47.09_37.61,UA14,47.0,37.5,47.0_37.5,UA14,3,4
193231,921182,47.089188,37.610622,2015-01-01,1,47.089188_37.610622,UA14,47.09,37.61,47.09_37.61,UA14,47.0,37.5,47.0_37.5,UA14,3,4
193232,921183,47.089619,37.611103,2015-01-01,1,47.089619_37.611103,UA14,47.09,37.61,47.09_37.61,UA14,47.0,37.5,47.0_37.5,UA14,3,4
193234,921185,50.509277,28.744947,2015-01-02,2,50.509277_28.744947,UA18,50.51,28.74,50.51_28.74,UA18,50.5,28.5,50.5_28.5,UA18,1,1


In [15]:
# Save the preprocessed fire data
# index=False: the (non-contiguous) row index is not written to the file
fire_data.to_csv('input_data/processed/fire_data.csv', index=False)

In [2]:
# Load the preprocessed fire data
# Reads back the file written by the save cell above, replacing the in-memory `fire_data`.
# NOTE(review): read_csv is called without date parsing, so ACQ_DATE presumably comes back
# as plain strings rather than datetime.date objects -- confirm before comparing or merging on dates.
fire_data = pd.read_csv('input_data/processed/fire_data.csv')

## Cells Static

In [73]:
def generate_grid(data, col_list):
    """Return the distinct combinations of ``col_list`` found in ``data``.

    The combinations are obtained by grouping on every selected column, so the
    result has one row per observed group and exactly the columns of ``col_list``.
    """
    selected = data[col_list]
    # Grouping by all selected columns leaves one group per unique combination
    per_cell = selected.groupby(col_list).count()
    # Turn the group keys back into ordinary columns
    return per_cell.reset_index()

In [74]:
def generate_population_density(data, path_to_population, resolution, round_function, percentile=0.95):
    """Attach the average population density of each grid cell to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Grid frame containing ``GRID_CELL_<resolution>``, ``LATITUDE_<resolution>``
        and ``LONGITUDE_<resolution>``.
    path_to_population : str
        CSV with exactly three columns, interpreted in this order as
        longitude, latitude and population density.
    resolution : str
        Suffix of the grid columns, e.g. ``'1KM'``.
    round_function : callable
        Maps a single coordinate to its grid coordinate; it must be the same
        function that produced the grid columns of ``data``.
    percentile : float, default 0.95
        Cell averages above this quantile are capped at the quantile value.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-merged with ``POP_DENSITY_<resolution>``; cells without
        population data receive the mean of the matched cells.
    """
    lat_col = f'LATITUDE_{resolution}'
    lon_col = f'LONGITUDE_{resolution}'
    cell_col = f'GRID_CELL_{resolution}'
    density_col = f'POP_DENSITY_{resolution}'

    # Load the population density data
    pop_density_df = pd.read_csv(path_to_population)
    # Rename the columns
    pop_density_df.columns = ['LONGITUDE', 'LATITUDE', 'POP_DENSITY']
    # Round the latitude and longitude
    pop_density_df[lat_col] = pop_density_df['LATITUDE'].apply(round_function)
    pop_density_df[lon_col] = pop_density_df['LONGITUDE'].apply(round_function)
    # Calculate the average population density for each grid cell
    pop_density_avg = pop_density_df.groupby([lat_col, lon_col])['POP_DENSITY'].mean().reset_index()
    # Create a unique identifier for each grid cell
    pop_density_avg[cell_col] = pop_density_avg[lat_col].astype(str) + '_' + pop_density_avg[lon_col].astype(str)
    # Cap all values above the xth percentile at the xth percentile (vectorised, NaN stays NaN)
    percentile_threshold = pop_density_avg['POP_DENSITY'].quantile(percentile)
    pop_density_avg['POP_DENSITY'] = pop_density_avg['POP_DENSITY'].clip(upper=percentile_threshold)
    # Rename the population density column
    pop_density_avg = pop_density_avg.rename(columns={'POP_DENSITY': density_col})
    # Merge the population density data with the grid data
    data = data.merge(pop_density_avg, on=[cell_col, lon_col, lat_col], how='left')
    # Fill missing values with the mean population density.
    # Assign the result back: an in-place fillna on a selected column is a chained
    # operation that does not update the frame under pandas Copy-on-Write.
    data[density_col] = data[density_col].fillna(data[density_col].mean())
    return data

In [75]:
# # Transform the coordinates of the pixels from UTM to WGS84
# # Define the WGS84 projection (EPSG:4326)
# wgs84_proj = Proj('epsg:4326')

# def load_tif_with_reduced_resolution(tif_path, scale_factor):
#     with rio.open(tif_path) as src:
#         # Calculate the new shape
#         new_height = int(src.height * scale_factor)
#         new_width = int(src.width * scale_factor)
#         # Read the data with resampling
#         data = src.read(
#             out_shape=(src.count, new_height, new_width),
#             resampling=rio.enums.Resampling.bilinear
#         )
#         # Scale the transform
#         transform_scaled = src.transform * src.transform.scale(
#             (src.width / data.shape[-1]),
#             (src.height / data.shape[-2])
#         )
#         # Get the UTM projection from the metadata
#         utm_proj = Proj(src.crs)
#         return data, np.array(transform_scaled), utm_proj

# def transform_tif_coordinates(data, transform_tif, utm_proj):
#     # transform the coordinates of the pixels
#     lon = np.zeros(data.shape[1:])
#     lat = np.zeros(data.shape[1:])
#     for i in range(data.shape[1]):
#         for j in range(data.shape[2]):
#             lat[i][j], lon[i][j] = transform(utm_proj, wgs84_proj, transform_tif[2] + transform_tif[0] * j, transform_tif[5] + transform_tif[4] * i)
#     return lon, lat

# def convert_to_dataframe(data, lon, lat, map_dict):
#     # Create a dataframe with the class of the pixel and its longitude and latitude value
#     df = pd.DataFrame(data[0].flatten(), columns=['class'])
#     # get longitude and latitude values and round them to 2 decimal places
#     df['LONGITUDE'] = lon.flatten().round(2)
#     df['LATITUDE'] = lat.flatten().round(2)
#     # Delete all rows with class 0 (no data)
#     df = df[df['class'] != 0]
#     # Map the classes to the new classes
#     df['CLASS'] = df['class'].map(map_dict)
#     df.drop(columns=['class'], inplace=True)
#     # Make sure all columns are in uppercase
#     df.columns = map(str.upper, df.columns)
#     return df

# # Load the TIFF file with reduced resolution
# # Reduce resolution by 95%, i.e., 10m resolution to 200m resolution
# scale_factor = 0.05
# # Dictionary to merge some of the classes into new classes
# # Original classes: 0: 'No data', 1: 'Water', 2: 'Trees', 4: 'Flooded vegetation', 5: 'Crops', 7: 'Built Area', 8: 'Bare ground', 9: 'Snow and ice', 10: 'Clouds', 11: 'Rangeland'
# # New classes: 
# # 0: ['No data', 'Water', 'Snow and ice', 'Clouds'], 
# # 1: ['Trees'], 
# # 2: ['Flooded vegetation', 'Bare ground', 'Rangeland'], 
# # 3: ['Crops'], 
# # 4: ['Built Area']
# class_transform = {0: 0, 1: 0, 2: 1, 4: 2, 5: 3, 7: 4, 8: 2, 9: 0, 10: 0, 11: 2}

# # Iterate over all tif files in the folder
# for tif_file in list_unzip_files('input_data/raw/land_use'):
#     if tif_file.endswith('.tif'):
#         print(f"Initialized {tif_file}")
#         # Load the TIFF file with reduced resolution
#         data, transform_scaled, utm_proj = load_tif_with_reduced_resolution(f"input_data/raw/land_use/{tif_file}", scale_factor)
#         # Transform the coordinates of the pixels
#         lon, lat = transform_tif_coordinates(data, transform_scaled, utm_proj)
#         # Convert the data to a dataframe
#         land_use_df = convert_to_dataframe(data, lon, lat, class_transform)
#         # Save dataframe as a csv file
#         land_use_df.to_csv(f'input_data/processed/{tif_file[:-4]}_005.csv', index=False)
#         print(f"Processed {tif_file}")

In [76]:
def load_land_use(path_to_land_use, resolution):
    """Load and stack the land use csv files of one resolution.

    A csv is loaded when its name ends with ``<resolution>.csv`` and the name
    without the trailing ``_<resolution>.csv`` equals the name (minus a
    four-character extension such as ``.tif``) of another file in the folder,
    i.e. when its source file is present as well.

    Parameters
    ----------
    path_to_land_use : str
        Folder that is searched recursively.
    resolution : str
        Resolution tag in the csv file names, e.g. ``'005'``.

    Returns
    -------
    pandas.DataFrame
        All matching csv files concatenated with a fresh index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no file matches.
    """
    # List the folder once instead of re-walking it for every candidate file
    available = list_unzip_files(path_to_land_use)
    # File names without their four-character extension, for fast membership tests
    stems = {name[:-4] for name in available}
    # Length of the "_<resolution>.csv" tail (8 for a three-character resolution)
    suffix_length = len(f'_{resolution}.csv')
    dataframes = [
        pd.read_csv(f"{path_to_land_use}/{csv_file}")
        for csv_file in available
        if csv_file.endswith(f'{resolution}.csv') and csv_file[:-suffix_length] in stems
    ]
    # ignore_index=True already yields a clean 0..n-1 index
    return pd.concat(dataframes, ignore_index=True)

In [77]:
def calculate_land_use(data, resolution, round_function):
    """Compute the share of each land use class per grid cell.

    ``data`` needs ``LATITUDE``, ``LONGITUDE`` and ``CLASS`` and is modified in
    place (the coordinate columns are replaced by ``GRID_CELL_<resolution>``),
    so pass a copy if the original is still needed.

    ``CLASS`` is one-hot encoded with ``drop_first=True`` and ``dummy_na=True``:
    the first class gets no column, and the indicator for missing classes is
    renamed to ``LAND_USE_CLASS_0_<resolution>``. The per-cell mean of the
    indicator columns is returned, one row per grid cell.
    """
    lat_col = f'LATITUDE_{resolution}'
    lon_col = f'LONGITUDE_{resolution}'
    cell_col = f'GRID_CELL_{resolution}'

    # Snap the coordinates to the grid and build the cell key
    data[lat_col] = data['LATITUDE'].apply(round_function)
    data[lon_col] = data['LONGITUDE'].apply(round_function)
    data[cell_col] = data[lat_col].astype(str) + '_' + data[lon_col].astype(str)
    # Only the cell key is needed from here on
    data.drop(columns=['LATITUDE', 'LONGITUDE', lat_col, lon_col], inplace=True)

    # One indicator column per class (first class dropped, NaN indicator added)
    encoded = pd.get_dummies(data, columns=['CLASS'], prefix='LAND_USE_CLASS', drop_first=True, dummy_na=True)
    # Tag every indicator column with the resolution
    encoded = encoded.rename(columns=lambda col: f'{col}_{resolution}' if 'LAND_USE_CLASS' in col else col)
    # The NaN indicator is exposed under the name of class 0
    encoded = encoded.rename(columns={f'LAND_USE_CLASS_nan_{resolution}': f'LAND_USE_CLASS_0_{resolution}'})
    # Mean of the indicators = share of each class within the cell
    return encoded.groupby([cell_col]).mean().reset_index()

def merge_land_use(data, land_use_data, resolution):
    """Left-join the per-cell land use shares onto the grid.

    The join key is ``GRID_CELL_<resolution>``. Every missing value in the
    merged frame (not only in the land use columns) is replaced by 0.
    """
    cell_col = f'GRID_CELL_{resolution}'
    merged = data.merge(land_use_data, on=cell_col, how='left')
    # Cells without land use information end up with a share of 0
    return merged.fillna(0)

In [78]:
# Generate the grid
# One row per distinct combination of the raw coordinates and their 1KM / 50KM cells
# (the fire counts and dates are deliberately left out, so the grid is static).
grid = generate_grid(fire_data, ['LATITUDE', 'LONGITUDE', 'GRID_CELL', 'OBLAST_ID',
                                      'LATITUDE_1KM', 'LONGITUDE_1KM', 'GRID_CELL_1KM', 'OBLAST_ID_1KM',
                                      'LATITUDE_50KM', 'LONGITUDE_50KM', 'GRID_CELL_50KM', 'OBLAST_ID_50KM'])
grid.head()

Unnamed: 0,LATITUDE,LONGITUDE,GRID_CELL,OBLAST_ID,LATITUDE_1KM,LONGITUDE_1KM,GRID_CELL_1KM,OBLAST_ID_1KM,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM
0,44.393223,33.956463,44.393223_33.956463,UA43,44.39,33.96,44.39_33.96,UA43,44.5,34.0,44.5_34.0,UA43
1,44.399612,33.970333,44.399612_33.970333,UA43,44.4,33.97,44.4_33.97,UA43,44.5,34.0,44.5_34.0,UA43
2,44.407276,33.984516,44.407276_33.984516,UA43,44.41,33.98,44.41_33.98,UA43,44.5,34.0,44.5_34.0,UA43
3,44.42485,34.020508,44.42485_34.020508,UA43,44.42,34.02,44.42_34.02,UA43,44.5,34.0,44.5_34.0,UA43
4,44.425514,34.015533,44.425514_34.015533,UA43,44.43,34.02,44.43_34.02,UA43,44.5,34.0,44.5_34.0,UA43


In [79]:
# Calculate the population density for each grid cell
# The same population file is read twice, once per resolution; each call adds one
# POP_DENSITY_<resolution> column using the rounding function that built that grid.
grid = generate_population_density(grid, 'input_data/raw/population/ukr_pd_2020_1km_UNadj_ASCII_XYZ.csv', '1KM', round_to_1km)
grid = generate_population_density(grid, 'input_data/raw/population/ukr_pd_2020_1km_UNadj_ASCII_XYZ.csv', '50KM', round_to_50km)
grid.head()

Unnamed: 0,LATITUDE,LONGITUDE,GRID_CELL,OBLAST_ID,LATITUDE_1KM,LONGITUDE_1KM,GRID_CELL_1KM,OBLAST_ID_1KM,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,POP_DENSITY_1KM,POP_DENSITY_50KM
0,44.393223,33.956463,44.393223_33.956463,UA43,44.39,33.96,44.39_33.96,UA43,44.5,34.0,44.5_34.0,UA43,53.258136,133.344432
1,44.399612,33.970333,44.399612_33.970333,UA43,44.4,33.97,44.4_33.97,UA43,44.5,34.0,44.5_34.0,UA43,194.500009,133.344432
2,44.407276,33.984516,44.407276_33.984516,UA43,44.41,33.98,44.41_33.98,UA43,44.5,34.0,44.5_34.0,UA43,194.500009,133.344432
3,44.42485,34.020508,44.42485_34.020508,UA43,44.42,34.02,44.42_34.02,UA43,44.5,34.0,44.5_34.0,UA43,194.500009,133.344432
4,44.425514,34.015533,44.425514_34.015533,UA43,44.43,34.02,44.43_34.02,UA43,44.5,34.0,44.5_34.0,UA43,194.500009,133.344432


In [80]:
# Load land use data with a resolution of 200m
# ('005' is the resolution tag in the csv file names)
land_use_data = load_land_use('input_data/raw/land_use', '005')
# Merge the land use data with the grid data
# calculate_land_use modifies its argument in place, hence a fresh copy per resolution
grid = merge_land_use(grid, calculate_land_use(land_use_data.copy(), '1KM', round_to_1km), '1KM')
grid = merge_land_use(grid, calculate_land_use(land_use_data.copy(), '50KM', round_to_50km), '50KM')
grid.head()

Unnamed: 0,LATITUDE,LONGITUDE,GRID_CELL,OBLAST_ID,LATITUDE_1KM,LONGITUDE_1KM,GRID_CELL_1KM,OBLAST_ID_1KM,LATITUDE_50KM,LONGITUDE_50KM,...,LAND_USE_CLASS_1.0_1KM,LAND_USE_CLASS_2.0_1KM,LAND_USE_CLASS_3.0_1KM,LAND_USE_CLASS_4.0_1KM,LAND_USE_CLASS_0_1KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,44.393223,33.956463,44.393223_33.956463,UA43,44.39,33.96,44.39_33.96,UA43,44.5,34.0,...,0.0,0.0,0.0,0.2,0.0,0.541633,0.110739,0.033369,0.04967,0.0
1,44.399612,33.970333,44.399612_33.970333,UA43,44.4,33.97,44.4_33.97,UA43,44.5,34.0,...,0.458333,0.125,0.0,0.416667,0.0,0.541633,0.110739,0.033369,0.04967,0.0
2,44.407276,33.984516,44.407276_33.984516,UA43,44.41,33.98,44.41_33.98,UA43,44.5,34.0,...,0.545455,0.136364,0.272727,0.045455,0.0,0.541633,0.110739,0.033369,0.04967,0.0
3,44.42485,34.020508,44.42485_34.020508,UA43,44.42,34.02,44.42_34.02,UA43,44.5,34.0,...,0.4,0.1,0.25,0.25,0.0,0.541633,0.110739,0.033369,0.04967,0.0
4,44.425514,34.015533,44.425514_34.015533,UA43,44.43,34.02,44.43_34.02,UA43,44.5,34.0,...,0.75,0.208333,0.041667,0.0,0.0,0.541633,0.110739,0.033369,0.04967,0.0


In [82]:
# Save the static cell data
# (grid cells with population density and land use shares; row index not written)
grid.to_csv('input_data/processed/cell_static.csv', index=False)

## Cells Dynamic

In [33]:
def load_weather_data(data, path_to_weather, file_name):
    """Load the per-oblast weather files and average them per oblast and day.

    For every distinct ``OBLAST_ID`` in ``data`` the file
    ``<path_to_weather>/<file_name>_<OBLAST_ID>.csv`` is read. The files need
    a ``time`` column (any letter case); all remaining columns are averaged
    per oblast and calendar day.

    Returns a frame with ``OBLAST_ID`` first, then ``ACQ_DATE`` (as
    ``datetime.date``) and the averaged feature columns.
    """
    per_oblast = []
    for oblast_id in data['OBLAST_ID'].unique():
        frame = pd.read_csv(f'{path_to_weather}/{file_name}_{oblast_id}.csv')
        # Remember which oblast the rows belong to
        frame['OBLAST_ID'] = oblast_id
        per_oblast.append(frame)

    # Stack all oblasts, remove exact duplicate rows and renumber
    combined = pd.concat(per_oblast, ignore_index=True).drop_duplicates().reset_index(drop=True)
    # Upper-case column names, consistent with the fire data
    combined.columns = [str.upper(col) for col in combined.columns]
    # Reduce each timestamp to its calendar day
    combined['TIME'] = combined['TIME'].apply(lambda x: pd.to_datetime(x).date())

    # Daily mean of every feature per oblast; TIME becomes ACQ_DATE
    daily = (
        combined.groupby(['OBLAST_ID', 'TIME'])
        .mean()
        .reset_index()
        .rename(columns={'TIME': 'ACQ_DATE'})
    )
    # OBLAST_ID goes first, the other columns keep their order
    ordered_columns = ['OBLAST_ID'] + [col for col in daily.columns if col != 'OBLAST_ID']
    return daily[ordered_columns]

def get_weather_and_cloud_data(data, path_to_weather_and_clouds, weather_file_name, cloud_file_name):
    """Combine the daily weather and cloud cover features per oblast.

    Both file families are loaded with ``load_weather_data`` from the same
    folder. The cloud features are left-joined onto the weather features on
    ``OBLAST_ID`` and ``ACQ_DATE``, so every weather row is kept.
    """
    daily_weather = load_weather_data(data, path_to_weather_and_clouds, weather_file_name)
    daily_clouds = load_weather_data(data, path_to_weather_and_clouds, cloud_file_name)
    # Weather rows drive the result; cloud columns are attached where available
    join_keys = ['OBLAST_ID', 'ACQ_DATE']
    return daily_weather.merge(daily_clouds, on=join_keys, how='left')

In [34]:
# Generate weather data for each oblast
# Files are looked up as <folder>/<file name>_<OBLAST_ID>.csv for every oblast present in fire_data;
# the result has one row per oblast and day.
weather_data = get_weather_and_cloud_data(fire_data, 'input_data/raw/weather', 'weather_by_day_oblast', 'cloud_cover_by_hour_oblast')
weather_data.head()

Unnamed: 0,OBLAST_ID,ACQ_DATE,TEMPERATURE_2M_MAX (°C),TEMPERATURE_2M_MIN (°C),TEMPERATURE_2M_MEAN (°C),RAIN_SUM (MM),SNOWFALL_SUM (CM),WIND_DIRECTION_10M_DOMINANT (°),CLOUD_COVER (%)
0,UA05,2015-01-01,0.2,-8.8,-3.4,0.0,0.49,283.0,93.458333
1,UA05,2015-01-02,3.4,-2.0,0.5,1.8,0.98,276.0,87.708333
2,UA05,2015-01-03,2.9,0.6,1.6,0.6,0.77,278.0,78.958333
3,UA05,2015-01-04,1.8,-1.1,0.3,0.1,0.49,276.0,78.0
4,UA05,2015-01-05,-0.5,-3.1,-1.6,0.0,2.31,308.0,74.708333


In [35]:
# Save the dynamic cell data
# (daily weather and cloud cover per oblast; row index not written)
weather_data.to_csv('input_data/processed/cell_dynamic.csv', index=False)