In [1]:
import datetime as dt
import thredds_lsasaf_utils as tlu

import pandas as pd
import geopandas as gpd

from shapely import wkt
from shapely.geometry import Point

import rasterio
from rasterio.mask import mask

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import Power
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import log, identity

import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import patsy


In [2]:
def read_fused_data(filename):
    """Load a fused-data CSV and return it as a point GeoDataFrame.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file; handed unchanged to ``pandas.read_csv``.
        The file must provide ``lat`` and ``lon`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
        Every column of the CSV plus a geometry column holding one point per
        row built from (``lon``, ``lat``), with the CRS set to EPSG:4326.

    Raises
    ------
    ValueError
        If the CSV lacks a ``lat`` or a ``lon`` column.
    """
    df = pd.read_csv(filename)

    # Fail early with a clear message instead of a KeyError further down.
    if 'lat' not in df.columns or 'lon' not in df.columns:
        raise ValueError("The CSV file must contain 'lat' and 'lon' columns")

    # points_from_xy builds the whole geometry array in one vectorised call
    # (x = longitude, y = latitude) instead of one Python-level Point per row.
    geometry = gpd.points_from_xy(df['lon'], df['lat'])

    # Passing the CRS to the constructor avoids a separate in-place set_crs.
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

In [None]:

# Read data from .csv file
fused_raw = read_fused_data('fused_geo_data_june_to_august.csv')

# Define target and LULC columns
target = 'temperature'  # Replace with your target column name
categorical_columns = ['hour', 'month', 'year']

# 'lulc_values' is removed first because it would otherwise match the
# 'lulc_' prefix used below to collect the LULC proportion columns.
fused_trimmed = fused_raw.drop(columns=['lulc_values'])
lulc_columns = [col for col in fused_trimmed.columns if col.startswith('lulc_')]  # LULC proportions

# Drop rows with missing values BEFORE casting to category. Casting first
# would keep levels whose rows are all dropped here; patsy takes its levels
# from the categories of a pandas Categorical, so such unused levels would
# become all-zero dummy columns in the design matrix.
gdf = (
    fused_trimmed
    .dropna(subset=[target, *categorical_columns, *lulc_columns])
    .astype({col: 'category' for col in categorical_columns})
)


In [None]:
# Fixed effects formula (interaction between hour and LULC, month as main effect).
# C(...) makes patsy treat hour and month as categorical, so no dtype
# re-cast of gdf is needed in this cell.
fixed_effects_formula = f"{target} ~ C(hour) * ({' + '.join(lulc_columns)}) + C(month)"

# Design matrices for fixed effects (patsy is imported in the first cell)
y, X = patsy.dmatrices(fixed_effects_formula, data=gdf, return_type='dataframe')

# Random effects (group by 'year').
# patsy drops rows with missing values by default and the returned
# DataFrames keep the index of the rows that survived, so the group labels
# are selected by that index to stay aligned with X and y.
# NOTE(review): assumes gdf has a unique index (true for the default
# RangeIndex produced by pd.read_csv) — confirm if the index is changed.
groups = gdf.loc[X.index, 'year']

# Fit the mixed-effects model; MixedLM is reached through statsmodels.api
# (imported as sm in the first cell) rather than a mid-notebook import.
mixed_model = sm.MixedLM(endog=y, exog=X, groups=groups)
mixed_results = mixed_model.fit()

# Display results
print(mixed_results.summary())
