In [1]:
import datetime as dt
import thredds_lsasaf_utils as tlu

import pandas as pd
import geopandas as gpd

from shapely import wkt
from shapely.geometry import Point

import rasterio
from rasterio.mask import mask

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import Power
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import log, identity

import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import patsy


In [8]:
def read_fused_data(filename):
    """Load a CSV of fused observations as a point GeoDataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file, read with ``pd.read_csv``. The file must
        provide ``lat`` and ``lon`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
        Every CSV column plus a ``geometry`` column holding one point per
        row built as (x=lon, y=lat), tagged with CRS EPSG:4326.

    Raises
    ------
    ValueError
        If the ``lat`` or ``lon`` column is missing from the CSV.
    """
    df = pd.read_csv(filename)

    # Fail early with an explicit message rather than a KeyError below.
    if 'lat' not in df.columns or 'lon' not in df.columns:
        raise ValueError("The CSV file must contain 'lat' and 'lon' columns")

    # Build the whole geometry array in one vectorised call instead of
    # creating one Python-level Point object per row.
    geometry = gpd.points_from_xy(df['lon'], df['lat'])

    # NOTE(review): EPSG:4326 (WGS84) is assumed for the coordinates, as in
    # the previous implementation -- confirm against the data source.
    # Passing crs to the constructor avoids the in-place set_crs mutation.
    return gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

In [None]:

# Load the fused observations (re-read here so the cell is re-runnable).
gdf = read_fused_data('fused_geo_data_june_to_august.csv')

# Define target and LULC columns
target = 'temperature'  # Replace with your target column name
# 'lulc_values' is dropped first so the 'lulc_' prefix match below does not
# pick it up as a predictor.
gdf = gdf.drop(columns=['lulc_values'])
lulc_columns = [col for col in gdf.columns if col.startswith('lulc_')]  # LULC proportions

# Treat the time components as categorical factors rather than numbers.
for time_col in ('hour', 'month', 'year'):
    gdf[time_col] = gdf[time_col].astype('category')

# Drop rows with missing values in any column required below.
gdf = gdf.dropna(subset=[target, 'hour', 'month', 'year'] + lulc_columns)

# Hour interacts with every LULC column; month enters as a main effect only.
# Alternative with a full hour x month x LULC interaction:
# interaction_formula = f"{target} ~ C(hour) * C(month) * ({' + '.join(lulc_columns)})"
interaction_formula = f"{target} ~ C(hour) * ({' + '.join(lulc_columns)}) + C(month)"

# Step 2: Split the data into training and testing sets (75% train / 25% test).
# test_size is the fraction held out for testing; it was previously 0.75,
# which trained the model on only a quarter of the rows.
train_data, test_data = train_test_split(gdf, test_size=0.25, random_state=42)

# Step 3: Generate design matrices for train and test sets.
# The test matrices reuse the design info learned from the training data so
# their columns and coding are guaranteed to match what the model was fit on.
y_train, X_train = patsy.dmatrices(interaction_formula, data=train_data, return_type='dataframe')
y_test, X_test = patsy.build_design_matrices(
    [y_train.design_info, X_train.design_info],
    test_data,
    return_type='dataframe',
)

# Step 4: Fit the GLM model on the training data
# NOTE(review): the Gamma family assumes a strictly positive response --
# confirm the temperature units/range satisfy this.
gamma_model = sm.GLM(y_train, X_train, family=sm.families.Gamma(link=sm.families.links.identity()))
gamma_results = gamma_model.fit()

# Step 5: Display model summary
print(gamma_results.summary())


In [None]:

# Score the held-out design matrix with the fitted GLM.
y_pred = gamma_results.predict(X_test)

# Scatter of observed against predicted values; the red diagonal marks
# where prediction equals observation.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.7, edgecolors='k', label='Data points')
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, color='red', label='Perfect Fit')
ax.set_xlabel("Observed Temperature")
ax.set_ylabel("Predicted Temperature")
ax.set_title("Observed vs Predicted Temperature")
ax.legend()
ax.grid(True)
plt.show()

In [None]:

import pickle

# Persist the fitted results object so it can be reloaded without refitting.
with open('glm_model.pkl', 'wb') as model_file:
    pickle.dump(gamma_results, model_file)
print("GLM model saved as 'glm_model.pkl'")

# Write the plain-text summary table next to the pickle.
with open('glm_model_summary.txt', 'w') as summary_file:
    summary_file.write(gamma_results.summary().as_text())
print("GLM model summary saved as 'glm_model_summary.txt'")

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
from scipy.stats import pearsonr

# RMSLE (Root Mean Squared Logarithmic Error) is the square root of the
# mean squared log error between observed and predicted values.
msle = mean_squared_log_error(y_test, y_pred)
rmsle = np.sqrt(msle)
print(f"RMSLE: {rmsle:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Build the plotting frame in one step:
#   * month_hour is a combined label used on the x-axis;
#   * observed / predicted are attached by index alignment, not by position,
#     so re-running this cell after test_data has been sorted still pairs
#     every row with its own values;
#   * assign() returns a new frame, so the split result is not modified
#     in place.
test_data = test_data.assign(
    month_hour=test_data['month'].astype(str) + '-' + test_data['hour'].astype(str),
    observed=y_test.iloc[:, 0],
    predicted=y_pred,
).sort_values(by=['month', 'hour'])

# Plot observed vs predicted along the sorted month-hour axis.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(test_data['month_hour'], test_data['observed'], label='Observed Temperature', marker='o', linestyle='-', color='blue')
ax.plot(test_data['month_hour'], test_data['predicted'], label='Predicted Temperature', marker='x', linestyle='--', color='red')
ax.set_xlabel('Month-Hour')
ax.set_ylabel('Temperature')
ax.set_title('Time Series: Observed vs Predicted Temperature')
ax.tick_params(axis='x', rotation=90)  # Rotate x-axis labels for better readability
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()