In [52]:
import datetime as dt
import thredds_lsasaf_utils as tlu

import pandas as pd
import geopandas as gpd

from shapely import wkt
from shapely.geometry import Point

import rasterio
from rasterio.mask import mask

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import Power
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import log, identity

import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import patsy

import common_utils

In [53]:
def read_fused_data(filename):

    # Replace 'your_file.csv' with the path to your CSV file
    file_path = filename

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)

    # Ensure the CSV contains 'lat' and 'lon' columns
    if 'lat' not in df.columns or 'lon' not in df.columns:
        raise ValueError("The CSV file must contain 'lat' and 'lon' columns")

    # Create a GeoDataFrame
    geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry)

    # Set the coordinate reference system (CRS) if known, e.g., WGS84 (EPSG:4326)
    gdf.set_crs(epsg=4326, inplace=True)

    return gdf

In [54]:
# Read data from .csv file
gdf = read_fused_data('../msg_lu_fused_data_for_glm/lst_data._bkup.csv')
# file_path = '../msg_lu_fused_data_for_glm/msg_lu_fused_data_for_glm.gpkg'
# gdf =  gpd.read_file(file_path)  # Read the .gpkg file

In [55]:

# Define target and LULC columns
target = 'temperature'  # Replace with your target column name

# Convert to Kelvin
gdf['temperature'] = gdf['temperature'] + 273.15

# Ensure categorical variables are treated as such
gdf['hour'] = gdf['hour'].astype('category')
gdf['month'] = gdf['month'].astype('category')

# Drop the geometry column
gdf = gdf.drop(columns=['geometry'])

# Define LULC columns (all columns except the target)
lulc_columns = ['water', 'trees', 'crop', 'built_area', 'bare_ground', 'range_land']

# Drop rows with missing values
gdf = gdf.dropna(subset=[target, 'hour', 'month'] + lulc_columns)

In [56]:
# Step 1: Calculate the 1st and 99th percentiles for the target
lower_bound = np.percentile(gdf[target], 1)
upper_bound = np.percentile(gdf[target], 99)

# Step 2: Filter out rows outside this range
gdf = gdf[(gdf[target] >= lower_bound) & (gdf[target] <= upper_bound)]

In [None]:
gdf.shape

In [None]:
import matplotlib.pyplot as plt

# Assuming your DataFrame is called 'gdf' and the 'temperature' column exists
plt.figure(figsize=(8, 6))

# Plotting the histogram for temperature
plt.hist(gdf['temperature'], bins=30, color='skyblue', edgecolor='black')

# Adding title and labels
plt.title('Temperature Distribution')
plt.xlabel('Temperature')
plt.ylabel('Frequency')

# Show the plot
plt.show()


In [None]:

interaction_formula = f"{target} ~ C(month) + C(hour) * ({' + '.join(lulc_columns)})"
interaction_formula

# Step 2: Split the data into training and testing sets (0.50-0.50 split)
train_data, test_data = train_test_split(gdf, test_size=0.40)

import statsmodels.api as sm

# Step 3: Generate design matrices for train and test sets
y_train, X_train = patsy.dmatrices(interaction_formula, data=train_data, return_type='dataframe')
y_test, X_test = patsy.dmatrices(interaction_formula, data=test_data, return_type='dataframe')

# Step 4: Fit the GLM model on the training data
gamma_model = sm.GLM(y_train, X_train, family=sm.families.Gamma(link=sm.families.links.log()))
gamma_results = gamma_model.fit()

In [None]:
# Step 5: Display model summary
print(gamma_results.summary())

In [None]:
# Make predictions on the test data
y_pred = gamma_results.predict(X_test)

# Plot observed vs predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7, edgecolors='k', label='Data points')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', label='Perfect Fit')
plt.xlabel("Observed Temperature")
plt.ylabel("Predicted Temperature")
plt.title("Observed vs Predicted Temperature")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pickle

# Save the GLM model to a pickle file
with open('glm_model.pkl', 'wb') as file:
    pickle.dump(gamma_results, file)

print("GLM model saved as 'glm_model.pkl'")

# Save the model summary to a text file
with open('glm_model_summary.txt', 'w') as file:
    file.write(gamma_results.summary().as_text())

print("GLM model summary saved as 'glm_model_summary.txt'")

In [None]:
import matplotlib.pyplot as plt

# Create a combined 'month-hour' column for better visualization
test_data['month_str'] = test_data['month'].astype(str).str.zfill(2)

test_data['month_hour'] = test_data['month_str'].astype(str) + '-' + test_data['hour'].astype(str)

# Add predictions to the test dataset
test_data['observed'] = y_test.values
test_data['predicted'] = y_pred

# Sort by the combined 'month-hour' index for proper plotting
test_data = test_data.sort_values(by=['month_str', 'hour'])

# Plot observed vs predicted as a time series
plt.figure(figsize=(14, 6))
plt.plot(test_data['month_hour'], test_data['observed'], label='Observed Temperature', marker='o', linestyle='-', color='blue')
plt.plot(test_data['month_hour'], test_data['predicted'], label='Predicted Temperature', marker='x', linestyle='--', color='red')
plt.xlabel('Month-Hour')
plt.ylabel('Temperature')
plt.title('Time Series: Observed vs Predicted Temperature')
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print("Root Mean Squared Error:", round(rmse, 1))

In [None]:
import os
concatenated_file = os.path.join('../ecostress_lulc_fused_data_for_glm', 'concatenated_output.gpkg')

# Read the concatenated file
new_gdf = gpd.read_file(concatenated_file)  # Read the .gpkg file

new_gdf = new_gdf[new_gdf['month'].isin([6, 7, 8])]

new_gdf.head()

In [None]:
# Convert to Kelvin
new_gdf['temperature'] = new_gdf['temperature'] + 273.15

# Ensure categorical variables are treated as such
new_gdf['hour'] = new_gdf['hour'].astype('category')
new_gdf['month'] = new_gdf['month'].astype('category')

# Drop rows with missing values
new_gdf = new_gdf.dropna(subset=['temperature', 'hour', 'month'])
new_gdf.shape
new_gdf.head()

In [67]:
# Ensure 'datetime' column exists or create one from year, month, day, hour
if 'time' not in new_gdf.columns:
    new_gdf['time'] = pd.to_datetime(
        new_gdf[['year', 'month', 'day', 'hour']].astype(str).agg('-'.join, axis=1)
    )

# Sort the data by datetime
new_gdf = new_gdf.sort_values(by='time')


In [68]:

# Filter the first few days and explicitly create a copy
first_few_days = new_gdf[new_gdf['time'] < (new_gdf['time'].min() + pd.Timedelta(days=1))].copy()

In [None]:
first_few_days.shape

In [None]:
# Inspect the categories of the categorical variables
print("Categories in training data:")
print(f"Month: {gdf['month'].unique()}")
print(f"Hour: {gdf['hour'].unique()}")

print("\nCategories in prediction data:")
print(f"Month: {first_few_days['month'].unique()}")
print(f"Hour: {first_few_days['hour'].unique()}")


In [71]:

# Ensure consistent categories
first_few_days['month'] = pd.Categorical(first_few_days['month'], categories=gdf['month'].unique())
first_few_days['hour'] = pd.Categorical(first_few_days['hour'], categories=gdf['hour'].unique())

# # Recreate the design matrix for the new data
# _, X_test = patsy.dmatrices(interaction_formula, data=first_few_days, return_type='dataframe')

# # Check the design matrix
# print(f"X_test shape: {X_test.shape}")
# print(f"Model parameters shape: {gamma_results.params.shape}")

# # Inspect column names in X_test vs X_train
# print("X_test columns:", X_test.columns)

# Predict using the trained model
# first_few_days['predicted'] = gamma_results.predict(X_test)


In [None]:
# Extract only the predictors part of the formula
predictor_formula = interaction_formula.split("~")[1].strip()
print(predictor_formula)

In [73]:

# Generate the design matrix for new data
X_new = patsy.dmatrix(predictor_formula, data=first_few_days, return_type='dataframe')

# Make predictions using the trained model
# predictions = gamma_results.predict(X_new)
first_few_days['predicted'] = gamma_results.predict(X_new)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

plt.scatter(first_few_days['temperature'], first_few_days['predicted'], alpha=0.7, edgecolor='k')
# Scatter plot

# Add a reference line for perfect predictions
max_val = max(first_few_days['temperature'].max(), first_few_days['predicted'].max())
min_val = min(first_few_days['temperature'].min(), first_few_days['predicted'].min())

plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', linewidth=1, label='Perfect Fit')

# Add labels, title, and legend
plt.xlabel('Observed Temperature')
plt.ylabel('Predicted Temperature')
plt.title('Observed vs Predicted Temperatures')
plt.legend()
plt.grid(True)

# Show the plot
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Replace 'temperature' and 'predicted_temperature' with actual column names
observed = first_few_days['temperature']
predicted = first_few_days['predicted']

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(observed, predicted))

print(f"RMSE: {rmse}")

In [79]:

difference = np.abs(observed - predicted)

# Step 3: Identify rows where the difference exceeds 5
large_diff_indices = np.where(difference > 5)[0]

# Step 4: Extract corresponding rows from the test data
rows_with_large_difference = first_few_days.iloc[large_diff_indices]

# Print the results
print("Rows where the observed and predicted values differ by more than 5:")
print(rows_with_large_difference)


Rows where the observed and predicted values differ by more than 5:
         no_data  water     trees  flooded_veg      crop  built_area  \
4002269      0.0    0.0  0.000000          0.0  0.000000    1.000000   
4021066      0.0    0.0  0.000000          0.0  0.015873    0.984127   
4021065      0.0    0.0  0.000000          0.0  0.000000    1.000000   
4021064      0.0    0.0  0.300000          0.0  0.700000    0.000000   
4021062      0.0    0.0  0.938462          0.0  0.000000    0.000000   
...          ...    ...       ...          ...       ...         ...   
864342       0.0    0.0  0.938462          0.0  0.000000    0.000000   
864344       0.0    0.0  0.300000          0.0  0.700000    0.000000   
864345       0.0    0.0  0.000000          0.0  0.000000    1.000000   
864334       0.0    0.0  0.000000          0.0  0.000000    0.000000   
901919       0.0    0.0  0.000000          0.0  1.000000    0.000000   

         bare_ground  snow_or_ice  clouds  range_land  year month  

In [78]:
first_few_days.shape

(225480, 18)

In [77]:
len(rows_with_large_difference)

208981