<p>Example script to extract a period (here June–August 2024) of the hourly MSG MLST product over a domain</p>

In [None]:
import datetime as dt
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point

import thredds_lsasaf_utils as tlu

In [None]:
# User credentials for the LSA SAF THREDDS server.
# They are read from the environment so that no secret is stored in the
# notebook: set LSASAF_USER and LSASAF_PASSWD before starting the kernel.
# The placeholders below are only used when the variables are not set.
server_user = os.environ.get("LSASAF_USER", "your_username")
server_passwd = os.environ.get("LSASAF_PASSWD", "your_password")

# Change here the product details
# Go to https://thredds.lsasvcs.ipma.pt/thredds/catalog/catalog.html
# Navigate selecting satellite, product, format, and date to find the product_path and product file name
# This is an example for the MSG MLST
product_path = "/MSG/MLST/NETCDF/"
product_fname = "NETCDF4_LSASAF_MSG_LST_MSG-Disk"
NcvarsLoad = ['LST'] # list of netcdf variables to load from remote files

# time period to process
dstart = dt.datetime(2024, 6, 1, 0, 0, 0) # start slot
dend = dt.datetime(2024, 8, 31, 23, 0, 0)   # end slot
product_freq = "h" # hourly frequency

# Define latitude/longitude domain to load [lat_min,lat_max,lon_min,lon_max,]
# LatLonBox = [36, 44,-10, 3] # example for Iberian Peninsula
LatLonBox = [41.6899140207028722, 42.0902931428349447, 12.2299337725884012, 12.7300258912577391] # Rome


In [None]:
# Describe the remote product and attach the server credentials to it
product = tlu.lsa_product(product_path, product_fname)
product.user, product.passwd = server_user, server_passwd

# Slots (time steps) to be processed between dstart and dend
slot_list = tlu.gen_slot_list(dstart, dend, product_freq)
print(f"Will load:{len(slot_list)} files: {slot_list[0]} to {slot_list[-1]}")

# Load the requested variables for every slot, restricted to LatLonBox
ds_full = tlu.load_product_slots_domain(
    product,
    slot_list,
    NcvarsLoad,
    LatLonBox=LatLonBox,
)


In [None]:

# Extract the 'LST' variable from the loaded dataset
data_array = ds_full['LST']


In [None]:
# Step 1: Extract the 'LST' DataArray (stored as 'temperature' in the table below)
temperature_da = ds_full['LST']

# Step 2: Stack dimensions (combine 'time', 'lat', and 'lon')
stacked = temperature_da.stack(points=('time', 'lat', 'lon'))

# Step 3: Reset the index and convert to DataFrame
df = stacked.reset_index(['time', 'lat', 'lon']).to_dataframe(name='temperature').reset_index(drop=True)

# Step 4: Add 'hour', 'day', 'month' and 'year' columns derived from 'time'
df['hour'] = df['time'].dt.hour
df['day'] = df['time'].dt.day
df['month'] = df['time'].dt.month
df['year'] = df['time'].dt.year

# Step 5: Convert to GeoPandas GeoDataFrame.
# points_from_xy builds the whole geometry column in one vectorised call
# instead of constructing one Point per row in a Python loop.
geometry = gpd.points_from_xy(df['lon'], df['lat'])
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")  # Set CRS to WGS 84 (EPSG:4326)

# Inspect the GeoDataFrame
print(gdf.tail())


In [None]:
import rasterio
from rasterio.mask import mask
from shapely.geometry import Point

In [None]:

# Step 1: Inputs for the land-use / land-cover (LULC) extraction
lulc_file = "ESRI_LULC_2023_Rome.tif"  # Path to the ESRI LULC TIFF file

# Buffer radius: 2.5 km -> 2500 meters
# NOTE(review): the value is only a distance in meters when the buffer is
# built in a CRS whose unit is the meter; the points in gdf are declared as
# EPSG:4326 (lon/lat degrees).
buffer_radius = 2500


# Step 2: Function to calculate land use proportions
def calculate_lulc_proportions(lulc_raster, point, buffer_radius, point_crs="EPSG:4326"):
    """
    Calculate proportions of LULC categories within a circular buffer around a point.

    Parameters
    ----------
    lulc_raster : open rasterio dataset
        Land-use raster. Band 1 is read as the category codes; values <= 0
        are treated as invalid / no-data and ignored.
    point : shapely.geometry.Point
        Centre of the buffer, expressed in ``point_crs``.
    buffer_radius : float
        Buffer radius in meters.
    point_crs : optional
        CRS of ``point``. Defaults to WGS 84 lon/lat ("EPSG:4326").

    Returns
    -------
    dict
        ``{"lulc_<code>": fraction}`` where the fractions are relative to the
        number of valid pixels inside the buffer. Empty when the buffer does
        not overlap the raster (a message is printed in that case).
    """
    # Buffering lon/lat coordinates directly would interpret the radius in
    # degrees, and rasterio.mask expects shapes in the raster's own CRS.
    # So: project to a metric (UTM) CRS, buffer there, then transform the
    # polygon into the raster CRS.
    location = gpd.GeoSeries([point], crs=point_crs)
    metric_location = location.to_crs(location.estimate_utm_crs())
    buffer = (
        metric_location.buffer(buffer_radius, resolution=50)  # high-resolution circle
        .to_crs(lulc_raster.crs)
        .iloc[0]
    )
    geojson_buffer = [buffer.__geo_interface__]  # GeoJSON-like mapping for rasterio.mask

    # Only the masking step is best-effort: rasterio raises ValueError when
    # the shape does not overlap the raster. Programming errors (e.g. a
    # missing import) are no longer swallowed.
    try:
        out_image, _ = mask(lulc_raster, geojson_buffer, crop=True)
    except ValueError as e:
        print(f"Error processing buffer at point {point}: {e}")
        return {}

    data = out_image[0]  # Extract the first band
    data = data[data > 0]  # Remove invalid or no-data values (assumed to be <= 0)

    # Share of each LULC category among the valid pixels
    unique, counts = np.unique(data, return_counts=True)
    return {f"lulc_{int(cat)}": count / data.size for cat, count in zip(unique, counts)}


# Step 3: Compute the LULC proportions once per distinct location.
# gdf holds one row per (time, lat, lon) and the proportions depend only on
# the location and the single LULC raster, so repeating the raster masking
# for every time step would only redo identical work.
locations = gdf[['lon', 'lat']].drop_duplicates().reset_index(drop=True)
with rasterio.open(lulc_file) as lulc_raster:
    all_proportions = [
        calculate_lulc_proportions(lulc_raster, Point(lon, lat), buffer_radius)
        for lon, lat in zip(locations['lon'], locations['lat'])
    ]

# Step 4: Merge LULC proportions with the GeoDataFrame
# Replace NaNs with 0 for LULC categories missing at a location
proportions_df = pd.DataFrame(all_proportions, index=locations.index).fillna(0)
proportions_df = pd.concat([locations, proportions_df], axis=1)
# The lon/lat keys are copied from gdf itself, so the exact-match join is safe
gdf = gdf.merge(proportions_df, on=['lon', 'lat'], how='left')


In [None]:
# Define a function to categorize hours
def categorize_hours(hour):
    """
    Map an hour of the day to a named time-of-day band.

    Bands are half-open [start, stop) except the last one, which is the
    closed range [22, 23]. Values outside every band yield None.
    """
    half_open_bands = (
        (0, 7, "early_morning"),
        (7, 10, "morning"),
        (10, 15, "mid_day"),
        (15, 19, "evening"),
        (19, 22, "night"),
    )
    for start, stop, label in half_open_bands:
        if start <= hour < stop:
            return label
    if 22 <= hour <= 23:
        return "late_night"
    return None


In [None]:

# Label every row with the time-of-day band of its 'hour' value
hour_bands = gdf['hour'].apply(categorize_hours)
gdf['hours'] = hour_bands

# Show the first rows to check the hour -> band mapping
print("Sample of DataFrame with the new 'hours' column:")
sample_rows = gdf[['hour', 'hours']].head(10)
print(sample_rows)


In [None]:
# Convert the 'hours' column to dummy variables for the GLM.
# 'data' only keeps the target and predictor columns, so the band labels are
# taken from gdf for the rows that survived the dropna (aligned on the index).
# dtype=float keeps the design matrix purely numeric for statsmodels.
hours_dummies = pd.get_dummies(gdf.loc[data.index, 'hours'], prefix='hours', drop_first=True, dtype=float)

# Combine the dummies with the remaining predictors. The raw categorical
# 'hour' column is left out: it is not numeric and the hour bands replace it.
band_predictors = [col for col in predictors if col != 'hour']
X = pd.concat([data[band_predictors], hours_dummies], axis=1)
X = sm.add_constant(X)  # Add an intercept


In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Load CSV into a Pandas DataFrame
df = pd.read_csv('temperature_data.csv')

# Create the point geometries from longitude and latitude in one vectorised
# call instead of building one Point per row in a Python loop
geometry = gpd.points_from_xy(df['lon'], df['lat'])

# Convert to GeoDataFrame and declare the coordinates as WGS84 (EPSG:4326)
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Display the GeoDataFrame
print(gdf)


In [None]:

# Preview the first and last rows of the GeoDataFrame.
# NOTE(review): if this runs as a notebook cell, only the value of the last
# expression (tail) is echoed; the head() result is discarded.
gdf.head()
gdf.tail()

In [None]:
# List the column names currently present in the GeoDataFrame
gdf.columns

In [None]:
# Names of the LULC proportion columns to export
lulc_columns = ['lulc_1', 'lulc_2', 'lulc_4', 'lulc_5', 'lulc_7', 'lulc_8', 'lulc_11']

# Map each row's geometry to the list of its LULC values
# (rows sharing the same geometry overwrite one another; the last one wins)
geo_lulc_dict = {}
for row_label, record in gdf.iterrows():
    geo_lulc_dict[record['geometry']] = record[lulc_columns].to_list()

# Print the resulting dictionary
print(geo_lulc_dict)


In [None]:
import pickle

# Persist the land-use dictionary to disk as a pickle
with open('landuse_profile.pkl', 'wb') as out_fh:
    pickle.dump(geo_lulc_dict, out_fh)

print("Landuse profile saved as 'landuse_profile.pkl'")


In [None]:
# Read the pickled land-use dictionary back from disk
with open('landuse_profile.pkl', 'rb') as in_fh:
    loaded_geo_lulc_dict = pickle.load(in_fh)

print("Loaded landuse profile:", loaded_geo_lulc_dict)


In [None]:

# Fit a GLM of Y on X with the Gamma family, using the family's default link
# (no link argument is passed)
model = sm.GLM(Y, X, family=sm.families.Gamma())
results = model.fit()

# Print the fitted model's summary table
print("Updated GLM Summary:")
print(results.summary())


In [None]:

# Optional: Save as a shapefile or GeoJSON
# gdf.to_file("temperature_points.geojson", driver="GeoJSON")
# gdf.to_file("temperature_points.shp")

# Copy the point coordinates into plain 'lon' / 'lat' columns
point_geometry = gdf.geometry
gdf['lon'] = point_geometry.x  # longitude
gdf['lat'] = point_geometry.y  # latitude

# Write every column except the geometry to a CSV file
output_csv_path = "temperature_data.csv"
gdf_csv = gdf.drop(columns='geometry')
gdf_csv.to_csv(output_csv_path, index=False)

print(f"GeoDataFrame saved as CSV file: {output_csv_path}")

In [None]:
# List the column names currently present in the GeoDataFrame
gdf.columns

In [9]:
import statsmodels.api as sm
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import Power
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import log, identity


# Step 1: Convert the 'hour' column to categorical
gdf['hour'] = gdf['hour'].astype('category')

# Step 2: Prepare the data for GLM
# Define the target variable (temperature) and predictors
target = 'temperature'
predictors = ['hour'] + [col for col in gdf.columns if col.startswith('lulc_')]  # Hour + LULC proportions

# Drop rows with missing values in predictors or target
data = gdf[[target] + predictors].dropna()

# Step 3: Create dummy variables for 'hour'.
# dtype=float is required: since pandas 2.0 get_dummies returns bool columns
# by default, and a design matrix mixing bool and float columns is converted
# to an object array that statsmodels refuses to fit.
X = pd.get_dummies(data[predictors], drop_first=True, dtype=float)
X = sm.add_constant(X)  # Add an intercept term

# Define the dependent variable (Y)
Y = data[target]

# Print X and Y shapes for verification
print("Independent Variables (X):", X.shape)
print("Dependent Variable (Y):", Y.shape)


In [None]:

# Step 4: Fit the Generalized Linear Model with Gamma Family
# Choose a link function: inverse (default), log, or identity.
# Exactly one gamma_family assignment should be active; the alternatives are
# kept commented out.
# gamma_family = Gamma()  # Inverse link (default for Gamma)
# gamma_family = Gamma(link=log())    # Uncomment for log link
gamma_family = Gamma(link=identity())  # Identity link (currently selected)

# Fit the GLM
model = sm.GLM(Y, X, family=gamma_family)
results = model.fit()

# Step 5: Summarize the model
print("GLM Model Summary with Gamma Family:")
print(results.summary())

# Step 6: Compute predictions for the same X that was used for fitting
# (in-sample fitted values)
data['temperature_predicted'] = results.predict(X)

# Save the updated DataFrame with predictions
output_file = "temperature_with_predictions_gamma.csv"
data.to_csv(output_file, index=False)
print(f"Predicted values saved to: {output_file}")


In [None]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error


In [None]:

# Observed and predicted temperature series
observed, predicted = data['temperature'], data['temperature_predicted']

# Root mean squared error between the two series
mse = mean_squared_error(observed, predicted)
rmse = np.sqrt(mse)

# Pearson correlation coefficient and its p-value
correlation, p_value = pearsonr(observed, predicted)

# Report the metrics
print("\n".join([
    "Model Performance Metrics:",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"Pearson's Correlation Coefficient: {correlation:.4f}",
    f"P-value: {p_value:.4f}",
]))


In [None]:
import matplotlib.pyplot as plt

# Observed-vs-predicted scatter plot on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(observed, predicted, color='blue', alpha=0.6, edgecolor='k', label='Predicted vs Observed')

# Reference 1:1 line (perfect predictions) spanning the full data range
min_val = min(observed.min(), predicted.min())
max_val = max(observed.max(), predicted.max())
ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='1:1 Line')

# Axis labels, title, legend and grid
ax.set_xlabel('Observed Temperature')
ax.set_ylabel('Predicted Temperature')
ax.set_title('Observed vs. Predicted Temperature')
ax.legend()
ax.grid(True)

# Render the figure
plt.show()


In [None]:
# Fit the Generalized Linear Model of Y on X with the Gaussian family
model = sm.GLM(Y, X, family=sm.families.Gaussian())  # Gaussian family for continuous response
results = model.fit()

# Summarize the model
print("GLM Model Summary:")
print(results.summary())

# Compute predictions for the same X that was used for fitting; this
# overwrites any 'temperature_predicted' column already present in data
data['temperature_predicted'] = results.predict(X)

# Extract observed and predicted temperature values
observed = data['temperature']  # Observed temperature
predicted = data['temperature_predicted']  # Predicted temperature


In [None]:
# Observed-vs-predicted scatter plot on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(observed, predicted, color='blue', alpha=0.6, edgecolor='k', label='Predicted vs Observed')

# Reference 1:1 line (perfect predictions) spanning the full data range
min_val = min(observed.min(), predicted.min())
max_val = max(observed.max(), predicted.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, color='red', linestyle='--', label='1:1 Line')

# Axis labels, title, legend and grid
ax.set(
    xlabel='Observed Temperature',
    ylabel='Predicted Temperature',
    title='Observed vs. Predicted Temperature',
)
ax.legend()
ax.grid(True)

# Render the figure
plt.show()