In [1]:
import pandas as pd
import numpy as np
import os

from itertools import combinations
import math
import folium
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from dash import Dash, dcc, html, Input, Output, dash_table
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import seaborn as sns
import plotly.express as px, plotly.graph_objects as go
import ipywidgets as W
from IPython.display import display

import ipywidgets as widgets
from IPython.display import display, clear_output


from scipy import stats
import requests, xmltodict
from datetime import datetime, timedelta, timezone
from sklearn.linear_model import LinearRegression
from math import radians, sin, cos, sqrt, atan2
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Sensor coordinates
lht_sensors = {
    'LHT65013': (62.234563, 25.672774),
    'LHT65010': (62.260777, 25.693876),
    'LHT65009': (62.222971, 25.804673),
    'LHT65008': (62.227604, 25.736853),
    'LHT65007': (62.286678, 25.74533),
    'LHT65006': (62.265198, 25.89008),
    'LHT65005': (62.197614, 25.720489),}

ws100_sensors = {
    'Saaritie':        (62.136788, 25.762473),
    'Tuulimyllyntie':  (62.221789, 25.695931),
    'Tähtiniementie':  (62.011127, 25.552755),
    'Kaakkovuori': (62.294362, 25.800196),
    'Kotaniementie':   (62.265705, 25.909542),}

# Geocoding setup 
geolocator = Nominatim(user_agent="sensor_map")
reverse_geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1, swallow_exceptions=True)

def extract_street(address_dict):
    for key in ['road', 'pedestrian', 'footway', 'cycleway', 'path', 'residential']:
        if key in address_dict:
            return address_dict[key]
    for fallback in ['neighbourhood', 'suburb', 'hamlet', 'village', 'town', 'city', 'county']:
        if fallback in address_dict:
            return address_dict[fallback]
    return "Unknown street"

# I will  cache this to avoid hitting the API repeatedly for the same coords
geocode_cache = {}

def get_street_name(lat, lon):
    key = (round(lat, 6), round(lon, 6))
    if key in geocode_cache:
        return geocode_cache[key]
    
    location = reverse_geocode((lat, lon))
    street = "Unknown street"
    if location and hasattr(location, "raw"):
        street = extract_street(location.raw.get('address', {}))
    
    geocode_cache[key] = street
    return street


map_center = [62.24, 25.75]
sensors_map = folium.Map(location=map_center, zoom_start=11, tiles="OpenStreetMap")

# Layer groups for sensor types
lht_layer = folium.FeatureGroup(name="LHT Sensors", show=True)
ws100_layer = folium.FeatureGroup(name="WS100 Sensors", show=True)

# Drop blue markers for LHT sensors
for sensor_name, (lat, lon) in lht_sensors.items():
    street_name = get_street_name(lat, lon)
    popup_html = f"<b>{sensor_name}</b><br>{street_name}<br>({lat:.6f}, {lon:.6f})"
    tooltip_text = f"{sensor_name} – {street_name}"

    folium.Marker(
        location=[lat, lon], popup=popup_html, tooltip=tooltip_text,
        icon=folium.Icon(color="blue", icon="info-sign")).add_to(lht_layer)


# Red markers for WS100 stations
for site_name, (lat, lon) in ws100_sensors.items():
    street_name = get_street_name(lat, lon)
    popup_html = f"<b>{site_name} (WS100)</b><br>{street_name}<br>({lat:.6f}, {lon:.6f})"
    tooltip_text = f"{site_name} – {street_name}"
    folium.Marker(
        location=[lat, lon],popup=popup_html,tooltip=tooltip_text, icon=folium.Icon(color="red", icon="info-sign")).add_to(ws100_layer)

# Add layers to map and enable toggling
lht_layer.add_to(sensors_map)
ws100_layer.add_to(sensors_map)
folium.LayerControl(collapsed=False).add_to(sensors_map)
sensors_map

**Removing bad data from raw DS**

In [3]:
# I am going to define function to fix the timestamp, remove bad data, rename columns.

def clean_lht_sensor(raw_data,
    timestamp_col="Timestamp",
    temp_col="TempC_SHT",
    hum_col="Hum_SHT",
    start_date="2021-01-08",
    temp_range=(-40, 38),
    hum_range=(0, 100)):
    copy_data = raw_data.copy()

    # Convert timestamp to datetime and sorting
    copy_data[timestamp_col] = pd.to_datetime(copy_data[timestamp_col])
    copy_data = copy_data.sort_values(timestamp_col)
    # Removing data before start_date, sensor measured indoor data until 08.01.2021, which does not make sense for outdoor in Jyvaskyla at the time
    start_dt = pd.to_datetime(start_date)
    # For analysis, I keep data after start date
    copy_data = copy_data[copy_data[timestamp_col] >= start_dt]


    # Renaming columns 
    copy_data = copy_data.rename(columns={temp_col: "Temperature_C", hum_col: "Humidity"})
    # Setting  Timestamp as index, if I dont, I will lose it. I need it for analysis
    copy_data = copy_data.set_index(timestamp_col)


    # In LHTs, I have seen some anomaly values, which are out of physical limits.
    correct_temperature = (copy_data["Temperature_C"] >= temp_range[0]) & (copy_data["Temperature_C"] <= temp_range[1])
    correct_humidity = (copy_data["Humidity"]>= hum_range[0]) & (copy_data["Humidity"] <= hum_range[1])
    # Masking bad values
    copy_data.loc[~correct_temperature, "Temperature_C"] = pd.NA
    copy_data.loc[~correct_humidity, "Humidity"] = pd.NA
    # Filling of Missing Values by looking at neighboring values in time
    copy_data = copy_data.interpolate(
    method="time",limit=3, limit_direction="both",limit_area="inside")




    # Sorting the Timestamp
    copy_data = copy_data.reset_index().rename(columns={timestamp_col: "Timestamp"})
    copy_data = copy_data.sort_values("Timestamp").reset_index(drop=True)

    return copy_data

<h1><center>Fixing Gaps longer than 3 hours </center></h1>

In [4]:
# I fixed the 3 hours gap by interpolate but for loner gaps interpolate is not accurate, I am gonna use  regression to fill the gap
# Based on the  location and the values 

In [5]:
# I am gonna find the distance of each LHT sensor from  their coordinates
LHTs = {
    'Keltimaentie-LHT65013': (62.234563, 25.672774),
    'Hikipolku-LHT65010': (62.260777, 25.693876),
    'Hameenpohjantie-LHT65009': (62.222971, 25.804673),
    'Survontie-LHT65008': (62.227604, 25.736853),
    'Ritopohantie-LHT65007': (62.286678, 25.74533),
    'Kaunisharjuntie-LHT65006': (62.265198, 25.89008),
    'Keilonkankaantie-LHT65005': (62.197614, 25.720489),}


# Based on this coordinates, I will calculate the distance of each LHT sensor that I will it  use for analysis:
   
def LHTs_distance(coord1, coord2):

    lat1, lon1 = coord1
    lat2, lon2 = coord2
    
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    
    # Earth radius in kilometers
    R = 6371.0
    distance = R * c
    
    return distance

<h1><center>Ritopohantie-LHT65007<h1>

In [6]:
Ritopohantie = pd.read_csv("Marjetas_Data/Marjetas_Data/JKL LHT/Data/LHT65007(JKL)-TEMP.csv", sep=";")
Ritopohantie.head()




Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:28:59,17.43,49.1
1,2020-12-01 11:58:59,20.7,32.5
2,2020-12-01 13:58:59,22.42,24.2
3,2020-12-01 14:58:59,22.76,23.8
4,2020-12-01 15:58:59,22.86,23.6


In [7]:
Ritopohantie.shape

(38820, 3)

In [8]:
Ritopohantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38820 entries, 0 to 38819
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  38820 non-null  object 
 1   TempC_SHT  38820 non-null  float64
 2   Hum_SHT    38820 non-null  float64
dtypes: float64(2), object(1)
memory usage: 910.0+ KB


In [9]:
Ritopohantie["Timestamp"] = pd.to_datetime(Ritopohantie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Ritopohantie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Ritopohantie = Ritopohantie.sort_values(by="Timestamp").reset_index(drop=True)
Ritopohantie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:28:59,17.43,49.1
1,2020-12-01 11:58:59,20.7,32.5
2,2020-12-01 13:58:59,22.42,24.2
3,2020-12-01 14:58:59,22.76,23.8
4,2020-12-01 15:58:59,22.86,23.6


In [10]:
Ritopohantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38820 entries, 0 to 38819
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Timestamp  38820 non-null  datetime64[ns]
 1   TempC_SHT  38820 non-null  float64       
 2   Hum_SHT    38820 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 910.0 KB


In [11]:
Ritopohantie.duplicated().sum()

np.int64(0)

In [12]:
# Let's find out start and end time of the data and the total days 
start_time = Ritopohantie["Timestamp"].min()
end_time = Ritopohantie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:28:59
Data ends at: 2025-09-18 11:57:24
Total days of data: 1786 days


In [13]:
df_Ritopohantie = Ritopohantie.copy()
df_Ritopohantie['Timestamp'] = pd.to_datetime(df_Ritopohantie['Timestamp'])
df_Ritopohantie = df_Ritopohantie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Ritopohantie['TempC_SHT'] < -40) | (df_Ritopohantie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Ritopohantie['Hum_SHT'] < 0) | (df_Ritopohantie['Hum_SHT'] > 100)

gap_min = df_Ritopohantie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Ritopohantie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Ritopohantie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Ritopohantie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Ritopohantie[stuck_temp_mask].index)

Anomaly_temperature count: 59
Actual anomalous temperature values:
Timestamp
2021-06-22 15:06:23     41.33
2021-06-22 16:06:23     41.85
2021-06-22 18:06:23     40.15
2021-06-23 15:06:24     40.11
2021-07-03 18:06:33     40.24
2021-07-05 16:06:35     41.13
2021-07-05 17:06:35     41.24
2021-07-12 16:06:42     40.51
2021-07-12 16:33:35     40.51
2021-07-13 15:06:43     40.19
2021-07-14 15:06:44     40.47
2021-07-14 16:06:44     40.84
2021-07-14 17:06:44     42.02
2021-07-14 18:06:44     42.80
2021-07-26 15:06:56     40.88
2021-07-26 16:06:56     41.82
2021-07-26 17:06:56     41.91
2021-07-26 18:06:56     41.76
2021-07-27 14:06:57     41.29
2021-07-27 15:06:57     43.95
2021-07-27 16:06:57     42.42
2021-07-27 17:06:57     41.12
2021-07-27 18:06:57     42.14
2022-06-26 17:19:15     40.06
2022-12-26 09:24:29     52.28
2023-06-19 17:31:13     40.05
2023-06-20 15:31:14     40.68
2024-06-27 15:44:55     41.36
2024-06-27 16:44:55     42.44
2024-06-27 17:44:55     42.03
2024-06-28 15:44:56    

In [14]:
df_cleaned_Ritopohantie = clean_lht_sensor(Ritopohantie)
print("Shape before cleaning:", Ritopohantie.shape)
print('Shape after cleaning:', df_cleaned_Ritopohantie.shape)

Shape before cleaning: (38820, 3)
Shape after cleaning: (38027, 3)


In [15]:
df_cleaned_Ritopohantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38027 entries, 0 to 38026
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Timestamp      38027 non-null  datetime64[ns]
 1   Temperature_C  38027 non-null  float64       
 2   Humidity       38027 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 891.4 KB


<h1><center>Hikipolku-LHT65010<h1>

In [16]:
Hikipolku = pd.read_csv("Marjetas_Data/Marjetas_Data/JKL LHT/Data/LHT65010(JKL)-TEMP.csv", sep=";")
Hikipolku.head()

Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:14:30,32.8,23.32
1,2020-10-27 15:34:30,33.7,22.56
2,2020-12-01 12:00:23,27.0,20.85
3,2020-12-01 13:00:23,24.5,22.32
4,2020-12-01 14:00:23,24.2,22.55


In [17]:
Hikipolku.shape

(39420, 3)

In [18]:
Hikipolku.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39420 entries, 0 to 39419
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  39420 non-null  object 
 1   Hum_SHT    39420 non-null  float64
 2   TempC_SHT  39420 non-null  float64
dtypes: float64(2), object(1)
memory usage: 924.0+ KB


In [19]:
Hikipolku["Timestamp"] = pd.to_datetime(Hikipolku["Timestamp"],format="%Y-%m-%d %H:%M:%S")
print("NaT after parse:", Hikipolku["Timestamp"].isna().sum())
# Sort the ds by timestamp
Hikipolku = Hikipolku.sort_values(by="Timestamp").reset_index(drop=True)
Hikipolku

NaT after parse: 0


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:14:30,32.8,23.32
1,2020-10-27 15:34:30,33.7,22.56
2,2020-12-01 12:00:23,27.0,20.85
3,2020-12-01 13:00:23,24.5,22.32
4,2020-12-01 14:00:23,24.2,22.55
...,...,...,...
39415,2025-09-18 05:39:28,100.0,11.09
39416,2025-09-18 06:39:28,100.0,11.66
39417,2025-09-18 07:39:29,100.0,12.04
39418,2025-09-18 09:39:29,100.0,13.92


In [20]:
Hikipolku.duplicated().sum()

np.int64(5)

In [21]:
# printing the duplicated rows
duplicates = Hikipolku[Hikipolku.duplicated()]
duplicates

Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
4848,2021-07-19 18:16:42,47.6,19.92
5013,2021-07-26 10:54:59,35.3,32.59
12624,2022-06-13 00:52:14,80.4,11.4
13681,2022-07-26 22:39:56,100.0,17.25
14513,2022-08-30 14:41:17,56.9,20.04


In [22]:
# Let's find out start and end time of the data and the total days 
start_time = Hikipolku["Timestamp"].min()
end_time = Hikipolku["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:14:30
Data ends at: 2025-09-18 10:39:29
Total days of data: 1786 days


In [23]:
df_Hikipolku = Hikipolku.copy()
df_Hikipolku['Timestamp'] = pd.to_datetime(df_Hikipolku['Timestamp'])
df_Hikipolku = df_Hikipolku.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Hikipolku['TempC_SHT'] < -40) | (df_Hikipolku['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Hikipolku['Hum_SHT'] < 0) | (df_Hikipolku['Hum_SHT'] > 100)

gap_min = df_Hikipolku.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Hikipolku['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Hikipolku[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Hikipolku[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Hikipolku[stuck_temp_mask].index)

Anomaly_temperature count: 3
Actual anomalous temperature values:
Timestamp
2023-07-21 02:58:12     49.81
2024-05-11 20:17:27    104.36
2025-07-14 12:37:38     40.10
Name: TempC_SHT, dtype: float64
----------------------------------
Anomaly_humidity count: 1
Actual anomalous humidity values:
Timestamp
2025-04-16 19:34:40    3276.7
Name: Hum_SHT, dtype: float64
----------------------------------
Gaps longer than 3 hours: 126
Actual gap durations (minutes):
Timestamp
2020-12-01 12:00:23    50185.883333
2020-12-02 16:00:24      300.016667
2020-12-06 21:00:32      180.016667
2020-12-10 15:00:42      180.016667
2020-12-12 07:00:48      180.016667
                           ...     
2025-03-14 14:32:36      180.016667
2025-04-29 23:35:03      180.016667
2025-05-06 16:35:22      240.016667
2025-05-12 18:35:40      300.016667
2025-06-21 01:36:57      180.016667
Name: Timestamp, Length: 126, dtype: float64
----------------------------------
Number of rows where temperature was constant for 3+ h

**Applying function to clean the DS**

In [24]:
df_cleaned_Hikipolku = clean_lht_sensor(Hikipolku)
print("Shape before cleaning:", Hikipolku.shape)
print('Shape after cleaning:', df_cleaned_Hikipolku.shape)

Shape before cleaning: (39420, 3)
Shape after cleaning: (38796, 3)


In [25]:
df_cleaned_Hikipolku.isna().sum()

Timestamp        0
Humidity         0
Temperature_C    0
dtype: int64

<h1><center>Kaunisharjuntie-LHT65006<h1>

In [26]:
Kaunisharjuntie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65006(JLK)-TEMP.csv", sep=";")
Kaunisharjuntie.head()

  Kaunisharjuntie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65006(JLK)-TEMP.csv", sep=";")


Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:01:22,24.15,34.2
1,2020-10-27 15:21:22,22.08,34.9
2,2020-10-27 15:41:22,20.87,37.8
3,2020-12-01 11:58:43,21.91,28.7
4,2020-12-01 14:58:43,22.64,23.6


In [27]:
Kaunisharjuntie.shape

(39898, 3)

In [28]:
Kaunisharjuntie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39898 entries, 0 to 39897
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  39898 non-null  object 
 1   TempC_SHT  39898 non-null  float64
 2   Hum_SHT    39898 non-null  float64
dtypes: float64(2), object(1)
memory usage: 935.2+ KB


In [29]:
Kaunisharjuntie["Timestamp"] = pd.to_datetime(Kaunisharjuntie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Kaunisharjuntie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Kaunisharjuntie = Kaunisharjuntie.sort_values(by="Timestamp").reset_index(drop=True)
Kaunisharjuntie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:01:22,24.15,34.2
1,2020-10-27 15:21:22,22.08,34.9
2,2020-10-27 15:41:22,20.87,37.8
3,2020-12-01 11:58:43,21.91,28.7
4,2020-12-01 14:58:43,22.64,23.6


In [30]:
Kaunisharjuntie.duplicated().sum()

np.int64(0)

In [31]:
# Let's find out start and end time of the data and the total days 
start_time = Kaunisharjuntie["Timestamp"].min()
end_time = Kaunisharjuntie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:01:22
Data ends at: 2025-09-18 12:27:34
Total days of data: 1786 days


In [32]:
df_Kaunisharjuntie = Kaunisharjuntie.copy()
df_Kaunisharjuntie['Timestamp'] = pd.to_datetime(df_Kaunisharjuntie['Timestamp'])
df_Kaunisharjuntie = df_Kaunisharjuntie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Kaunisharjuntie['TempC_SHT'] < -40) | (df_Kaunisharjuntie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Kaunisharjuntie['Hum_SHT'] < 0) | (df_Kaunisharjuntie['Hum_SHT'] > 100)

gap_min = df_Kaunisharjuntie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Kaunisharjuntie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Kaunisharjuntie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Kaunisharjuntie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Kaunisharjuntie[stuck_temp_mask].index)

Anomaly_temperature count: 1
Actual anomalous temperature values:
Timestamp
2025-01-07 12:16:18    327.67
Name: TempC_SHT, dtype: float64
----------------------------------
Anomaly_humidity count: 1
Actual anomalous humidity values:
Timestamp
2021-12-18 16:17:56    3276.7
Name: Hum_SHT, dtype: float64
----------------------------------
Gaps longer than 3 hours: 106
Actual gap durations (minutes):
Timestamp
2020-12-01 11:58:43    50177.350000
2020-12-14 21:59:12      720.016667
2020-12-15 05:59:15      480.050000
2020-12-15 13:59:16      480.016667
2020-12-15 23:59:14      599.966667
                           ...     
2025-05-08 06:23:18      300.000000
2025-05-08 15:23:20      240.016667
2025-05-10 09:23:25      180.016667
2025-05-12 18:23:31      240.016667
2025-09-09 11:27:17      300.016667
Name: Timestamp, Length: 106, dtype: float64
----------------------------------
Number of rows where temperature was constant for 3+ hours: 129
Timestamps where temperature was stuck:
DatetimeIn

**Applying function to clean the DS**

In [33]:
df_cleaned_Kaunisharjuntie = clean_lht_sensor(Kaunisharjuntie)
print("Shape before cleaning:", Kaunisharjuntie.shape)
print('Shape after cleaning:', df_cleaned_Kaunisharjuntie.shape)

Shape before cleaning: (39898, 3)
Shape after cleaning: (39147, 3)


In [34]:
df_cleaned_Kaunisharjuntie.head()

Unnamed: 0,Timestamp,Temperature_C,Humidity
0,2021-01-08 00:59:57,-5.24,53.7
1,2021-01-08 01:59:57,-5.61,54.1
2,2021-01-08 02:59:56,-5.91,54.4
3,2021-01-08 03:59:56,-6.13,54.6
4,2021-01-08 04:59:56,-6.29,55.0


In [35]:
# Save cleaned data to  new CSV file
df_cleaned_Kaunisharjuntie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Kaunisharjuntie.csv", index=False)

  df_cleaned_Kaunisharjuntie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Kaunisharjuntie.csv", index=False)


<h1><center>Keltimaentie-LHT65013<h1>

In [36]:
Keltimaentie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65013(JKL)-TEMP.csv", sep=";")
Keltimaentie.head()

  Keltimaentie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65013(JKL)-TEMP.csv", sep=";")


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:26:21,33.6,23.5
1,2020-10-27 15:46:22,34.3,22.35
2,2020-11-11 14:16:13,25.6,22.69
3,2020-11-11 14:31:04,24.2,24.45
4,2020-11-11 14:32:04,23.5,24.5


In [37]:
Keltimaentie.shape

(38853, 3)

In [38]:
Keltimaentie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38853 entries, 0 to 38852
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  38853 non-null  object 
 1   Hum_SHT    38853 non-null  float64
 2   TempC_SHT  38853 non-null  float64
dtypes: float64(2), object(1)
memory usage: 910.7+ KB


In [39]:
Keltimaentie["Timestamp"] = pd.to_datetime(Keltimaentie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Keltimaentie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Keltimaentie = Keltimaentie.sort_values(by="Timestamp").reset_index(drop=True)
Keltimaentie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:26:21,33.6,23.5
1,2020-10-27 15:46:22,34.3,22.35
2,2020-11-11 14:16:13,25.6,22.69
3,2020-11-11 14:31:04,24.2,24.45
4,2020-11-11 14:32:04,23.5,24.5


In [40]:
Keltimaentie.duplicated().sum()

np.int64(0)

In [41]:
# Let's find out start and end time of the data and the total days 
start_time = Keltimaentie["Timestamp"].min()
end_time = Keltimaentie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:26:21
Data ends at: 2025-09-18 11:09:07
Total days of data: 1786 days


In [42]:
df_Keltimaentie = Keltimaentie.copy()
df_Keltimaentie['Timestamp'] = pd.to_datetime(df_Keltimaentie['Timestamp'])
df_Keltimaentie = df_Keltimaentie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Keltimaentie['TempC_SHT'] < -40) | (df_Keltimaentie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Keltimaentie['Hum_SHT'] < 0) | (df_Keltimaentie['Hum_SHT'] > 100)

gap_min = df_Keltimaentie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Keltimaentie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Keltimaentie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Keltimaentie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Keltimaentie[stuck_temp_mask].index)

Anomaly_temperature count: 2
Actual anomalous temperature values:
Timestamp
2023-02-23 10:35:15    42.13
2024-02-23 13:50:35    89.02
Name: TempC_SHT, dtype: float64
----------------------------------
Anomaly_humidity count: 1
Actual anomalous humidity values:
Timestamp
2021-12-16 21:17:14    3276.7
Name: Hum_SHT, dtype: float64
----------------------------------
Gaps longer than 3 hours: 115
Actual gap durations (minutes):
Timestamp
2020-11-11 14:16:13    21509.850000
2020-12-01 12:03:07    28566.183333
2020-12-02 16:03:07      300.016667
2020-12-03 18:03:07      240.000000
2020-12-05 20:03:09      240.000000
                           ...     
2025-03-12 10:04:16      180.016667
2025-04-09 11:05:28      180.016667
2025-04-19 03:05:48      180.016667
2025-05-12 18:06:35      360.016667
2025-06-24 19:07:30      180.016667
Name: Timestamp, Length: 115, dtype: float64
----------------------------------
Number of rows where temperature was constant for 3+ hours: 72
Timestamps where temper

**Applying clean function to clean the DS**

In [43]:
df_cleaned_Keltimaentie = clean_lht_sensor(Keltimaentie)
print("Shape before cleaning:", Keltimaentie.shape)
print('Shape after cleaning:', df_cleaned_Keltimaentie.shape)

Shape before cleaning: (38853, 3)
Shape after cleaning: (38065, 3)


In [44]:
df_cleaned_Keltimaentie.shape

(38065, 3)

<h1><center>Survontie-LHT65008<h1>

In [45]:
Survontie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65008(JKL)-TEMP.csv", sep=";")
Survontie.head()

  Survontie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65008(JKL)-TEMP.csv", sep=";")


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:04:38,34.0,23.66
1,2020-10-27 15:24:38,33.6,22.84
2,2020-10-27 15:44:38,34.6,22.21
3,2020-12-01 11:59:11,30.1,20.67
4,2020-12-01 12:59:11,24.6,22.18


In [46]:
Survontie.shape

(40774, 3)

In [47]:
Survontie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40774 entries, 0 to 40773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  40774 non-null  object 
 1   Hum_SHT    40774 non-null  float64
 2   TempC_SHT  40774 non-null  float64
dtypes: float64(2), object(1)
memory usage: 955.8+ KB


In [48]:
Survontie["Timestamp"] = pd.to_datetime(Survontie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Survontie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Survontie = Survontie.sort_values(by="Timestamp").reset_index(drop=True)
Survontie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2020-10-27 15:04:38,34.0,23.66
1,2020-10-27 15:24:38,33.6,22.84
2,2020-10-27 15:44:38,34.6,22.21
3,2020-12-01 11:59:11,30.1,20.67
4,2020-12-01 12:59:11,24.6,22.18


In [49]:
Survontie.duplicated().sum()

np.int64(1)

In [50]:
# Remove duplicates
Survontie = Survontie.drop_duplicates()
Survontie.shape

(40773, 3)

In [51]:
# Let's find out start and end time of the data and the total days 
start_time = Survontie["Timestamp"].min()
end_time = Survontie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:04:38
Data ends at: 2025-09-18 11:26:57
Total days of data: 1786 days


In [52]:
df_Survontie = Survontie.copy()
df_Survontie['Timestamp'] = pd.to_datetime(df_Survontie['Timestamp'])
df_Survontie = df_Survontie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Survontie['TempC_SHT'] < -40) | (df_Survontie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Survontie['Hum_SHT'] < 0) | (df_Survontie['Hum_SHT'] > 100)

gap_min = df_Survontie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Survontie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Survontie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Survontie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Survontie[stuck_temp_mask].index)

Anomaly_temperature count: 0
Actual anomalous temperature values:
Series([], Name: TempC_SHT, dtype: float64)
----------------------------------
Anomaly_humidity count: 0
Actual anomalous humidity values:
Series([], Name: Hum_SHT, dtype: float64)
----------------------------------
Gaps longer than 3 hours: 70
Actual gap durations (minutes):
Timestamp
2020-12-01 11:59:11    50174.550000
2020-12-01 20:59:11      300.000000
2020-12-02 15:59:14      300.016667
2020-12-06 19:59:30      180.050000
2020-12-07 19:59:31      180.016667
                           ...     
2025-01-03 14:06:52      240.016667
2025-02-18 22:11:49      180.016667
2025-04-01 14:15:43      180.016667
2025-05-08 12:18:42      180.016667
2025-05-12 18:19:02      240.016667
Name: Timestamp, Length: 70, dtype: float64
----------------------------------
Number of rows where temperature was constant for 3+ hours: 81
Timestamps where temperature was stuck:
DatetimeIndex(['2020-12-06 20:59:27', '2020-12-20 08:00:32',
        

**Applying the clean function to clean the DS**

In [53]:
df_cleaned_Survontie = clean_lht_sensor(Survontie)
print("Shape before cleaning:", Survontie.shape)
print('Shape after cleaning:', df_cleaned_Survontie.shape)

Shape before cleaning: (40773, 3)
Shape after cleaning: (40100, 3)


In [54]:
df_cleaned_Survontie.head()

Unnamed: 0,Timestamp,Humidity,Temperature_C
0,2021-01-08 00:01:37,91.0,-6.6
1,2021-01-08 01:01:37,91.2,-6.71
2,2021-01-08 02:01:38,91.3,-6.85
3,2021-01-08 03:01:38,91.4,-6.85
4,2021-01-08 04:01:38,90.8,-6.94


<h1><center>Hameenpohjantie-LHT65009<h1>

In [55]:
Hameenpohjantie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65009(JKL)-TEMP.csv", sep=";")
Hameenpohjantie.head()

  Hameenpohjantie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65009(JKL)-TEMP.csv", sep=";")


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2021-01-01 13:01:00,37.6,16.17
1,2021-01-01 14:01:04,37.4,16.13
2,2021-01-01 15:01:00,37.3,16.11
3,2021-01-01 16:01:00,37.1,16.1
4,2021-01-01 17:01:00,37.0,16.1


In [56]:
Hameenpohjantie.shape

(38495, 3)

In [57]:
Hameenpohjantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38495 entries, 0 to 38494
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  38495 non-null  object 
 1   Hum_SHT    38495 non-null  float64
 2   TempC_SHT  38495 non-null  float64
dtypes: float64(2), object(1)
memory usage: 902.4+ KB


In [58]:
Hameenpohjantie["Timestamp"] = pd.to_datetime(Hameenpohjantie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Hameenpohjantie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Hameenpohjantie = Hameenpohjantie.sort_values(by="Timestamp").reset_index(drop=True)
Hameenpohjantie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,Hum_SHT,TempC_SHT
0,2021-01-01 13:01:00,37.6,16.17
1,2021-01-01 14:01:04,37.4,16.13
2,2021-01-01 15:01:00,37.3,16.11
3,2021-01-01 16:01:00,37.1,16.1
4,2021-01-01 17:01:00,37.0,16.1


In [59]:
Hameenpohjantie.duplicated().sum()

np.int64(0)

In [60]:
# Let's find out start and end time of the data and the total days 
start_time = Hameenpohjantie["Timestamp"].min()
end_time = Hameenpohjantie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2021-01-01 13:01:00
Data ends at: 2025-09-18 11:30:09
Total days of data: 1720 days


In [61]:
df_Hameenpohjantie= Hameenpohjantie.copy()
df_Hameenpohjantie['Timestamp'] = pd.to_datetime(df_Hameenpohjantie['Timestamp'])
df_Hameenpohjantie = df_Hameenpohjantie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Hameenpohjantie['TempC_SHT'] < -40) | (df_Hameenpohjantie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Hameenpohjantie['Hum_SHT'] < 0) | (df_Hameenpohjantie['Hum_SHT'] > 100)

gap_min = df_Hameenpohjantie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Hameenpohjantie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Hameenpohjantie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Hameenpohjantie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Hameenpohjantie[stuck_temp_mask].index)

Anomaly_temperature count: 2
Actual anomalous temperature values:
Timestamp
2025-07-24 16:28:18    41.67
2025-07-30 16:28:29    40.80
Name: TempC_SHT, dtype: float64
----------------------------------
Anomaly_humidity count: 0
Actual anomalous humidity values:
Series([], Name: Hum_SHT, dtype: float64)
----------------------------------
Gaps longer than 3 hours: 97
Actual gap durations (minutes):
Timestamp
2021-01-05 17:01:08    240.000000
2021-01-14 11:01:46    360.033333
2021-01-14 18:01:48    240.000000
2021-01-16 12:02:00    180.033333
2021-01-20 17:02:17    180.016667
                          ...    
2025-05-08 05:25:36    240.016667
2025-05-12 18:25:48    240.000000
2025-06-18 05:27:06    180.016667
2025-07-24 01:28:17    180.016667
2025-08-23 13:29:16    240.000000
Name: Timestamp, Length: 97, dtype: float64
----------------------------------
Number of rows where temperature was constant for 3+ hours: 75
Timestamps where temperature was stuck:
DatetimeIndex(['2021-01-01 18:01:00

**Applying clean function to clean the DS**

In [62]:
df_cleaned_Hameenpohjantie = clean_lht_sensor(Hameenpohjantie)
print("Shape before cleaning:", Hameenpohjantie.shape)
print('Shape after cleaning:', df_cleaned_Hameenpohjantie.shape)

Shape before cleaning: (38495, 3)
Shape after cleaning: (38353, 3)


In [63]:
df_cleaned_Hameenpohjantie.head()

Unnamed: 0,Timestamp,Humidity,Temperature_C
0,2021-01-08 00:01:14,92.9,-6.62
1,2021-01-08 01:01:14,93.3,-6.78
2,2021-01-08 02:01:14,93.7,-7.01
3,2021-01-08 04:01:14,92.9,-6.95
4,2021-01-08 05:01:15,93.0,-6.97


In [64]:
# Save cleaned data to  new CSV file
#df_cleaned_Hameenpohjantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hameenpohjantie.csv", index=False)

<h1><center>Keilonkankaantie-LHT65005<h1>

In [65]:
Keilonkankaantie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65005(JKL)-TEMP.csv", sep=";")
Keilonkankaantie.head()

  Keilonkankaantie = pd.read_csv("Marjetas_Data\Marjetas_Data\JKL LHT\Data\LHT65005(JKL)-TEMP.csv", sep=";")


Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:00:02,23.62,35.7
1,2020-10-27 15:20:00,20.95,36.2
2,2020-10-27 15:40:00,19.08,41.4
3,2020-12-01 11:58:13,20.77,29.5
4,2020-12-01 12:58:13,22.13,24.4


In [66]:
Keilonkankaantie.shape

(40820, 3)

In [67]:
Keilonkankaantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40820 entries, 0 to 40819
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  40820 non-null  object 
 1   TempC_SHT  40820 non-null  float64
 2   Hum_SHT    40820 non-null  float64
dtypes: float64(2), object(1)
memory usage: 956.8+ KB


In [68]:
Keilonkankaantie["Timestamp"] = pd.to_datetime(Keilonkankaantie["Timestamp"],format="%Y-%m-%d %H:%M:%S", errors="coerce")
print("NaT after parse:", Keilonkankaantie["Timestamp"].isna().sum())
# Sort the ds by timestamp
Keilonkankaantie = Keilonkankaantie.sort_values(by="Timestamp").reset_index(drop=True)
Keilonkankaantie.head()

NaT after parse: 0


Unnamed: 0,Timestamp,TempC_SHT,Hum_SHT
0,2020-10-27 15:00:02,23.62,35.7
1,2020-10-27 15:20:00,20.95,36.2
2,2020-10-27 15:40:00,19.08,41.4
3,2020-12-01 11:58:13,20.77,29.5
4,2020-12-01 12:58:13,22.13,24.4


In [69]:
Keilonkankaantie.duplicated().sum()

np.int64(0)

In [70]:
# Let's find out start and end time of the data and the total days 
start_time = Keilonkankaantie["Timestamp"].min()
end_time = Keilonkankaantie["Timestamp"].max()
total_days = (end_time - start_time).days
print(f"Data starts from: {start_time}")
print(f"Data ends at: {end_time}")
print(f"Total days of data: {total_days} days")

Data starts from: 2020-10-27 15:00:02
Data ends at: 2025-09-18 11:25:46
Total days of data: 1786 days


In [71]:
df_Keilonkankaantie = Keilonkankaantie.copy()
df_Keilonkankaantie['Timestamp'] = pd.to_datetime(df_Keilonkankaantie['Timestamp'])
df_Keilonkankaantie = df_Keilonkankaantie.sort_values('Timestamp').set_index('Timestamp')
# I am going to define anomalies which are out of range values:
# -40 C to 40 C for temperature
Anomaly_temperature = (df_Keilonkankaantie['TempC_SHT'] < -40) | (df_Keilonkankaantie['TempC_SHT'] > 40)
# 0 to 100 for humidity
# values below 0  OR above 100 are physically impossible
Anomaly_humidity = (df_Keilonkankaantie['Hum_SHT'] < 0) | (df_Keilonkankaantie['Hum_SHT'] > 100)

gap_min = df_Keilonkankaantie.index.to_series().diff().dt.total_seconds() / 60.0
long_gaps = gap_min[gap_min > 180]  

# Finding gap 
stuck_temp_mask = df_Keilonkankaantie['TempC_SHT'].rolling('3h').std() == 0
stuck_points = int(stuck_temp_mask.sum())


print("Anomaly_temperature count:", Anomaly_temperature.sum())
print("Actual anomalous temperature values:")
print(df_Keilonkankaantie[Anomaly_temperature]['TempC_SHT'])
print("----------------------------------")
print("Anomaly_humidity count:", Anomaly_humidity.sum())
print("Actual anomalous humidity values:")
print(df_Keilonkankaantie[Anomaly_humidity]['Hum_SHT'])
print("----------------------------------")
print("Gaps longer than 3 hours:", long_gaps.shape[0])
print("Actual gap durations (minutes):")
print(long_gaps)
print("----------------------------------")
print("Number of rows where temperature was constant for 3+ hours:", stuck_points)
print("Timestamps where temperature was stuck:")
print(df_Keilonkankaantie[stuck_temp_mask].index)

Anomaly_temperature count: 1
Actual anomalous temperature values:
Timestamp
2024-04-11 14:20:27    327.67
Name: TempC_SHT, dtype: float64
----------------------------------
Anomaly_humidity count: 0
Actual anomalous humidity values:
Series([], Name: Hum_SHT, dtype: float64)
----------------------------------
Gaps longer than 3 hours: 22
Actual gap durations (minutes):
Timestamp
2020-12-01 11:58:13    50178.216667
2020-12-01 19:58:12      239.983333
2020-12-02 15:58:11      300.000000
2020-12-04 03:58:14      300.050000
2020-12-07 02:58:11      240.000000
2020-12-15 21:58:20      180.050000
2021-01-14 11:58:35      420.033333
2021-01-20 16:58:53      180.016667
2021-02-17 11:00:03      480.066667
2021-02-26 20:00:25      300.000000
2021-03-03 22:00:31      240.000000
2021-03-05 14:00:34      540.000000
2021-03-07 11:00:39      360.016667
2021-03-08 09:00:41     1320.033333
2021-10-20 09:02:27     2340.033333
2022-03-08 17:06:52      180.016667
2022-03-29 23:07:21      180.016667
2022-04

**Applying the clean function to clean the DS**

In [72]:
df_cleaned_Keilonkankaantie = clean_lht_sensor(Keilonkankaantie)
print("Shape before cleaning:", Keilonkankaantie.shape)
print('Shape after cleaning:', df_cleaned_Keilonkankaantie.shape)

Shape before cleaning: (40820, 3)
Shape after cleaning: (40017, 3)


In [73]:
df_cleaned_Keilonkankaantie.head()

Unnamed: 0,Timestamp,Temperature_C,Humidity
0,2021-01-08 00:58:17,-7.19,93.3
1,2021-01-08 01:58:17,-7.39,93.4
2,2021-01-08 02:58:18,-7.3,93.4
3,2021-01-08 03:58:17,-7.4,93.3
4,2021-01-08 04:58:17,-7.41,93.3


In [74]:
df_cleaned_Keilonkankaantie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40017 entries, 0 to 40016
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Timestamp      40017 non-null  datetime64[ns]
 1   Temperature_C  40017 non-null  float64       
 2   Humidity       40017 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 938.0 KB


In [75]:
# Save cleaned data to  new CSV file
df_cleaned_Keilonkankaantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Keilonkankaantie.csv", index=False)

  df_cleaned_Keilonkankaantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Keilonkankaantie.csv", index=False)


<h3>Calculating Dew point temperature - Saturation Vapor Pressure - Vapor Pressure Deficit - Absolute Humidity<h3>

In [76]:
# Sensor dictionary
sensors = {
   'Keltimaentie-LHT65013': df_cleaned_Keltimaentie,
    'Hikipolku-LHT65010': df_cleaned_Hikipolku,
    'Hameenpohjantie-LHT65009': df_cleaned_Hameenpohjantie,
    'Survontie-LHT65008': df_cleaned_Survontie,
    'Ritopohantie-LHT65007': df_cleaned_Ritopohantie,
    'Kaunisharjuntie-LHT65006': df_cleaned_Kaunisharjuntie,
    'Keilonkankaantie-LHT65005': df_cleaned_Keilonkankaantie }



# FUNCTIONS:

# Calculating dew point, temprature where cmondensation forms 
def dewpoint_C(temperature, humidity):
    # b for liquid water, Comes from the Sonntag fit to lab data for water vapor over liquid water
    # c for liquid water
    # For temperatures above freezing:
    # b = 17.625, c = 243.04
    
    # For temperatures below freezing:
    # b = 22.46, c = 272.62
    b = np.where(temperature >= 0, 17.625, 22.46)
    c = np.where(temperature >= 0, 243.04, 272.62)
    rh_frac = np.clip(humidity, 1e-6, 100) / 100.0
    gamma = np.log(rh_frac) + (b * temperature) / (c + temperature)
    return (c * gamma) / (b - gamma)


# Saturation Vapor Pressure: 
# The maximum amount of water vapor that air can hold at a given temperature before condensation starts.
def svp_kpa_piecewise(Tc):
    # Saturation vapor pressure over water 
    es_water = 0.6108 * np.exp(17.27 * Tc / (Tc + 237.3))
    # Saturation vapor pressure over ice 
    es_ice   = 0.6108 * np.exp(21.875 * Tc / (Tc + 265.5))
    # Use water above , ice below
    return np.where(Tc >= 0, es_water, es_ice)


# Vapor Pressure Deficit VPD:
# how much more moisture the air could absorb before it is  completely full.
def vpd_kpa(Tc, RH):
    es = svp_kpa_piecewise(Tc)                          
    rh_frac = np.clip(RH, 0, 100) / 100.0               
    ea = es * rh_frac                                   
    return es - ea   
# High VPD means dry air, plants lose water quickly.
# Low VPD means humid air. 


# Absolute Humidity:
# the actual amount of water vapor in the air, in grams per cubic meter
def abs_humidity_gm3(Tc, RH):
    # calculating Saturation Vapor Pressure in kilo Pascals
    es_pa = svp_kpa_piecewise(Tc) * 1000.0
    # calculating actual Vapor Pressure in Pascals
    Vapor_pressure = np.clip(RH,0,100)/100.0 * es_pa
    # Converting Celsius to Kelvin
    return 2.16679 * Vapor_pressure / (Tc + 273.15)


**Applying the Functions**

In [77]:
def prepare_and_features(df):
    # Preparing data 
    report_df = df.copy()
    report_df = report_df.dropna(subset=["Timestamp"]).sort_values("Timestamp")
    report_df["Temperature_C"] = pd.to_numeric(report_df["Temperature_C"], errors="coerce")
    report_df["Humidity"] = pd.to_numeric(report_df["Humidity"], errors="coerce")

# APPLYING FUNCTIONS:
    
# Dew Point Temperature
    report_df["DewPoint_C"] = dewpoint_C(report_df["Temperature_C"], report_df["Humidity"])
# Absolute Humidity
    report_df["AbsHum_gm3"] = abs_humidity_gm3(report_df["Temperature_C"], report_df["Humidity"])
# Vapor Pressure Deficit
    report_df["VPD_kPa"]= vpd_kpa(report_df["Temperature_C"], report_df["Humidity"])
    report_df["hour"]= report_df["Timestamp"].dt.hour
    report_df["date"]= report_df["Timestamp"].dt.date
    return report_df

**Creating  summmary report for each LHT sensors**

In [78]:
def summarize(Report_df):
    # Calculating the time difference between one step and the next
    step_min = Report_df["Timestamp"].diff().dt.total_seconds().median()/60.0

   # Calculating daily temperature and humdity range:
   # Finding the max and min temperature also average humadity for each day
    daily = Report_df.groupby("date").agg(T_min=("Temperature_C","min"),T_max=("Temperature_C","max"),RH_mean=("Humidity","mean"))
    # Subtracting min from max to get daily temperature range
    daily_temp_range = (daily["T_max"] - daily["T_min"]).median() if not daily.empty else np.nan

    
    #  Day-night Cycles:
    #  How much temperature and humidity typically rises and falls in 24 hours cyle
    # resampling by hour to get mean temperature and humidity for each hour of the day
    data_hourly = Report_df.groupby("hour").agg(T=("Temperature_C","mean"), RH=("Humidity","mean"))
    # Finding  the highest average temperature and lowest average temperature  hourly, then subtracting to get diurnal amplitude
    Temperature_24H = (data_hourly["T"].max() - data_hourly["T"].min()) if not data_hourly.empty else np.nan
    # Same idea for humidity
    Humidity_24H= (data_hourly["RH"].max() - data_hourly["RH"].min()) if not data_hourly.empty else np.nan

    

    # Calculating Vapor Pressure Deficit for midday hours:
    daily_vpd_max = Report_df.groupby("date")["VPD_kPa"].max()
    vpd_peak_mean = daily_vpd_max.mean() if not daily_vpd_max.empty else np.nan
    data_sorted = Report_df.sort_values(["date", "VPD_kPa"], ascending=[True, False])
    top4 = data_sorted.groupby("date").head(4)
    daily_vpd_top4 = top4.groupby("date")["VPD_kPa"].mean()
    vpd_peak_smooth = daily_vpd_top4.mean() if not daily_vpd_top4.empty else np.nan
    
    
    # Identify Comfort and Condensation issues
    pct_RH_gt90 = (Report_df["Humidity"]>=90).mean()*100.0
    pct_RH_lt30 = (Report_df["Humidity"]<=30).mean()*100.0

    return {
        "rows": len(Report_df),
        "step_min": step_min,
        "T_mean": Report_df["Temperature_C"].mean(),
        "T_min": Report_df["Temperature_C"].min(),
        "T_max": Report_df["Temperature_C"].max(),
        "RH_mean": Report_df["Humidity"].mean(),
        "RH_min": Report_df["Humidity"].min(),
        "RH_max": Report_df["Humidity"].max(),
        "DewPoint_mean": Report_df["DewPoint_C"].mean(),
        "AbsHum_mean": Report_df["AbsHum_gm3"].mean(),
        "VPD_peak_mean": vpd_peak_mean,  
        "VPD_peak_smooth": vpd_peak_smooth,
        "DTR_median": daily_temp_range,
        "Temperature_24H": Temperature_24H,
        "Humidity_24H": Humidity_24H,
        "%RH>=90": pct_RH_gt90,
        "%RH<=30": pct_RH_lt30,}


**Compering LHTs**

In [79]:
summaries = {}
for name, df in sensors.items():
    Report_df = prepare_and_features(df)
    summaries[name] = summarize(Report_df)

Comperison_table = pd.DataFrame(summaries).T.round(1)
print("\n=== LHT per sensor summary ===")
print(Comperison_table.to_string())

# Creating difference vs network mean table:
# Making  list of columns to include in difference table
temp_hum_columns = [column for column in Comperison_table.columns if column not in ("rows",)]
difference_table = (Comperison_table[temp_hum_columns] - Comperison_table[temp_hum_columns].mean()).round(1)
print("\n=== Difference vs network mean ===")
print(difference_table.sort_values("T_mean", ascending=False).to_string())



=== LHT per sensor summary ===
                              rows  step_min  T_mean  T_min  T_max  RH_mean  RH_min  RH_max  DewPoint_mean  AbsHum_mean  VPD_peak_mean  VPD_peak_smooth  DTR_median  Temperature_24H  Humidity_24H  %RH>=90  %RH<=30
Keltimaentie-LHT65013      38065.0      60.0     5.4  -30.9   36.6     88.9    22.4   100.0            3.2          6.9            0.6              0.5         8.7              6.7          18.5     70.4      0.2
Hikipolku-LHT65010         38796.0      60.0     6.1  -30.2   37.9     87.2    15.9   100.0            3.5          7.1            0.8              0.7        10.8              9.1          21.7     66.7      0.7
Hameenpohjantie-LHT65009   38353.0      60.0     5.9  -31.4   38.0     86.5    16.0   100.0            3.3          7.0            0.9              0.7         9.9              7.6          22.6     64.1      1.0
Survontie-LHT65008         40100.0      60.0     5.6  -29.9   34.7     87.4    17.3   100.0            3.2          

In [80]:

start_time = pd.to_datetime("2021-01-20 00:00:00")  
end_time   = pd.to_datetime("2025-09-17 23:00:00")  

master_index = pd.date_range(start=start_time, end=end_time, freq="h")

In [81]:
def prepare_single_lht(df_raw, master_index):
    df = df_raw.copy()

    # 1) Make sure we have a datetime index and floor to the hour
    if "Timestamp" in df.columns:
        df["Timestamp"] = pd.to_datetime(df["Timestamp"]).dt.floor("h")
        df = df.set_index("Timestamp")
    else:
        df.index = pd.to_datetime(df.index).floor("h")
        df.index.name = "Timestamp"

    # 2) Sort by time
    df = df.sort_index()

    # 3) If we have multiple readings in the same hour, aggregate them
    df = df.groupby(df.index).mean()
    df.index.name = "Timestamp"

    # 4) Reindex to your full hourly master_index
    df = df.reindex(master_index)

    # 5) Keep index name consistent
    df.index.name = "Timestamp"

    return df


df_Keltimaenties = prepare_single_lht(df_cleaned_Keltimaentie, master_index)
df_Hikipolkus = prepare_single_lht(df_cleaned_Hikipolku, master_index)
df_Hameenpohjanties = prepare_single_lht(df_cleaned_Hameenpohjantie, master_index)
df_Survonties  = prepare_single_lht(df_cleaned_Survontie, master_index)
df_Ritopohanties  = prepare_single_lht(df_cleaned_Ritopohantie, master_index)
df_Kaunisharjunties = prepare_single_lht(df_cleaned_Kaunisharjuntie, master_index)
df_Keilonkankaanties= prepare_single_lht(df_cleaned_Keilonkankaantie, master_index)


In [82]:

sensor_dfs = {
    "Keltimaentie-LHT65013":    df_Keltimaenties,
    "Hikipolku-LHT65010":       df_Hikipolkus,
    "Hameenpohjantie-LHT65009": df_Hameenpohjanties,
    "Survontie-LHT65008":       df_Survonties,
    "Ritopohantie-LHT65007":    df_Ritopohanties,
    "Kaunisharjuntie-LHT65006": df_Kaunisharjunties,
    "Keilonkankaantie-LHT65005":df_Keilonkankaanties,}

combined = pd.concat(sensor_dfs, axis=1)  
combined = combined.sort_index()          

print(combined.index.equals(master_index))  
print(combined.head())


True
                    Keltimaentie-LHT65013               Hikipolku-LHT65010  \
                                 Humidity Temperature_C           Humidity   
Timestamp                                                                    
2021-01-20 00:00:00                 100.0         -2.24              100.0   
2021-01-20 01:00:00                 100.0         -2.26              100.0   
2021-01-20 02:00:00                 100.0         -2.04              100.0   
2021-01-20 03:00:00                 100.0         -1.87              100.0   
2021-01-20 04:00:00                 100.0         -1.95              100.0   

                                  Hameenpohjantie-LHT65009                \
                    Temperature_C                 Humidity Temperature_C   
Timestamp                                                                  
2021-01-20 00:00:00         -2.22                    100.0         -1.75   
2021-01-20 01:00:00         -2.16                    100.0        

In [83]:
combined.columns

MultiIndex([(    'Keltimaentie-LHT65013',      'Humidity'),
            (    'Keltimaentie-LHT65013', 'Temperature_C'),
            (       'Hikipolku-LHT65010',      'Humidity'),
            (       'Hikipolku-LHT65010', 'Temperature_C'),
            ( 'Hameenpohjantie-LHT65009',      'Humidity'),
            ( 'Hameenpohjantie-LHT65009', 'Temperature_C'),
            (       'Survontie-LHT65008',      'Humidity'),
            (       'Survontie-LHT65008', 'Temperature_C'),
            (    'Ritopohantie-LHT65007', 'Temperature_C'),
            (    'Ritopohantie-LHT65007',      'Humidity'),
            ( 'Kaunisharjuntie-LHT65006', 'Temperature_C'),
            ( 'Kaunisharjuntie-LHT65006',      'Humidity'),
            ('Keilonkankaantie-LHT65005', 'Temperature_C'),
            ('Keilonkankaantie-LHT65005',      'Humidity')],
           )

In [84]:
summaries_df = pd.DataFrame.from_dict(summaries, orient='index')

# FIRST: Define lht_sensor dictionary
lht_sensor = {
    'Keltimaentie-LHT65013': (62.234563, 25.672774),
    'Hikipolku-LHT65010': (62.260777, 25.693876),
    'Hameenpohjantie-LHT65009': (62.222971, 25.804673),
    'Survontie-LHT65008': (62.227604, 25.736853),
    'Ritopohantie-LHT65007': (62.286678, 25.74533),
    'Kaunisharjuntie-LHT65006': (62.265198, 25.89008),
    'Keilonkankaantie-LHT65005': (62.197614, 25.720489),
}

# THEN: Extract sensor names from the dictionary (this is better than hardcoding)
SENSOR_NAMES = list(lht_sensor.keys())

# Optional sanity check: make sure names line up
print("summaries_df index:", list(summaries_df.index))
print("lht_sensor keys:   ", list(SENSOR_NAMES))


def haversine_distance(coord1, coord2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees).
    Returns distance in kilometers.
    """
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    
    # Earth radius in kilometers
    R = 6371.0
    distance = R * c
    
    return distance

# Build geographic distance matrix using consistent names
geo_dist = pd.DataFrame(index=SENSOR_NAMES, columns=SENSOR_NAMES, dtype=float)

for s1 in SENSOR_NAMES:
    for s2 in SENSOR_NAMES:
        geo_dist.loc[s1, s2] = haversine_distance(lht_sensor[s1], lht_sensor[s2])

print("=== GEOGRAPHIC DISTANCE (km) ===")
print(geo_dist.round(2))

summaries_df index: ['Keltimaentie-LHT65013', 'Hikipolku-LHT65010', 'Hameenpohjantie-LHT65009', 'Survontie-LHT65008', 'Ritopohantie-LHT65007', 'Kaunisharjuntie-LHT65006', 'Keilonkankaantie-LHT65005']
lht_sensor keys:    ['Keltimaentie-LHT65013', 'Hikipolku-LHT65010', 'Hameenpohjantie-LHT65009', 'Survontie-LHT65008', 'Ritopohantie-LHT65007', 'Kaunisharjuntie-LHT65006', 'Keilonkankaantie-LHT65005']
=== GEOGRAPHIC DISTANCE (km) ===
                           Keltimaentie-LHT65013  Hikipolku-LHT65010  \
Keltimaentie-LHT65013                       0.00                3.11   
Hikipolku-LHT65010                          3.11                0.00   
Hameenpohjantie-LHT65009                    6.95                7.11   
Survontie-LHT65008                          3.41                4.31   
Ritopohantie-LHT65007                       6.91                3.92   
Kaunisharjuntie-LHT65006                   11.76               10.17   
Keilonkankaantie-LHT65005                   4.80               

In [85]:
# To find the best match, now I want to calculate climate distance using 4 features: T_mean, RH_mean, DTR_median, and %RH>=90
# Based  on this I can find most similar sensor to each sensor to fix the gaps 

def climate_distance(row_i, row_j):
    return np.sqrt(
        (row_i['T_mean']  - row_j['T_mean'])**2 +
        (row_i['RH_mean'] - row_j['RH_mean'])**2 +
        (row_i['DTR_median']     - row_j['DTR_median'])**2 +
        (row_i['%RH>=90']   - row_j['%RH>=90'])**2)


idx = summaries_df.index
climate_dist = pd.DataFrame(index=idx, columns=idx, dtype=float)

for s1 in idx:
    for s2 in idx:
        climate_dist.loc[s1, s2] = climate_distance(
            summaries_df.loc[s1],
            summaries_df.loc[s2])

print(climate_dist.round(2))


print("\n=== NEAREST GEOGRAPHIC NEIGHBORS ===")
for sensor in SENSOR_NAMES:
    distances = geo_dist.loc[sensor].sort_values()
    nearest = distances[distances > 0].head(3)
    print(f"\n{sensor}:")
    for neighbor, dist in nearest.items():
        print(f"  → {neighbor}: {dist:.2f} km")


                           Keltimaentie-LHT65013  Hikipolku-LHT65010  \
Keltimaentie-LHT65013                       0.00                4.55   
Hikipolku-LHT65010                          4.55                0.00   
Hameenpohjantie-LHT65009                    6.80                2.85   
Survontie-LHT65008                          7.10                4.74   
Ritopohantie-LHT65007                       8.99                4.47   
Kaunisharjuntie-LHT65006                    2.14                6.15   
Keilonkankaantie-LHT65005                  10.23                6.78   

                           Hameenpohjantie-LHT65009  Survontie-LHT65008  \
Keltimaentie-LHT65013                          6.80                7.10   
Hikipolku-LHT65010                             2.85                4.74   
Hameenpohjantie-LHT65009                       0.00                2.82   
Survontie-LHT65008                             2.82                0.00   
Ritopohantie-LHT65007                          2

In [86]:
print("\n=== NEAREST CLIMATE NEIGHBORS ===")
for sensor in climate_dist.index:
    distances = climate_dist.loc[sensor].sort_values()
    nearest = distances[distances > 0].head(3)
    print(f"\n{sensor}:")
    for neighbor, dist in nearest.items():
        print(f"  → {neighbor}: {dist:.2f}")


=== NEAREST CLIMATE NEIGHBORS ===

Keltimaentie-LHT65013:
  → Kaunisharjuntie-LHT65006: 2.14
  → Hikipolku-LHT65010: 4.55
  → Hameenpohjantie-LHT65009: 6.80

Hikipolku-LHT65010:
  → Hameenpohjantie-LHT65009: 2.85
  → Ritopohantie-LHT65007: 4.47
  → Keltimaentie-LHT65013: 4.55

Hameenpohjantie-LHT65009:
  → Survontie-LHT65008: 2.82
  → Hikipolku-LHT65010: 2.85
  → Ritopohantie-LHT65007: 2.91

Survontie-LHT65008:
  → Hameenpohjantie-LHT65009: 2.82
  → Keilonkankaantie-LHT65005: 3.51
  → Hikipolku-LHT65010: 4.74

Ritopohantie-LHT65007:
  → Hameenpohjantie-LHT65009: 2.91
  → Hikipolku-LHT65010: 4.47
  → Keilonkankaantie-LHT65005: 4.70

Kaunisharjuntie-LHT65006:
  → Keltimaentie-LHT65013: 2.14
  → Hikipolku-LHT65010: 6.15
  → Survontie-LHT65008: 7.47

Keilonkankaantie-LHT65005:
  → Survontie-LHT65008: 3.51
  → Hameenpohjantie-LHT65009: 3.94
  → Ritopohantie-LHT65007: 4.70


In [87]:
# Finding matching datasets 
def get_matching_sensors(sensor, climate_dist, geo_dist,
                             max_climate_dist=5.0, max_geo_dist=8.0):
    d_clim = climate_dist.loc[sensor]
    d_geo  = geo_dist.loc[sensor]

    mask = (d_clim <= max_climate_dist) & (d_geo <= max_geo_dist)
    candidates = d_clim.index[mask].tolist()

    if sensor in candidates:
        candidates.remove(sensor)

    return candidates

In [88]:
# After finding the distance, I will create fucntion to to  see how these sensor are related to each other
def compute_correlations(combined, target_sensor, candidate_sensors, variable="Temperature_C"):
    y = combined[(target_sensor, variable)]
    results = {}

    for s in candidate_sensors:
        x = combined[(s, variable)]
        mask = y.notna() & x.notna()
        if mask.sum() < 800:
            continue
        r = np.corrcoef(x[mask], y[mask])[0, 1]
        results[s] = r

    return pd.Series(results).sort_values(ascending=False)


In [89]:
# Building  linear regression model to learn relationship between  target sensor and its match 
def build_regression_model(combined, target_sensor, neighbours, variable="Temperature_C"):
    y = combined[(target_sensor, variable)]
    X = combined[[(s, variable) for s in neighbours]]

    # training mask - subset *only for training*, not for combined
    mask_train = y.notna() & X.notna().all(axis=1)

    X_train = X.loc[mask_train]
    y_train = y.loc[mask_train]

    model = LinearRegression().fit(X_train, y_train)
    return model



In [90]:
# Filling the gaps for LHT sensors
def fill_gaps_for_sensor(combined, target_sensor, climate_dist, geo_dist, var_list=("Temperature_C", "Humidity"),
                         r_min=0.70, max_climate_dist=5.0, max_geo_dist=8.0):

    combined_filled = combined.copy()

    candidates = get_matching_sensors(target_sensor, climate_dist, geo_dist, max_climate_dist=max_climate_dist, max_geo_dist=max_geo_dist)

    if not candidates:
        print(f"{target_sensor}: no climate/geo neighbours → only keep interpolation/climatology.")
        return combined_filled

    corr_temp = compute_correlations(combined, target_sensor, candidates, "Temperature_C")

    if corr_temp.empty:
        print(f"{target_sensor}: no overlapping data to compute correlation → skip regression.")
        return combined_filled

    good_neighbours = corr_temp[corr_temp >= r_min].index.tolist()

    if not good_neighbours:
        print(f"{target_sensor}: no neighbours with r >= {r_min:.2f} → skip regression.")
        return combined_filled

    print(f"{target_sensor}: using neighbours {good_neighbours}")

    for var in var_list:
        y = combined[(target_sensor, var)]
        X_all = combined[[(s, var) for s in good_neighbours]]

        # Build model
        mask_train = y.notna() & X_all.notna().all(axis=1)
        model = LinearRegression().fit(X_all.loc[mask_train], y.loc[mask_train])

        # Real gaps for this variable
        mask_gap = y.isna() & X_all.notna().all(axis=1)

        if mask_gap.any():
            X_gap = X_all.loc[mask_gap]
            y_pred = model.predict(X_gap)

            combined_filled.loc[mask_gap, (target_sensor, var)] = y_pred

    return combined_filled



In [91]:
Keltimaentie_trianing  = fill_gaps_for_sensor(
    combined, target_sensor='Keltimaentie-LHT65013',
    climate_dist=climate_dist,
    geo_dist=geo_dist, r_min=0.69)

Keltimaentie-LHT65013: using neighbours ['Hikipolku-LHT65010']


In [92]:
Fixed_Keltimaentie = Keltimaentie_trianing['Keltimaentie-LHT65013']
Fixed_Keltimaentie

Unnamed: 0_level_0,Humidity,Temperature_C
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-20 00:00:00,100.0,-2.24
2021-01-20 01:00:00,100.0,-2.26
2021-01-20 02:00:00,100.0,-2.04
2021-01-20 03:00:00,100.0,-1.87
2021-01-20 04:00:00,100.0,-1.95
...,...,...
2025-09-17 19:00:00,100.0,13.27
2025-09-17 20:00:00,100.0,12.39
2025-09-17 21:00:00,100.0,11.20
2025-09-17 22:00:00,100.0,12.05


In [93]:
Fixed_Keltimaentie.isna().sum()
# Show rows with NA values
rows_with_na = Fixed_Keltimaentie[Fixed_Keltimaentie.isna().any(axis=1)]
print("\nRows containing NA values:")
print(rows_with_na)


Rows containing NA values:
                     Humidity  Temperature_C
Timestamp                                   
2021-01-20 15:00:00       NaN            NaN
2021-01-20 16:00:00       NaN            NaN
2021-01-21 20:00:00       NaN            NaN
2021-01-23 14:00:00       NaN            NaN
2021-01-27 22:00:00       NaN            NaN
...                       ...            ...
2025-06-23 00:00:00       NaN            NaN
2025-07-06 08:00:00       NaN            NaN
2025-08-30 04:00:00       NaN            NaN
2025-09-06 23:00:00       NaN            NaN
2025-09-17 10:00:00       NaN            NaN

[275 rows x 2 columns]


In [94]:
Fixed_Keltimaentie.index
Fixed_Keltimaentie = Keltimaentie_trianing['Keltimaentie-LHT65013'].reset_index()
Fixed_Keltimaentie.head()


Unnamed: 0,Timestamp,Humidity,Temperature_C
0,2021-01-20 00:00:00,100.0,-2.24
1,2021-01-20 01:00:00,100.0,-2.26
2,2021-01-20 02:00:00,100.0,-2.04
3,2021-01-20 03:00:00,100.0,-1.87
4,2021-01-20 04:00:00,100.0,-1.95


In [95]:
Fixed_Keltimaentie = Fixed_Keltimaentie[["Timestamp", "Temperature_C", "Humidity"]]


In [96]:
Fixed_Keltimaentie= Fixed_Keltimaentie.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")

In [97]:
# Save cleaned data to  new CSV file
Fixed_Keltimaentie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Keltimaentie.csv", index=False)

  Fixed_Keltimaentie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Keltimaentie.csv", index=False)


In [98]:
Hikipolku_trianing= fill_gaps_for_sensor(
    combined,
    target_sensor="Hikipolku-LHT65010", climate_dist=climate_dist,
    geo_dist=geo_dist, r_min=0.97)

print(combined.index.equals(master_index)) 


Hikipolku-LHT65010: using neighbours ['Keltimaentie-LHT65013']
True


In [99]:
Fixed_Hikipolku = Hikipolku_trianing['Hikipolku-LHT65010']

In [100]:
Fixed_Hikipolku.shape

(40848, 2)

In [101]:
Fixed_Hikipolku.index

DatetimeIndex(['2021-01-20 00:00:00', '2021-01-20 01:00:00',
               '2021-01-20 02:00:00', '2021-01-20 03:00:00',
               '2021-01-20 04:00:00', '2021-01-20 05:00:00',
               '2021-01-20 06:00:00', '2021-01-20 07:00:00',
               '2021-01-20 08:00:00', '2021-01-20 09:00:00',
               ...
               '2025-09-17 14:00:00', '2025-09-17 15:00:00',
               '2025-09-17 16:00:00', '2025-09-17 17:00:00',
               '2025-09-17 18:00:00', '2025-09-17 19:00:00',
               '2025-09-17 20:00:00', '2025-09-17 21:00:00',
               '2025-09-17 22:00:00', '2025-09-17 23:00:00'],
              dtype='datetime64[ns]', name='Timestamp', length=40848, freq='h')

In [102]:
Fixed_Hikipolku = Fixed_Hikipolku.reset_index()

In [103]:
Fixed_Hikipolku = Fixed_Hikipolku[["Timestamp", "Temperature_C", "Humidity"]]

Fixed_Hikipolku = Fixed_Hikipolku.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")

Fixed_Hikipolku.head()


Unnamed: 0,Timestamp,Temperature_C,Humidity
0,2021-01-20 00:00:00,-2.22,100.0
1,2021-01-20 01:00:00,-2.16,100.0
2,2021-01-20 02:00:00,-2.1,100.0
3,2021-01-20 03:00:00,-2.02,100.0
4,2021-01-20 04:00:00,-1.99,100.0


In [104]:
# Save cleaned data to  new CSV file
Fixed_Hikipolku.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hikipolku.csv", index=False)

  Fixed_Hikipolku.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hikipolku.csv", index=False)


In [105]:
trianing_data= fill_gaps_for_sensor(
    combined,
    target_sensor="Kaunisharjuntie-LHT65006", climate_dist=climate_dist,
    geo_dist=geo_dist, r_min=0.05)

print(combined.index.equals(master_index)) 

Kaunisharjuntie-LHT65006: no climate/geo neighbours → only keep interpolation/climatology.
True


In [106]:
trianing_data= fill_gaps_for_sensor(
    combined,
    target_sensor="Ritopohantie-LHT65007", climate_dist=climate_dist,
 geo_dist=geo_dist, r_min=0.90)

print(combined.index.equals(master_index)) 

Ritopohantie-LHT65007: using neighbours ['Hameenpohjantie-LHT65009', 'Hikipolku-LHT65010']
True


In [107]:
Fixed_Ritopohantie = trianing_data['Ritopohantie-LHT65007']
Fixed_Ritopohantie.shape


(40848, 2)

In [108]:
Fixed_Ritopohantie.isna().sum()

Temperature_C    412
Humidity         412
dtype: int64

In [109]:
Fixed_Ritopohantie = Fixed_Ritopohantie.reset_index()
Fixed_Ritopohantie = Fixed_Ritopohantie[["Timestamp", "Temperature_C", "Humidity"]]

In [110]:
Fixed_Ritopohantie = Fixed_Ritopohantie.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")

In [111]:
Fixed_Ritopohantie.isna().sum()

Timestamp         0
Temperature_C    81
Humidity         81
dtype: int64

In [112]:
# Save cleaned data to  new CSV file
Fixed_Ritopohantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Ritopohantie.csv", index=False)

  Fixed_Ritopohantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Ritopohantie.csv", index=False)


In [113]:
trianing_data= fill_gaps_for_sensor(
    combined,
    target_sensor="Survontie-LHT65008", climate_dist=climate_dist,
 geo_dist=geo_dist, r_min=0.90)

print(combined.index.equals(master_index)) 

Survontie-LHT65008: using neighbours ['Keilonkankaantie-LHT65005', 'Hameenpohjantie-LHT65009', 'Hikipolku-LHT65010']
True


In [114]:
Fixed_Survontie = trianing_data["Survontie-LHT65008"]
Fixed_Survontie.shape

(40848, 2)

In [115]:
Fixed_Survontie.isna().sum()

Humidity         312
Temperature_C    312
dtype: int64

In [116]:
Fixed_Survontie = Fixed_Survontie.reset_index()
Fixed_Survontie = Fixed_Survontie[["Timestamp", "Temperature_C", "Humidity"]]

In [117]:
Fixed_Survontie = Fixed_Survontie.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")

In [118]:
Fixed_Survontie.isna().sum()

Timestamp         0
Temperature_C    79
Humidity         79
dtype: int64

In [132]:
Fixed_Survontie['Humidity'] = Fixed_Survontie['Humidity'].clip(0,100)

In [133]:
# Save cleaned data to  new CSV file
Fixed_Survontie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Survontie.csv", index=False)

  Fixed_Survontie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Survontie.csv", index=False)


In [120]:
training_data= fill_gaps_for_sensor(
    combined,
    target_sensor="Keilonkankaantie-LHT65005", climate_dist=climate_dist,
 geo_dist=geo_dist, r_min=0.90)

print(combined.index.equals(master_index)) 

Keilonkankaantie-LHT65005: using neighbours ['Survontie-LHT65008', 'Hameenpohjantie-LHT65009']
True


In [121]:
Fixed_Keilonkankaantie = training_data["Keilonkankaantie-LHT65005"]
Fixed_Keilonkankaantie.shape

(40848, 2)

In [122]:
Fixed_Keilonkankaantie  = Fixed_Keilonkankaantie.reset_index()
Fixed_Keilonkankaantie = Fixed_Keilonkankaantie[["Timestamp", "Temperature_C", "Humidity"]]

In [123]:
Fixed_Keilonkankaantie = Fixed_Keilonkankaantie.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")
Fixed_Keilonkankaantie.isna().sum()

Timestamp         0
Temperature_C    79
Humidity         79
dtype: int64

In [124]:
# Save cleaned data to  new CSV file
Fixed_Keilonkankaantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hameenpohjantie.csv", index=False)

  Fixed_Keilonkankaantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hameenpohjantie.csv", index=False)


In [125]:
training_data= fill_gaps_for_sensor(
    combined,
    target_sensor="Hameenpohjantie-LHT65009", climate_dist=climate_dist,
 geo_dist=geo_dist, r_min=0.98)

print(combined.index.equals(master_index))

Hameenpohjantie-LHT65009: using neighbours ['Survontie-LHT65008', 'Keilonkankaantie-LHT65005', 'Ritopohantie-LHT65007']
True


In [126]:
Fixed_Hameenpohjantie =  training_data["Hameenpohjantie-LHT65009"]
Fixed_Hameenpohjantie.shape

(40848, 2)

In [127]:
Fixed_Hameenpohjantie.isna().sum()

Humidity         414
Temperature_C    414
dtype: int64

In [128]:
Fixed_Hameenpohjantie  = Fixed_Hameenpohjantie.reset_index()
Fixed_Hameenpohjantie = Fixed_Hameenpohjantie[["Timestamp", "Temperature_C", "Humidity"]]

In [129]:
Fixed_Hameenpohjantie = Fixed_Hameenpohjantie.interpolate(method="linear", limit=3, limit_direction="both", limit_area="inside")
Fixed_Hameenpohjantie.isna().sum()

Timestamp         0
Temperature_C    81
Humidity         81
dtype: int64

In [130]:
Fixed_Hameenpohjantie = Fixed_Hameenpohjantie.reset_index()
Fixed_Hameenpohjantie = Fixed_Hameenpohjantie[["Timestamp", "Temperature_C", "Humidity"]]

In [131]:
# Save cleaned data to  new CSV file
Fixed_Hameenpohjantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hameenpohjantie.csv", index=False)

  Fixed_Hameenpohjantie.to_csv("D:\Fiveth\Marjetas_project\cleaned_datasets\LHT\Hameenpohjantie.csv", index=False)


In [135]:
# Sensor dictionary
sensor = {
   'Keltimaentie-LHT65013': Fixed_Keltimaentie,
    'Hikipolku-LHT65010': Fixed_Hikipolku,
    'Hameenpohjantie-LHT65009': Fixed_Hameenpohjantie,
    'Survontie-LHT65008': Fixed_Survontie,
    'Ritopohantie-LHT65007': Fixed_Ritopohantie,
    'Keilonkankaantie-LHT65005': Fixed_Keilonkankaantie }


In [136]:
summaries = {}
for name, df in sensor.items():
    Report_df = prepare_and_features(df)
    summaries[name] = summarize(Report_df)

Comperison_table = pd.DataFrame(summaries).T.round(1)
print("\n=== LHT per sensor summary ===")
print(Comperison_table.to_string())

# Creating difference vs network mean table:
# Making  list of columns to include in difference table
temp_hum_columns = [column for column in Comperison_table.columns if column not in ("rows",)]
difference_table = (Comperison_table[temp_hum_columns] - Comperison_table[temp_hum_columns].mean()).round(1)
print("\n=== Difference vs network mean ===")
print(difference_table.sort_values("T_mean", ascending=False).to_string())


=== LHT per sensor summary ===
                              rows  step_min  T_mean  T_min  T_max  RH_mean  RH_min  RH_max  DewPoint_mean  AbsHum_mean  VPD_peak_mean  VPD_peak_smooth  DTR_median  Temperature_24H  Humidity_24H  %RH>=90  %RH<=30
Keltimaentie-LHT65013      40848.0      60.0     5.4  -30.9   36.6     88.9    22.4   100.0            3.2          6.9            0.6              0.5         9.1              6.3          17.6     70.1      0.2
Hikipolku-LHT65010         40848.0      60.0     6.1  -30.5   37.9     87.2    15.9   100.0            3.5          7.1            0.8              0.7        10.8              8.2          20.5     66.7      0.7
Hameenpohjantie-LHT65009   40848.0      60.0     6.0  -31.4   38.0     86.4    16.0   100.0            3.4          7.0            0.9              0.7        10.0              7.7          23.0     63.7      0.9
Survontie-LHT65008         40848.0      60.0     5.6  -29.9   34.7     87.4    17.3   100.0            3.2          

<h3><center>VPD Scale What the Numbers Mean<h3>

**0-0.4 kPa	Air almost "full"	Very muggy, poor drying**<br>
**0.4-0.8 kPa	Comfortable range	Good for plants, comfortable**<br>
**0.8-1.2 kPa	Ideal for growth	Plants happy, good drying**<br>
**1.2-1.6 kPa	High drying power	Plants might stress**<br>
**1.6 kPa	Very "thirsty" air	Rapid drying, plant stress**

In [137]:
def prepare_all_sensors(sensor):
    prepared = {}
    summaries = []
    for site_name, raw_df in sensor.items():
        df = prepare_and_features(raw_df).copy()
        df["site"] = site_name

        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
        df["year"] = df["Timestamp"].dt.year
        df["month"] = df["Timestamp"].dt.month
        df["day"] = df["Timestamp"].dt.day
        prepared[site_name] = df

        s = summarize(df)
        s["site"] = site_name
        summaries.append(s)

    summary_df = pd.DataFrame(summaries).set_index("site")
    all_data = pd.concat(prepared.values(), ignore_index=True)
    return prepared, summary_df, all_data
prepared, summary_df, all_data = prepare_all_sensors(sensors)

In [138]:
all_data.head()

Unnamed: 0,Timestamp,Humidity,Temperature_C,DewPoint_C,AbsHum_gm3,VPD_kPa,hour,date,site,year,month,day
0,2021-01-08 00:03:52,93.5,-7.11,-7.881536,2.547816,0.021747,0,2021-01-08,Keltimaentie-LHT65013,2021,1,8
1,2021-01-08 01:03:52,93.7,-7.22,-7.966458,2.529987,0.020877,1,2021-01-08,Keltimaentie-LHT65013,2021,1,8
2,2021-01-08 02:03:53,93.9,-7.33,-8.051468,2.512251,0.020022,2,2021-01-08,Keltimaentie-LHT65013,2021,1,8
3,2021-01-08 03:03:53,94.1,-7.32,-8.017195,2.519701,0.019382,3,2021-01-08,Keltimaentie-LHT65013,2021,1,8
4,2021-01-08 05:03:53,94.1,-7.53,-8.226094,2.475936,0.01903,5,2021-01-08,Keltimaentie-LHT65013,2021,1,8


<h3><center>Fungal Risk Analysis<h3>

In [139]:
# Creating wet episodes function, to finds continuous periods when surface are wet enough for fungi to grow.
def find_wet_episodes(df_site, max_gap_factor=2.0):
    df = df_site.sort_values("Timestamp").copy()

   # calculating the time gap between each measurement
    dt = df["Timestamp"].diff().dt.total_seconds() / 3600.0
    median_step = dt.median()
    if np.isnan(median_step) or median_step <= 0:median_step = 1.0  
        
    # Define the weet condition: 
    # Humidity is over 90% and air temperature  is between dew point and 2 degrees and above 5 degrees that fungi can grow
    dew_diff = df["Temperature_C"]-df["DewPoint_C"]
    wet_mask = ((df["Humidity"] >= 90) &(dew_diff <= 2.0) &(df["Temperature_C"] >= 5.0))

    # Grouping continuous wet periods:
    # Not wet anymore and longer than max_gap
    max_gap = max_gap_factor * median_step
    new_block = ((~wet_mask)|(dt > max_gap))

    # cumulative id of segments
    block_id = new_block.cumsum()

    # Analyzing each wet episode: 
    # When it started, when it ended, how long it lasted in hours, average temperature during the wet period
    wet_blocks = (df[wet_mask].groupby(block_id[wet_mask]).agg(
            start_time=("Timestamp", "min"),end_time=("Timestamp", "max"),
            duration_h=("Timestamp", lambda x: (x.max() - x.min()).total_seconds() / 3600.0),
            T_mean=("Temperature_C", "mean")))

    return wet_blocks.reset_index(drop=True)

**Classifying we episode**

In [140]:
# After finding wet episodes, I will classify them based on their duration and temperature
def classify_fungal_risk(wet_blocks):
  
    if wet_blocks.empty:
        return {
            "episodes_total": 0,
            "episodes_6h_15to20C": 0,
            "episodes_12h_10to15C": 0,
            "episodes_24h_5to10C": 0,}
    # Warm 15-20: is needed shorter wet periods 6+ hours
    cond_6h_15to20 = (wet_blocks["duration_h"] >= 6) & (wet_blocks["T_mean"].between(15, 20))
    # medium temperature10-15: is need medium wet periods 12+ hours
    cond_12h_10to15 = (wet_blocks["duration_h"] >= 12) & (wet_blocks["T_mean"].between(10, 15))
    # Was it cool 5-10 AND wet for at least 24 hours
    cond_24h_5to10 = (wet_blocks["duration_h"] >= 24) & (wet_blocks["T_mean"].between(5, 10))

    return {
        "episodes_total": len(wet_blocks),
        "episodes_6h_15to20C": int(cond_6h_15to20.sum()),
        "episodes_12h_10to15C": int(cond_12h_10to15.sum()),
        "episodes_24h_5to10C": int(cond_24h_5to10.sum()),}


In [143]:
# Applying fungal risk classification per site
def fungal_risk_per_site(all_data):
    results = {}
    for site, df_site in all_data.groupby("site"):
        wet_blocks = find_wet_episodes(df_site)
        stats = classify_fungal_risk(wet_blocks)
        results[site] = stats
    return pd.DataFrame(results).T
results = fungal_risk_per_site(all_data)
results

Unnamed: 0,episodes_total,episodes_6h_15to20C,episodes_12h_10to15C,episodes_24h_5to10C
Hameenpohjantie-LHT65009,1051,127,87,24
Hikipolku-LHT65010,1002,99,103,25
Kaunisharjuntie-LHT65006,831,100,102,30
Keilonkankaantie-LHT65005,818,92,110,23
Keltimaentie-LHT65013,1024,106,116,25
Ritopohantie-LHT65007,1075,121,97,19
Survontie-LHT65008,858,110,114,31


<h3><center>Viusalizing VPD and Absolute Humidity per site<h3>

In [None]:
def _plot_single_day(df_day, site_summary):
  
    df_day = df_day.sort_values("Timestamp")
    hours = df_day["Timestamp"].dt.hour

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Temperature
    fig.add_trace( go.Scatter(x=hours,
            y=df_day["Temperature_C"],mode="lines+markers",name="Temp (°C)"),secondary_y=False)

    # VPD
    fig.add_trace(
        go.Scatter(x=hours,y=df_day["VPD_kPa"],mode="lines+markers",name="VPD (kPa)",line=dict(dash="dot")),secondary_y=False)

    # RH
    fig.add_trace(
        go.Scatter(x=hours,y=df_day["Humidity"],mode="lines",name="RH (%)"),secondary_y=True)

    # Absolute Humidity
    fig.add_trace(go.Scatter(
            x=hours,y=df_day["AbsHum_gm3"], mode="lines",name="AbsHum (g/m³)",line=dict(dash="dash")),secondary_y=True)

    # Build metrics text from site_summary (your long-term stats)
    txt = (
        f"{site_summary.name}<br>"
        f"VPD_peak_smooth: {site_summary['VPD_peak_smooth']:.2f} kPa<br>"
        f"DTR_median: {site_summary['DTR_median']:.2f} °C<br>"
        f"Temperature_24H: {site_summary['Temperature_24H']:.2f} °C<br>"
        f"Humidity_24H: {site_summary['Humidity_24H']:.2f} %-pts<br>"
        f"%RH>=90: {site_summary['%RH>=90']:.1f}%<br>"
        f"%RH<=30: {site_summary['%RH<=30']:.1f}%"
    )

    fig.update_layout(
        title="24h profile (selected day)",xaxis_title="Hour of day",annotations=[dict(
                x=0.01, y=1.15, xref="paper", yref="paper",text=txt,
                showarrow=False,align="left",bgcolor="rgba(255,255,255,0.9)")])
    

    fig.update_yaxes(title_text="Temp (°C) & VPD (kPa)", secondary_y=False)
    fig.update_yaxes(title_text="RH (%) & AbsHum (g/m³)", secondary_y=True)

    return fig


In [None]:
def _plot_month_overview(df_month, site, year, month):
    daily = (df_month
        .groupby("day").agg(
            T_mean=("Temperature_C", "mean"),VPD_mean=("VPD_kPa", "mean"),
            RH90_pct=("Humidity", lambda x: (x >= 90).mean() * 100),).reset_index())

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Bar(x=daily["day"], y=daily["T_mean"],name="Temp (°C)"),
        secondary_y=False)

    fig.add_trace(
        go.Scatter(
            x=daily["day"],y=daily["VPD_mean"],
            mode="lines+markers",name="VPD (kPa)",line=dict(dash="dot")), secondary_y=False)

    fig.add_trace(
        go.Scatter(
            x=daily["day"],y=daily["RH90_pct"],
            mode="lines+markers",name="%RH>=90"),secondary_y=True)

    fig.update_layout(
        title=f"{site} – Daily overview for {year}-{month:02d}", xaxis_title="Day of month")
    
    fig.update_yaxes(title_text="Temp (°C) & VPD (kPa)", secondary_y=False)
    fig.update_yaxes(title_text="%RH>=90", secondary_y=True)

    return fig


In [None]:
def _plot_year_overview(df_year, site, year):
    monthly = (df_year
        .groupby("month").agg(
            T_mean=("Temperature_C", "mean"),
            VPD_mean=("VPD_kPa", "mean"),
            RH90_pct=("Humidity", lambda x: (x >= 90).mean() * 100),).reset_index())

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    
    # Temperature
    fig.add_trace(go.Scatter(
            x=monthly["month"],y=monthly["T_mean"], mode="lines+markers",name="Temp (°C)"),secondary_y=False)

    # VPD
    fig.add_trace(go.Scatter(
            x=monthly["month"],y=monthly["VPD_mean"], mode="lines+markers",name="VPD (kPa)", line=dict(dash="dot")), secondary_y=False)

    # %RH>=90
    fig.add_trace(go.Scatter(
            x=monthly["month"],y=monthly["RH90_pct"],mode="lines+markers",name="%RH>=90",),secondary_y=True)

    fig.update_layout(
        title=f"{site} – Monthly overview for {year}",xaxis_title="Month")
    
    fig.update_yaxes(title_text="Temp (°C) & VPD (kPa)", secondary_y=False)
    fig.update_yaxes(title_text="%RH>=90", secondary_y=True)

    return fig


In [None]:
def build_daily_base_explanation(df_day, site_summary):
    """
    Make a physics-correct, fairly detailed explanation string
    for ONE site on ONE day. DeepSeek will only rewrite this.
    """

    df_day = df_day.sort_values("Timestamp").copy()
    site_name = site_summary.name
    date_str = df_day["Timestamp"].dt.date.iloc[0].isoformat()

    # Basic stats
    T_min = df_day["Temperature_C"].min()
    T_max = df_day["Temperature_C"].max()
    T_mean = df_day["Temperature_C"].mean()

    RH_min = df_day["Humidity"].min()
    RH_max = df_day["Humidity"].max()
    RH_mean = df_day["Humidity"].mean()
    pct_RH90 = (df_day["Humidity"] >= 90).mean() * 100
    pct_RH30 = (df_day["Humidity"] <= 30).mean() * 100

    VPD_min = df_day["VPD_kPa"].min()
    VPD_max = df_day["VPD_kPa"].max()
    VPD_mean = df_day["VPD_kPa"].mean()

    AbsHum_min = df_day["AbsHum_gm3"].min()
    AbsHum_max = df_day["AbsHum_gm3"].max()

    # Compare to long-term site stats
    T_ref = site_summary["Temperature_24H"]
    H_ref = site_summary["Humidity_24H"]
    VPD_ref = site_summary["VPD_peak_smooth"]
    RH90_ref = site_summary["%RH>=90"]

    # Tiny classifications (pure Python, no LLM guessing)
    if T_mean > T_ref + 2:
        temp_comment = "This day was warmer than the typical day at this site."
    elif T_mean < T_ref - 2:
        temp_comment = "This day was cooler than the typical day at this site."
    else:
        temp_comment = "This day was close to the typical temperature at this site."

    if pct_RH90 > RH90_ref + 5:
        humidity_comment = "The air was more often very humid (RH≥90%) than usual."
    elif pct_RH90 < RH90_ref - 5:
        humidity_comment = "There were fewer very humid hours than the long-term average."
    else:
        humidity_comment = "The share of very humid hours (RH≥90%) was close to the long-term average."

    if VPD_max < 0.2:
        vpd_comment = "Drying power (VPD) stayed very weak all day, so wet surfaces would dry slowly."
    elif VPD_max < 0.6:
        vpd_comment = "Drying power (VPD) was mostly weak to moderate, so drying was possible but not very fast."
    else:
        vpd_comment = "There were periods with stronger drying power (higher VPD), especially around the warmest hours."

    base = []

    base.append(
        f"At {site_name} on {date_str}, air temperature ranged from about {T_min:.1f} °C "
        f"to {T_max:.1f} °C, with a daily mean of {T_mean:.1f} °C."
    )
    base.append(
        f"Relative humidity varied between roughly {RH_min:.0f}% and {RH_max:.0f}%, "
        f"with a mean near {RH_mean:.0f}%, about {pct_RH90:.0f}% of hours at or above 90% "
        f"and about {pct_RH30:.0f}% of hours at or below 30%."
    )
    base.append(
        f"VPD values were between about {VPD_min:.2f} and {VPD_max:.2f} kPa "
        f"(daily mean ≈ {VPD_mean:.2f} kPa), and absolute humidity ranged from "
        f"about {AbsHum_min:.1f} to {AbsHum_max:.1f} g/m³."
    )
    base.append(
        f"Compared to its long-term behaviour, this site has a typical 24-h mean temperature of "
        f"{T_ref:.1f} °C, a median diurnal range of {site_summary['DTR_median']:.1f} °C "
        f"and about {RH90_ref:.1f}% of hours with RH≥90%."
    )
    base.append(temp_comment)
    base.append(humidity_comment)
    base.append(vpd_comment)

    return " ".join(base)


In [None]:
import re

def build_daily_deepseek_prompt(base_explanation: str) -> str:
    """
    Turn the physics-correct daily summary into a rewrite task.
    DeepSeek should only rephrase, not reason or chat.
    """

    # Try to recover "At <site> on <date>" from the first sentence
    m = re.search(r"At (.+?) on (\d{4}-\d{2}-\d{2})", base_explanation)
    if m:
        site_name = m.group(1)
        date_str = m.group(2)
        first_start = f"At {site_name} on {date_str},"
    else:
        first_start = ""

    prompt = (
        "You are rewriting a short daily weather summary.\n"
        "Goal: write ONE clear explanation that a non-technical person in Jyväskylä can read.\n"
        "Rules:\n"
        "- Use ONLY the information in the summary below.\n"
        "- Keep ALL numbers exactly the same (°C, %, kPa, g/m³, hours).\n"
        "- Keep the physical meaning exactly the same:\n"
        "  * high humidity + low VPD = slow drying\n"
        "  * higher VPD = stronger drying (if humidity is not extreme).\n"
        "- Do NOT talk about rewriting, prompts, summaries, or what you are doing.\n"
        "- Do NOT use words like 'I', 'me', 'my', 'we', 'let me', or 'I'll'.\n"
    )

    if first_start:
        prompt += (
            f"- Your FIRST sentence must start with: \"{first_start}\"\n"
        )

    prompt += (
        "- Write 6–8 sentences in ONE paragraph (about 90–140 words).\n"
        "- Speak as a neutral narrator explaining what the day felt like.\n\n"
        "SUMMARY:\n"
        f"{base_explanation}\n\n"
        "Final explanation:"
    )

    return prompt



In [None]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    temperature=0.3,
    top_p=0.9,
    return_full_text=True,
    )


NameError: name 'pipeline' is not defined

In [None]:
def deepseek_explain(prompt: str, max_new_tokens: int = 260) -> str:
    import re

    out = generator(
        prompt,
        max_new_tokens=max_new_tokens,
    )
    text = out[0]["generated_text"]

    # --- 1) Remove ALL <think>...</think> blocks (if any) ---
    while True:
        start = text.find("<think>")
        if start == -1:
            break
        end = text.find("</think>", start)
        if end == -1:
            text = text[:start]
            break
        text = text[:start] + text[end + len("</think>"):]

    # --- 2) Keep only what comes after the last 'Final explanation:' ---
    marker = "Final explanation:"
    idx = text.rfind(marker)
    if idx != -1:
        text = text[idx + len(marker):]

    text = text.strip()

    # --- 3) Drop any leftover meta-sentences at the start ---
    # e.g. "Alright, I need to rewrite this..." or "[Your rewritten explanation here]"
    text = text.replace("[Your rewritten explanation here]", "").strip()

    sentences = re.split(r'(?<=[.!?])\s+', text)

    def is_weather_sentence(s: str) -> bool:
        s_low = s.lower()
        # keep sentences that talk about conditions, not about "rewriting" or "summary"
        keywords = [
            "temperature", "humid", "vpd", "dew", "drying",
            "air", "day", "morning", "afternoon", "evening",
            "surface", "ground", "plants", "wet", "dry"
        ]
        bad_words = ["rewrite", "summary", "prompt", "explanation", "i ", "let me"]

        if any(b in s_low for b in bad_words):
            return False
        return any(k in s_low for k in keywords)

    weather_sents = [s for s in sentences if s.strip() and is_weather_sentence(s)]

    if weather_sents:
        text = " ".join(weather_sents)
    else:
        # fallback: if filtering was too aggressive, keep the original text
        text = " ".join(sentences)

    # --- 4) Hard-limit to 8 sentences max ---
    sentences = re.split(r'(?<=[.!?])\s+', text)
    text = " ".join(sentences[:8]).strip()

    return text


In [None]:
def wrap_for_plotly(text: str, width: int = 90) -> str:
    """
    Insert <br> line breaks so the annotation does not run off the figure.
    width = max characters per line.
    """
    words = text.split()
    lines = []
    line = []

    current_len = 0
    for w in words:
        wlen = len(w) + 1  # +1 for space
        if current_len + wlen > width and line:
            lines.append(" ".join(line))
            line = [w]
            current_len = wlen
        else:
            line.append(w)
            current_len += wlen

    if line:
        lines.append(" ".join(line))

    return "<br>".join(lines)


In [None]:
def build_drilldown_figure(all_data, summary_df, site, year=None, month=None, day=None):
    # 1) Filter by site
    df_site = all_data[all_data["site"] == site].copy()
    if df_site.empty:
        return go.Figure().update_layout(title="No data for this site")

    # 2) Filter by year
    if year is not None:
        df_site = df_site[df_site["year"] == year]
        if df_site.empty:
            return go.Figure().update_layout(title=f"No data for {site} in {year}")

    # 3) Filter by month
    if month is not None:
        df_site = df_site[df_site["month"] == month]
        if df_site.empty:
            return go.Figure().update_layout(title=f"No data for {site} in {year}-{month:02d}")

    # 4) If a day is chosen: show 24h profile + DeepSeek explanation
    if day is not None:
        df_day = df_site[df_site["day"] == day]
        if df_day.empty:
            return go.Figure().update_layout(
                title=f"No data for {site} on {year}-{month:02d}-{day:02d}"
            )

        # Base fig (your existing daily plot)
        fig = _plot_single_day(df_day, summary_df.loc[site])

        # ---- build explanation text ----
        base = build_daily_base_explanation(df_day, summary_df.loc[site])
        prompt = build_daily_deepseek_prompt(base)
        explanation = deepseek_explain(prompt)
        explanation_wrapped = wrap_for_plotly(explanation, width=90)

        # ---- add annotation under the plot ----
        fig.add_annotation(
            x=0.0, y=-0.25,               # a bit under the x-axis
            xref="paper", yref="paper",
            xanchor="left", yanchor="top",
            text=explanation_wrapped,
            showarrow=False,
            align="left",
            bgcolor="rgba(255,255,255,0.9)",
            bordercolor="rgba(0,0,0,0.2)",
            borderwidth=1,
            borderpad=4,
            font=dict(size=11),
        )

        # More bottom space so the text is not cut
        fig.update_layout(margin=dict(b=150))

        return fig

    # 5) If only month or year is chosen: keep your existing overviews
    if month is not None:
        return _plot_month_overview(df_site, site, year, month)

    if year is not None:
        return _plot_year_overview(df_site, site, year)

    # 6) No year selected yet
    return go.Figure().update_layout(title=f"Select a year for site: {site}")


In [None]:

def create_drilldown_dashboard(all_data, summary_df):

    sites = sorted(all_data["site"].unique())
    if not sites:
        print("No sites found in all_data.")
        return

    site_dd = widgets.Dropdown(options=sites, description="Site:")
    year_dd = widgets.Dropdown(options=[], description="Year:")
    month_dd = widgets.Dropdown(options=[], description="Month:")
    day_dd = widgets.Dropdown(options=[], description="Day:")

    out = widgets.Output()

    def refresh_figure():
        with out:
            clear_output(wait=True)

            site = site_dd.value
            year = year_dd.value
            month = month_dd.value
            day = day_dd.value

         
            selected_year = year
            selected_month = None if (month in (None, "All")) else month
            selected_day = None if (day in (None, "All")) else day

            fig = build_drilldown_figure(all_data, summary_df, site=site, year=selected_year, month=selected_month, day=selected_day)
            display(fig)


    def on_site_change(change):
        site = change["new"]
        if site is None:
            return

        df_site = all_data[all_data["site"] == site]
        years = sorted(df_site["year"].unique())

        year_dd.options = years
        year_dd.value = years[0] if years else None

        month_dd.options = []
        month_dd.value = None
        day_dd.options = []
        day_dd.value = None

        refresh_figure()

    def on_year_change(change):
        site = site_dd.value
        year = change["new"]

        if site is None or year is None:
            month_dd.options = []
            month_dd.value = None
            day_dd.options = []
            day_dd.value = None
            refresh_figure()
            return

        df_sy = all_data[(all_data["site"] == site) & (all_data["year"] == year)]
        months = sorted(df_sy["month"].unique())

        month_dd.options = ["All"] + months
        month_dd.value = "All"

        day_dd.options = []
        day_dd.value = None
        refresh_figure()
        
    def on_month_change(change):
        site = site_dd.value
        year = year_dd.value
        month = change["new"]

        if site is None or year is None:
            return

        if month in (None, "All"):
            day_dd.options = []
            day_dd.value = None
            refresh_figure()
            return

        df_sym = all_data[
            (all_data["site"] == site) &
            (all_data["year"] == year) &
            (all_data["month"] == month)]
        days = sorted(df_sym["day"].unique())

        day_dd.options = ["All"] + days
        day_dd.value = "All"

        refresh_figure()


    def on_day_change(change): refresh_figure()
    site_dd.observe(on_site_change, names="value")
    year_dd.observe(on_year_change, names="value")
    month_dd.observe(on_month_change, names="value")
    day_dd.observe(on_day_change, names="value")
    on_site_change({"new": sites[0]})
    controls = widgets.VBox([site_dd, year_dd, month_dd, day_dd])
    display(widgets.VBox([controls, out]))


In [None]:
create_drilldown_dashboard(all_data, summary_df)

VBox(children=(VBox(children=(Dropdown(description='Site:', options=('LHT65005-Keilonkankaantie', 'LHT65006-Ka…

In [None]:
# https://chatgpt.com/c/690da02e-e24c-8328-956a-1846a27700b6

In [None]:
# http://chatgpt.com/c/68fced88-8ef8-832b-8418-c16f776ed93f

**Kotaniementie WS100 +LHT65006-Kaunisharjuntie**

In [None]:
# LHT says: most humid RH is around 89%, lowest midday dryness, VPD_mid is around 0.3 kPa, lots of time is more than 90% RH.
# WS100 says: highest short-burst intensity in the network, peaks up to is around 184 mm/h overall and more snow/mix share 20%.
# Moist air pooling in the strait + slope/terrain near Kanavuori

**North/NE:Kaakkovuorentie (WS100)+LHT65007-Ritopohantie urban fringe**

In [None]:
# LHT says: Warmest and biggest day night swing it has largest DTR, drier afternoons.
# WS100 says: Frequent sharp spikes peaks is around 111 mm/h rain heavy snow share is around 7 %.

**Central West Tuulimyllyntie (WS100) + Keltimaentie-LHT65013/Hikipolku-65010/Hameenpohjantie-65009/Survontie-65008 city + lake mix**

In [None]:
# LHT says: Humid but moderate dryness VPD_mid is around 0.4–0.6 kPa, mid diurnal swings.
# WS100 says: Wettest totals is around 2499 mm and largest single event is around 101 mm
# Sits on the West to East storm path

**South/SW Keilonkankaantie-LHT65005 no WS100 here**

In [None]:
# LHT says: Driest midday  VPD_mid is around 1.6 kPa.
# Best for dry air alertsL comfort, drying, static and as a contrast to the humid strait site.

**Saaritie & Tähtiniementie (shoreline south)**

In [None]:
# Moderate peaks is around 59–83 mm/h and steady long events.

**Flashy downpours:trigger at each site's p95–p99 10-min rate, highest at Kotaniementie and Kaakkovuorentie.**

**Fog/black ice risk is high for Kotaniementie/LHT65006-Kaunisharjuntie, If rain ends and LHT is more than 95% with Temperatrure is near 0 degree**

**Tuulimyllyntie event total is more 35–40 mm and LHT VPD_mid is bigger than 0.6 kPa, expect persistent wet surfaces**

**Wind share of 10 min records; mean speeds**<br>
NW: 21.7% · 3.46 m/s

W: 12.0% · 2.84 m/s

S: 16.4% · 2.20 m/s

SE: 16.6% · 2.15 m/s

N: 9.7% · 3.57 m/s

NE: 6.4% · 3.52 m/s

E: 7.8% · 3.25 m/s

**Hills help showers pop. When air is pushed up a ridge (like Kanavuori, 200 m), it cools and can squeeze out quick downpours. That lift is a normal effect of hills called orographic rain.**

**Big weather usually moves west → east in Finland. Our weather sits in the westerlies, so many rain bands travel from the west toward the east, lining up with the corridor you noticed**