In [2]:
import pandas as pd 
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import joblib
import sys

# Locate the project root by walking up from the working directory until a
# directory containing config/config.py is found. A fixed `parents[1]` is two
# levels above the cwd, which overshoots the project whenever the notebook is
# run from <project>/notebooks.
_cwd = Path.cwd().resolve()
for _candidate in (_cwd, *_cwd.parents):
    if (_candidate / "config" / "config.py").is_file():
        PROJECT_ROOT = _candidate
        break
else:
    # No config package found on the way up: keep the previous behaviour.
    PROJECT_ROOT = _cwd.parents[1]

# Guarded so that re-running the cell does not pile up duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print(Path.cwd())


# Now import
from config.config import MODEL_DIR


/home/jonas-limpert/Projects/EnviroSentinel/notebooks


ImportError: cannot import name 'MODEL_PATH' from 'config.config' (/home/jonas-limpert/Projects/EnviroSentinel/config/config.py)

In [None]:
data_path = Path("/home/jonas-limpert/Projects/EnviroSentinel/data/processed_data")

# Load one DataFrame per CSV file, keyed by the file name up to its first dot.
# - Only regular *.csv files are read, so sub-directories or stray files in the
#   folder no longer crash pd.read_csv.
# - Files are visited in sorted order: iterdir() yields entries in arbitrary
#   order, and the frames are later concatenated and split by row position,
#   so the load order has to be deterministic.
csv_files = sorted(
    entry
    for entry in data_path.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)
dfs = {csv_file.name.split(".")[0]: pd.read_csv(csv_file) for csv_file in csv_files}
print(list(dfs))

In [None]:
# Isolation Forest configuration, one hyper-parameter per line.
# NOTE: `model` is assigned again (contamination=0.05) in the training cell
# further down, which replaces this instance.
model = IsolationForest(
    random_state=42,         # fixed seed
    contamination=0.01,      # estimated fraction of outliers
    max_samples='auto',      # subsampling size per tree
    n_estimators=100,        # number of trees
)

In [None]:
# One encoder per column, each fitted ONCE on the values of all stations.
# Re-fitting a single encoder inside the per-station loop made every code
# frame-local: `station_encoded` was always 0.0 (each frame holds exactly one
# station name) and the same `sound_bin` category could receive different codes
# in different stations.
sound_bin_encoder = OrdinalEncoder()
station_encoder = OrdinalEncoder()
encoder = station_encoder  # alias: the original cell left `encoder` fitted on 'station'

# Tag every frame with the name it was loaded under.
for df_name, df in dfs.items():
    df['station'] = df_name

if dfs:
    sound_bin_encoder.fit(
        pd.concat([df[['sound_bin']] for df in dfs.values()], ignore_index=True)
    )
    station_encoder.fit(pd.DataFrame({'station': list(dfs)}))

    for df in dfs.values():
        # transform() returns an (n, 1) array; flatten it to a plain column.
        df['sound_bin_encoded'] = sound_bin_encoder.transform(df[['sound_bin']]).ravel()
        df['station_encoded'] = station_encoder.transform(df[['station']]).ravel()

In [None]:
# Station kept out of the combined frame.
HOLDOUT_STATION = 'Station_Kaiser'

# Look the frame up instead of pop()-ing it: pop() removed the key from `dfs`,
# so running this cell a second time raised KeyError.
validation_df = dfs[HOLDOUT_STATION]

# Stack every other station into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(
    [df for name, df in dfs.items() if name != HOLDOUT_STATION],
    ignore_index=True,
)
    

## Train/validation split, scaling and Isolation Forest training

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Positional 80/20 split of the combined frame: the first 80 % of the rows are
# used for fitting, the remaining rows for validation.
# NOTE(review): this rebinds `validation_df`, replacing the frame assigned in
# the previous cell - confirm that is intended.
split_idx = int(0.8 * len(combined_df))
train_df, validation_df = (
    combined_df.iloc[:split_idx].copy(),
    combined_df.iloc[split_idx:].copy(),
)

# Every numeric column of the combined frame is used as a model feature.
numeric_columns = list(combined_df.select_dtypes(include='number').columns)

# Scaling statistics come from the training rows only and are then applied to
# both partitions.
scaler = StandardScaler().fit(train_df[numeric_columns])
X_train_scaled = scaler.transform(train_df[numeric_columns])
X_val_scaled = scaler.transform(validation_df[numeric_columns])

model = IsolationForest(contamination=0.05, random_state=42).fit(X_train_scaled)

# Store the model's prediction for every row of both partitions.
train_df['anomaly'] = model.predict(X_train_scaled)
validation_df['anomaly'] = model.predict(X_val_scaled)

validation_df.to_csv(
    "/home/jonas-limpert/Projects/EnviroSentinel/data/validation_df.csv",
    index=False,
)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Use the parsed timestamps as the index. Guarded and not in-place so the cell
# can be re-run: after the first run 'timestamp' is no longer a column, and the
# unguarded version raised KeyError.
if 'timestamp' in validation_df.columns:
    validation_df = validation_df.assign(
        timestamp=pd.to_datetime(validation_df['timestamp'])
    ).set_index('timestamp')

sensors_to_explore = [
    'value_Luftfeuchte',
    'value_Lautstärke',
    'value_Temperatur',
]

# Rows the model labelled -1; identical for every sensor, so filter once.
anomalies = validation_df[validation_df['anomaly'] == -1]

# One figure per sensor: the full series in gray, flagged rows in red.
for sensor in sensors_to_explore:
    if sensor not in validation_df.columns:
        continue  # sensor not present in this data set

    fig, ax = plt.subplots(figsize=(15, 4))
    ax.plot(validation_df.index, validation_df[sensor], label=sensor, color='gray')
    ax.scatter(anomalies.index, anomalies[sensor], color='red', label='Anomaly', s=20)
    ax.set_title(f"Anomaly Detection: {sensor}")
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(sensor)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure once it has been rendered



In [None]:

# Step 1: Ensure timestamp is datetime and used as index.
# Guarded and not in-place so the cell can be re-run: after the first run
# 'timestamp' is no longer a column, and the unguarded version raised KeyError.
if 'timestamp' in train_df.columns:
    train_df = train_df.assign(
        timestamp=pd.to_datetime(train_df['timestamp'])
    ).set_index('timestamp')

# Step 2: Define which sensors to visualize
sensors_to_explore = [
    'value_Luftfeuchte',
    'value_Lautstärke',
    'value_Temperatur',
]

# Step 3: Loop over each station and each sensor
for station_name in train_df['station'].unique():
    station_df = train_df[train_df['station'] == station_name]
    # Rows the model labelled -1 for this station; the same for every sensor,
    # so the filter is applied once per station rather than once per figure.
    anomalies = station_df[station_df['anomaly'] == -1]

    for sensor in sensors_to_explore:
        if sensor not in station_df.columns:
            continue  # sensor not present in this data set

        fig, ax = plt.subplots(figsize=(15, 4))
        ax.plot(station_df.index, station_df[sensor], label=sensor, color='gray')
        ax.scatter(anomalies.index, anomalies[sensor], color='red', label='Anomaly', s=20)
        ax.set_title(f"Anomaly Detection: {sensor} – {station_name}")
        ax.set_xlabel("Timestamp")
        ax.set_ylabel(sensor)
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure once it has been rendered



In [None]:
from pathlib import Path
import joblib

# Directory that receives the persisted artefacts; created if it is missing.
ModelPath = Path(PROJECT_ROOT).joinpath('model')
ModelPath.mkdir(exist_ok=True, parents=True)
print(ModelPath)

# Persist the fitted Isolation Forest ...
model_file = ModelPath / 'isolation_forest_model.pkl'
joblib.dump(model, model_file)

# ... and the scaler that was fitted alongside it.
scaler_file = ModelPath / 'scaler.pkl'
joblib.dump(scaler, scaler_file)