In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from cartopy import crs as ccrs
from cartopy import feature as cfeature
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.interpolate import griddata
from scipy.spatial import cKDTree

from joblib import Parallel, delayed
from typing import List, Tuple

In [None]:
# Load the equipment table.
# 'Postleitzahl' is read as text: postal codes are identifiers, and numeric
# parsing would drop leading zeros (and turn the column into floats as soon as
# a single value is missing).
equipment = pd.read_csv('../data/equipment.csv', dtype={'Postleitzahl': str})

In [None]:
# Print a summary of the loaded equipment table (columns, non-null counts, dtypes)
equipment.info()

In [None]:
# Cast the postal codes to strings: they are padded below and later used as
# keys for the coordinate lookup
equipment['Postleitzahl'] = equipment['Postleitzahl'].astype(str)

def expand_plz(plz: str) -> str:
    """Left-pad a postal code with zeros to a width of five characters.

    Codes that already have five or more characters are returned unchanged.
    Padding to the full width (instead of prepending a single '0') also
    repairs codes that lost more than one leading zero.

    Args:
        plz: Postal code as a string.

    Returns:
        The postal code, at least five characters long.
    """
    return plz.rjust(5, '0')

# Normalise every postal code element-wise with expand_plz
equipment['Postleitzahl'] = equipment['Postleitzahl'].map(expand_plz)

In [None]:
# Preview the first rows of the equipment table after the postal-code normalisation
equipment.head()

In [None]:
# Read the CSV file that maps postal codes to coordinates.
# Result: {postal code (str): (latitude, longitude)}
plz_koordinaten = {}
# newline='' is what the csv module asks for, so that line endings and quoted
# fields are handled identically on every platform.
with open('../data/plz_geocoord.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines; nothing to unpack
        plz, lat, lon = row
        plz_koordinaten[plz] = (float(lat), float(lon))


In [None]:
# Coordinate lookup helper
def get_coordinates(plz: str) -> Tuple[float, float]:
    """Return the (lat, lon) pair stored for ``plz`` in ``plz_koordinaten``.

    Postal codes that are not in the mapping yield ``(nan, nan)``.
    """
    if plz in plz_koordinaten:
        return plz_koordinaten[plz]
    return (np.nan, np.nan)

# Add the latitude / longitude columns.
# The (lat, lon) tuples are expanded with a single DataFrame construction
# instead of .apply(pd.Series), which builds one Series object per row.
# Naming the columns explicitly also keeps this working for an empty table.
equipment[['lat', 'lon']] = pd.DataFrame(
    equipment['Postleitzahl'].map(get_coordinates).tolist(),
    columns=['lat', 'lon'],
    index=equipment.index,  # keep row alignment with the equipment table
)

In [None]:
# Preview the first rows, which now include the lat / lon columns
equipment.head()

In [None]:
# Load the weather scores table from CSV
weather_csv = pd.read_csv('../data/weather_scores.csv')

In [None]:
# Wrap the weather table that was read from CSV in its own DataFrame
weather_data = pd.DataFrame(weather_csv)

# 100 x 100 lat/lon mesh spanning the bounding box of the equipment locations
lat_values = np.linspace(equipment['lat'].min(), equipment['lat'].max(), 100)
lon_values = np.linspace(equipment['lon'].min(), equipment['lon'].max(), 100)
lat_grid, lon_grid = np.meshgrid(lat_values, lon_values)

# Linear interpolation of the weather score onto the mesh (visualisation only)
weather_grid = griddata(
    (weather_data['lat'], weather_data['lon']),
    weather_data['weather_score_pca'],
    (lat_grid, lon_grid),
    method='linear',
)

# Filled contour plot of the interpolated score (latitude on x, longitude on y)
score_fig, score_ax = plt.subplots(figsize=(5, 5))
score_contours = score_ax.contourf(lat_grid, lon_grid, weather_grid, cmap='coolwarm')
score_fig.colorbar(score_contours, ax=score_ax, label='Weather score')
score_ax.set_xlabel('Latitude')
score_ax.set_ylabel('Longitude')
score_ax.set_title('Weather score Contour Plot')
plt.show()

# KD-tree over the weather sample coordinates for fast nearest-neighbour lookup
tree = cKDTree(weather_data[['lat', 'lon']].to_numpy())

# Define a function to interpolate weather data based on latitude and longitude
def interpolate_data(latitude, longitude, tree_object, data, k=3, value_column='weather_score_pca'):
    """Estimate a value at one point by inverse-distance weighting.

    The ``k`` nearest points in ``tree_object`` are looked up and their
    ``value_column`` entries in ``data`` are averaged with weights ``1 / distance``.

    Args:
        latitude: Latitude of the query point.
        longitude: Longitude of the query point.
        tree_object: KD-tree built from the coordinates of ``data``; the
            indices it returns are used as row positions into ``data``.
        data: DataFrame holding ``value_column``, in the same row order as the
            points in the tree.
        k: Number of nearest neighbours to use.
        value_column: Column of ``data`` to interpolate.

    Returns:
        The value of an exactly matching point if there is one, otherwise the
        inverse-distance-weighted mean of the neighbours. NaN if a coordinate
        is NaN or no neighbour is available.
    """
    # A NaN coordinate has no meaningful neighbours; answer NaN without querying.
    if np.isnan(latitude) or np.isnan(longitude):
        return np.nan

    dist, idx = tree_object.query([latitude, longitude], k=k)  # Find the k nearest neighbors
    # query() returns scalars when k == 1; work on 1-D arrays in every case.
    dist = np.atleast_1d(dist)
    idx = np.atleast_1d(idx)

    if np.any(np.isnan(dist)):
        return np.nan

    # If the tree holds fewer than k points, the missing neighbours come back
    # with an infinite distance and an out-of-range index; discard them.
    found = np.isfinite(dist)
    if not found.any():
        return np.nan
    dist = dist[found]
    idx = idx[found]

    # Values of the nearest neighbours (positional lookup, like .iloc)
    nearest_score = data[value_column].to_numpy()[idx]

    # The query point coincides with a sample point: return that sample's
    # value directly, which also avoids dividing by zero below.
    exact = dist == 0
    if exact.any():
        return nearest_score[exact][0]

    # Inverse distance weighting
    weights = 1 / dist
    return np.dot(weights, nearest_score) / np.sum(weights)

# Score every equipment row; rows without coordinates keep NaN
def _row_weather_score(row):
    """Return the interpolated weather score for one row, or NaN if lat/lon is missing."""
    if np.isnan(row['lat']) or np.isnan(row['lon']):
        return np.nan
    return interpolate_data(row['lat'], row['lon'], tree, weather_data)

equipment['weather_score'] = equipment.apply(_row_weather_score, axis=1)

In [None]:
# Summarise the table again; the non-null counts show how many rows have
# lat / lon and a weather_score
equipment.info()

In [None]:
# Persist the table including the lat, lon and weather_score columns.
# The path is handed to pandas directly so that pandas opens the file itself:
# a text handle opened without newline='' can produce doubled line endings on
# some platforms, and open() would use the locale encoding instead of UTF-8.
# NOTE: this overwrites the file the notebook reads as its input.
equipment.to_csv('../data/equipment.csv', index=False)

In [None]:
# Clustering input: the four selected columns, keeping only rows in which
# none of them is missing (independent copy of the equipment data)
X_train = equipment[["Equipment", "lat", "lon", "weather_score"]].dropna(axis=0).copy()

X_train.info()

In [None]:
# # Scale all features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)

# X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [None]:
# Define the parameter grid
# eps candidates 0.5, 0.6, ..., 1.9. linspace with an explicit number of points
# is used instead of arange with a float step, whose length and values depend
# on floating-point rounding; the final round removes representation noise.
eps_values = np.round(np.linspace(0.5, 1.9, 15), 1)
# min_samples candidates 75, 85, ..., 195
min_samples_values = range(75, 200, 10)

print("Number of variations to test", len(eps_values) * len(min_samples_values))

In [None]:
def evaluate_dbscan(eps: float, min_samples: int, X: pd.DataFrame) -> Tuple[float, int, float, int]:
    """Fit DBSCAN with one parameter combination and score the result.

    Args:
        eps: Neighbourhood radius passed to DBSCAN.
        min_samples: Minimum neighbourhood size passed to DBSCAN.
        X: Feature matrix to cluster.

    Returns:
        ``(eps, min_samples, score, n_clusters)``. ``n_clusters`` counts the
        clusters found, not including the noise label -1. ``score`` is the
        silhouette score, or -1 when no more than two clusters were found.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

    # DBSCAN labels noise points with -1; noise is not a cluster and must not
    # be counted as one.
    n_clusters = len(set(labels) - {-1})

    # Silhouette Score requires at least 2 clusters, however, 2 clusters is not useful for our case
    if n_clusters > 2:
        # NOTE: noise points are still passed to silhouette_score under their
        # own label (-1), as before.
        score = silhouette_score(X, labels)
    else:
        score = -1  # Sentinel: too few clusters to be of interest

    return (eps, min_samples, score, n_clusters)

In [None]:
# # Perform parallel grid search
# results = Parallel(n_jobs=-1)(delayed(evaluate_dbscan)(eps, min_samples, X_train[["lat", "lon", "weather_score"]])
#                               for eps in eps_values
#                               for min_samples in min_samples_values)

# # Convert results to a DataFrame for easier analysis
# results_df = pd.DataFrame(results, columns=['eps', 'min_samples', 'score', 'n_clusters'])

# # Display results
# print(results_df)

In [None]:
# # Run DBSCAN
# # Identify the best combination of parameters
# best_result = results_df.loc[results_df['score'].idxmax()]
# print(best_result)

# dbscan = DBSCAN(eps=best_result["eps"], min_samples=int(best_result["min_samples"]))
# # dbscan = DBSCAN(eps=0.3, min_samples=75)
# labels = dbscan.fit_predict(X_train[['lat', 'lon', 'weather_score']])


In [None]:
# from sklearn.cluster import KMeans

# # Define the parameter grid
# n_clusters_values = range(2, 10)

# def evaluate_kmeans(n_clusters: int, X: pd.DataFrame) -> Tuple[int, float]:
#     kmeans = KMeans(n_clusters=n_clusters)
#     kmeans.fit(X)
#     labels = kmeans.predict(X)
#     score = silhouette_score(X, labels)
#     return (n_clusters, score)

# # Perform parallel grid search
# results = Parallel(n_jobs=-1)(delayed(evaluate_kmeans)(n_clusters, X_train[["lat", "lon", "weather_score"]])
#                               for n_clusters in n_clusters_values)

# # Convert results to a DataFrame for easier analysis
# results_df = pd.DataFrame(results, columns=['n_clusters', 'score'])

# # Display results
# print(results_df)

In [None]:
# Cluster on position and weather score with fixed DBSCAN parameters
dbscan = DBSCAN(eps=0.9, min_samples=120).fit(X_train[['lat', 'lon', 'weather_score']])
# One label per row of X_train; -1 marks noise
labels = dbscan.labels_

In [None]:
# Count the distinct cluster labels, leaving out the noise label (-1)
num_clusters = len({label for label in labels if label != -1})

print(f"Number of clusters: {num_clusters}")

In [None]:
# Store the DBSCAN label of each row next to its features
X_train['cluster'] = labels

In [None]:
# Split the labelled rows into noise (cluster -1) and clustered points
is_noise = X_train['cluster'] == -1
noise_data = X_train[is_noise]
non_noise_data = X_train[~is_noise]

# Hover text shared by both traces
hover_template = '<b>Weather Score:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'

# Clustered points, coloured by cluster label
cluster_trace = go.Scatter3d(
    x=non_noise_data['lon'],
    y=non_noise_data['lat'],
    z=non_noise_data['weather_score'],
    mode='markers',
    marker=dict(
        size=8,
        color=non_noise_data['cluster'],
        colorscale='Viridis',
        opacity=0.8,
        colorbar=dict(title='Cluster'),
    ),
    text=non_noise_data['weather_score'],
    hovertemplate=hover_template,
)

# Noise points, drawn almost transparent in a single colour
noise_trace = go.Scatter3d(
    x=noise_data['lon'],
    y=noise_data['lat'],
    z=noise_data['weather_score'],
    mode='markers',
    marker=dict(
        size=8,
        color='purple',
        opacity=0.01,
    ),
    text=noise_data['weather_score'],
    hovertemplate=hover_template,
)

# Assemble the 3D figure: clusters first, noise second
fig = go.Figure()
fig.add_trace(cluster_trace)
fig.add_trace(noise_trace)

# Axis titles and a cube-shaped scene
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Weather Score'),
        aspectmode='cube',
    ),
)

fig.show()

# Export the interactive figure as a standalone HTML file
fig.write_html('../plots/weather_clusters.html')

In [None]:
# Calculate centroid of each cluster (mean lat, lon and weather_score per label)
cluster_centers = X_train.groupby('cluster')[['lat', 'lon', 'weather_score']].mean().reset_index()

# Leave out the noise label (-1); it is not a cluster
real_centers = cluster_centers[cluster_centers['cluster'] != -1]

# Clusters whose mean weather score is above 5 get more maintenance, those
# below -5 get less; everything in between is left alone.
# Boolean filtering keeps the columns (cluster, lat, lon, weather_score) even
# when no centroid qualifies, whereas building a frame from an empty list and
# dropping its 'Index' column raises a KeyError in that case.
increase_maintenance = real_centers[real_centers['weather_score'] > 5].reset_index(drop=True)
reduce_maintenance = real_centers[real_centers['weather_score'] < -5].reset_index(drop=True)

In [None]:
# 3D scatter plot of the equipment in the clusters selected for a maintenance change
def _maintenance_trace(points, marker_color):
    """Build the 3D scatter trace for one maintenance group in a single colour."""
    return go.Scatter3d(
        x=points['lon'],
        y=points['lat'],
        z=points['weather_score'],
        mode='markers',
        marker=dict(
            size=8,
            color=marker_color,
            colorscale='Viridis',
            opacity=0.8,
            colorbar=dict(title='Cluster'),
        ),
        text=points['weather_score'],
        hovertemplate='<b>Weather Score:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}',
    )

# Equipment rows that belong to the selected clusters
reduce_maintenance_equipment = X_train[X_train['cluster'].isin(reduce_maintenance['cluster'])]
increase_maintenance_equipment = X_train[X_train['cluster'].isin(increase_maintenance['cluster'])]

# Green: reduce maintenance, red: increase maintenance
fig = go.Figure()
fig.add_trace(_maintenance_trace(reduce_maintenance_equipment, 'green'))
fig.add_trace(_maintenance_trace(increase_maintenance_equipment, 'red'))

# Axis titles and a cube-shaped scene
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Weather Score'),
        aspectmode='cube',
    ),
)

fig.show()

# Export the interactive figure as a standalone HTML file
fig.write_html('../plots/weather_result.html')

In [None]:
# Assign colors.
# assign() returns new frames, so the filtered selections of X_train are not
# written to in place (which triggers pandas' SettingWithCopyWarning).
increase_maintenance_equipment = increase_maintenance_equipment.assign(color='red')
reduce_maintenance_equipment = reduce_maintenance_equipment.assign(color='green')

combined = pd.concat([reduce_maintenance_equipment, increase_maintenance_equipment])

# Create the plot
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': ccrs.PlateCarree()})

# Add geographical features
ax.add_feature(cfeature.BORDERS, linestyle='-')
ax.add_feature(cfeature.LAND, facecolor='white')
ax.add_feature(cfeature.OCEAN, facecolor='lightblue')
ax.add_feature(cfeature.COASTLINE, zorder=5)

# Plot data points.
# Each group carries its own legend label, so the legend stays correct even
# when one of the groups is empty (positional labels would then be attached
# to the wrong colour).
for group, color, label in (
    (reduce_maintenance_equipment, 'green', 'Reduce Maintenance'),
    (increase_maintenance_equipment, 'red', 'Increase Maintenance'),
):
    if group.empty:
        continue  # nothing to draw and no legend entry for this group
    ax.scatter(group['lon'], group['lat'], color=color, s=100, alpha=0.7,
               label=label, transform=ccrs.PlateCarree())

# Set plot title and extent
ax.set_title('Equipment Clusters in Germany')
ax.set_extent([5, 15, 47, 55], crs=ccrs.PlateCarree())

# Add legend (entries come from the scatter labels above)
ax.legend(loc='upper left')

# Save the plot as an image
plt.savefig('../plots/cluster_plot_weather.png', bbox_inches='tight')

# Show the plot
plt.show()