In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from PIL.ImageColor import colormap
from cartopy import crs as ccrs
from cartopy import feature as cfeature
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.decomposition import PCA   
from sklearn.metrics import silhouette_score
from scipy.interpolate import griddata
from scipy.spatial import KDTree

from joblib import Parallel, delayed
from typing import Tuple

In [None]:
# Load the merged equipment table from CSV (path is relative to the notebook's working directory).
# Later cells read the columns 'lat', 'lon', 'Anzahl Meldungen', 'Equipment ID' and 'Zyklus Wartung'.
equipment = pd.read_csv('../data/merged_data.csv', delimiter=',')

In [None]:
# Print column dtypes and non-null counts to spot missing values early.
equipment.info()

In [None]:
# Exploratory 3D scatter of the raw data: longitude / latitude on x / y and the
# incident count ('Anzahl Meldungen') on z. No clustering has been run yet, so all
# markers share one colour and neither a colour scale nor a colour bar is set.
fig = go.Figure()

fig.add_trace(go.Scatter3d(
    x=equipment['lon'],
    y=equipment['lat'],
    z=equipment['Anzahl Meldungen'],
    mode='markers',
    marker=dict(
        size=8,
        color="purple",  # single constant colour
        opacity=0.8,
    ),
    text=equipment['Anzahl Meldungen'],  # Hover text
    # No "Cluster" field here: there are no cluster labels at this stage, and
    # %{marker.color} would only print the constant colour name.
    hovertemplate='<b>Incident count:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'
))

# The figure is only written to disk here; it is not shown inline.
fig.write_html('../plots/test.html')

In [None]:
# Keep only the first row seen for every 'Equipment ID'.
equipment.drop_duplicates(subset="Equipment ID", keep="first", inplace=True)

In [None]:
# Latitude distribution (50 bins); labels are set on the Axes that hist() returns.
equipment['lat'].hist(bins=50).set(xlabel="latitude", ylabel="Frequency")
plt.show()

In [None]:
# Longitude distribution (50 bins); labels are set on the Axes that hist() returns.
equipment['lon'].hist(bins=50).set(xlabel="longitude", ylabel="Frequency")
plt.show()

In [None]:
# Mean incident count, followed by the incident-count histogram (50 bins) saved as SVG.
print(equipment['Anzahl Meldungen'].mean())
equipment['Anzahl Meldungen'].hist(bins=50).set(xlabel="Incident Count", ylabel="Frequency")

plt.savefig("../plots/hist_inc.svg", format='svg')
plt.show()

In [None]:
# Histogram (50 bins) of the maintenance cycle column, saved as SVG.
equipment['Zyklus Wartung'].hist(bins=50).set(xlabel="Zyklus Wartung", ylabel="Frequency")

plt.savefig("../plots/hist_zyk.svg", format='svg')
plt.show()

In [None]:
# Give the two columns used most often below shorter, English names.
equipment.rename(
    columns={
        "Anzahl Meldungen": "incident_count",
        "Equipment ID": "Equipment",
    },
    inplace=True,
)

In [None]:
# Filled contour map of the maintenance cycle ('Zyklus Wartung') on a regular
# 50 x 50 lat/lon grid spanning the bounding box of the data.
lat_values = np.linspace(equipment['lat'].min(), equipment['lat'].max(), 50)
lon_values = np.linspace(equipment['lon'].min(), equipment['lon'].max(), 50)
lat_grid, lon_grid = np.meshgrid(lat_values, lon_values)

# griddata triangulates the sample points, so rows with a missing coordinate or a
# missing value are excluded before interpolating.
contour_points = equipment[['lat', 'lon', 'Zyklus Wartung']].dropna()

# Linear interpolation of the cycle onto the grid. Grid nodes outside the convex
# hull of the samples stay NaN and are left blank by contourf.
incident_grid = griddata(
    (contour_points['lat'], contour_points['lon']),
    contour_points['Zyklus Wartung'],
    (lat_grid, lon_grid),
    method='linear',
)

plt.contourf(lon_grid, lat_grid, incident_grid, cmap='viridis')
plt.colorbar(label='Zyklus Wartung')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# NOTE(review): the file name has no extension, so the output format is whatever
# matplotlib's default is — confirm the intended file name and format.
plt.savefig("../plots/zyklus_")

In [None]:
# Plain lon/lat scatter of all equipment locations (small markers), saved as SVG.
# The figure is 5 inches wide and 11.8/2 inches high.
plt.figure(figsize=(5,11.8/2))
plt.scatter(equipment["lon"], equipment["lat"], s=5)
plt.savefig("../plots/equipment_scatter.svg", format='svg')
plt.show()

In [None]:
# Map of all equipment locations, coloured by maintenance cycle, before any reassignment.
# `combined` is bound to the same DataFrame object as `equipment`; no copy is made.
combined = equipment

# Create the plot on a PlateCarree (plain lon/lat) map projection
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': ccrs.PlateCarree()})

# Add geographical features
ax.add_feature(cfeature.BORDERS, linestyle='-')
ax.add_feature(cfeature.LAND, facecolor='white')
ax.add_feature(cfeature.OCEAN, facecolor='lightblue')
ax.add_feature(cfeature.COASTLINE, zorder=5)

# Plot data points; the marker colour encodes 'Zyklus Wartung' (no colour bar is added here)

ax.scatter(combined['lon'], combined['lat'], c=combined['Zyklus Wartung'], s=100, alpha=0.7, transform=ccrs.PlateCarree())

# Set plot title and extent (longitude 5..15, latitude 47..55)
# NOTE(review): the title mentions clusters, but no clustering has been run at this point.
ax.set_title('Equipment Clusters in Germany')
ax.set_extent([5, 15, 47, 55], crs=ccrs.PlateCarree())

# Save the plot as an SVG image
plt.savefig('../plots/cluster_plot_zyklus_before.svg', format='svg')

# Show the plot
plt.show()

In [None]:
from scipy.stats import entropy

# Shannon entropy of the empirical distribution of a column's values
def calculate_entropy(column):
    """Return the entropy of the value frequencies found in ``column``.

    Args:
        column: pandas Series of categories or bins.

    Returns:
        The value of ``scipy.stats.entropy`` for the relative frequencies
        (natural logarithm, scipy's default base).
    """
    # Occurrences per distinct value, divided by the total number of entries
    relative_frequencies = column.value_counts().div(len(column))
    return entropy(relative_frequencies)

# Group the equipment by maintenance cycle ('Zyklus Wartung')
grouped = equipment.groupby('Zyklus Wartung')

# One record per cycle holding the entropy of its binned latitudes and longitudes
entropy_results = []

for name, group in grouped:
    # Discretize lat and lon into 10000 equal-width bins. The bins are kept in
    # local variables: writing them into `group` (a slice handed out by groupby)
    # can raise SettingWithCopyWarning, and the extra columns were never used.
    lat_bins = pd.cut(group['lat'], bins=10000)
    lon_bins = pd.cut(group['lon'], bins=10000)

    # Calculate entropy for the binned lat and lon values
    lat_entropy = calculate_entropy(lat_bins)
    lon_entropy = calculate_entropy(lon_bins)

    entropy_results.append({
        'Zyklus_Wartung': name,
        'lat_entropy': lat_entropy,
        'lon_entropy': lon_entropy
    })

# Convert results to a DataFrame for better visualization
entropy_df = pd.DataFrame(entropy_results)
print(entropy_df)

In [None]:
# Same per-cycle entropy as in the previous cell, but with a coarse 10-bin
# discretisation. `calculate_entropy` is reused from the previous cell (this cell
# already depended on it for the scipy `entropy` import), so it is not redefined.
grouped = equipment.groupby('Zyklus Wartung')

# Calculate entropy for lat and lon within each group
entropy_results = []

for name, group in grouped:
    # Discretize lat and lon into bins; kept local instead of being written into
    # the groupby slice, which can raise SettingWithCopyWarning.
    lat_bins = pd.cut(group['lat'], bins=10)
    lon_bins = pd.cut(group['lon'], bins=10)

    # Calculate entropy for the binned lat and lon values
    lat_entropy = calculate_entropy(lat_bins)
    lon_entropy = calculate_entropy(lon_bins)

    entropy_results.append({
        'Zyklus_Wartung': name,
        'lat_entropy': lat_entropy,
        'lon_entropy': lon_entropy
    })

# Stored under its own name. The later before/after comparison pairs `entropy_df`
# with a table computed from 10000 bins; overwriting `entropy_df` with 10-bin
# values here would make that comparison mix two incompatible discretisations.
entropy_df_coarse = pd.DataFrame(entropy_results)
print(entropy_df_coarse)

In [None]:
# Modelling table: identifier, coordinates, incident count and maintenance cycle.
# Working on a copy keeps `equipment` itself untouched by the steps below.
X_train = equipment[["Equipment", "lat", "lon", "incident_count", "Zyklus Wartung"]].copy()
# Drop every row that has a missing value in any of these columns.
X_train.dropna(axis=0, inplace=True)

X_train.info()

In [None]:
# Column-wise maxima, then minima, rounded to two decimals, as a quick range check.
print(np.round(X_train.max(), 2), "\n", np.round(X_train.min(), 2))

In [None]:
# Split off the 'Equipment' identifier so it is not treated as a feature
equipment_id = X_train['Equipment']
features = X_train.drop(columns=['Equipment'])
# Replace 'Zyklus Wartung' by log(1 + x) before scaling
features['Zyklus Wartung'] = np.log1p(features['Zyklus Wartung'])

In [None]:
# Histogram (4 bins) of the log1p-transformed maintenance cycle.
# The x-axis therefore shows transformed values, not the raw cycle.
features['Zyklus Wartung'].hist(bins=4)
plt.xlabel("Zyklus Wartung")
plt.ylabel("Frequency")

plt.show()

In [None]:
# Standardise the feature columns only (the identifier was split off earlier)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
# Wrap the scaler output in a DataFrame again to get the column names back
features_scaled = pd.DataFrame(features_scaled, columns=features.columns)
# Shrink the standardised maintenance cycle to 30 % of its scale
# NOTE(review): presumably to down-weight it in DBSCAN's distance — confirm the 0.3 factor.
features_scaled["Zyklus Wartung"] = features_scaled["Zyklus Wartung"] * 0.3

# Combine the scaled features with the identifier column. equipment_id gets a fresh
# index via reset_index and features_scaled was built without an explicit index,
# so the concat lines rows up by position.
X_train_scaled = pd.concat([equipment_id.reset_index(drop=True), features_scaled], axis=1)

In [None]:
# DBSCAN parameter grid:
#   eps          from 0.1 up to (excluding) 1 in steps of 0.1
#   min_samples  from 30 up to (excluding) 150 in steps of 10
eps_values = np.arange(0.1, 1, 0.1)
min_samples_values = range(30, 150, 10)

print("Number of variations to test", len(eps_values) * len(min_samples_values))

In [None]:
def evaluate_dbscan(eps: float, min_samples: int, x: pd.DataFrame) -> Tuple[float, int, float, int]:
    """Fit DBSCAN once and score the resulting labelling.

    Args:
        eps: Neighbourhood radius passed to DBSCAN.
        min_samples: Minimum neighbourhood size passed to DBSCAN.
        x: Feature matrix to cluster.

    Returns:
        ``(eps, min_samples, score, n_clusters)``. ``n_clusters`` is the number of
        clusters found, not counting the noise label -1. ``score`` is the silhouette
        score over all rows of ``x`` (noise rows enter as their own label), or -1
        when no more than two clusters were found.
    """
    labels_gs = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(x)

    # DBSCAN labels noise points with -1; noise is not a cluster, so it must not
    # count towards the cluster total (the notebook counts clusters the same way
    # after the final fit).
    n_clusters = len(set(labels_gs) - {-1})

    # Two clusters would be enough for a silhouette score, but are not useful for
    # this analysis, so more than two real clusters are required.
    if n_clusters > 2:
        score = silhouette_score(x, labels_gs)
    else:
        score = -1  # sentinel: too few clusters to be of interest

    return eps, min_samples, score, n_clusters

In [None]:
# Switch for the parallel DBSCAN grid search; while it is False this cell does nothing else.
par = False
if par: 
    # Evaluate every (eps, min_samples) combination on the scaled features,
    # one joblib task per combination, using all available cores (n_jobs=-1)
    results = Parallel(n_jobs=-1)(delayed(evaluate_dbscan)(eps, min_samples, X_train_scaled[['lat', 'lon', 'incident_count', "Zyklus Wartung"]])
                                  for eps in eps_values
                                  for min_samples in min_samples_values)
    
    # Convert results to a DataFrame for easier analysis
    results_df = pd.DataFrame(results, columns=['eps', 'min_samples', 'score', 'n_clusters'])
    
    # Display results
    print(results_df)


In [None]:
if par:   
    # Pick the parameter combination with the highest score and refit DBSCAN with it.
    # Requires `results_df` from the grid-search cell.
    best_result = results_df.loc[results_df['score'].idxmax()]
    print(best_result)
    
    # min_samples is cast back to int because values read from the result row
    # may have been upcast to float
    dbscan = DBSCAN(eps=best_result["eps"], min_samples=int(best_result["min_samples"]))
    # dbscan = DBSCAN(eps=0.3, min_samples=75)
    labels = dbscan.fit_predict(X_train_scaled[['lat', 'lon', 'incident_count', "Zyklus Wartung"]])

In [None]:
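# Disabled experiment, kept for reference only: KMeans grid search scored by silhouette.
# NOTE: it selects a "weather_score" column, which X_train as built in this notebook does not have.
# from sklearn.cluster import KMeans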
# 
# # Define the parameter grid
# n_clusters_values = range(2, 10)
# 
# def evaluate_kmeans(n_clusters: int, X: pd.DataFrame) -> Tuple[int, float]:
#     kmeans = KMeans(n_clusters=n_clusters)
#     kmeans.fit(X)
#     labels = kmeans.predict(X)
#     score = silhouette_score(X, labels)
#     return (n_clusters, score)
# 
# # Perform parallel grid search
# results = Parallel(n_jobs=-1)(delayed(evaluate_kmeans)(n_clusters, X_train[["lat", "lon", "weather_score"]])
#                               for n_clusters in n_clusters_values)
# 
# # Convert results to a DataFrame for easier analysis
# results_df = pd.DataFrame(results, columns=['n_clusters', 'score'])
# 
# # Display results
# print(results_df)

In [None]:
# Final clustering with fixed parameters (eps=0.4, min_samples=60) on the scaled features.
# NOTE(review): this cell always runs, so it replaces the `dbscan` / `labels` produced by
# the grid-search cell when `par` is True — confirm that this is intended.
dbscan = DBSCAN(eps=0.4, min_samples=60)
labels = dbscan.fit_predict(X_train_scaled[['lat', 'lon', 'incident_count', "Zyklus Wartung"]])

In [None]:
# Count the distinct DBSCAN labels, leaving out -1 (noise).
num_clusters = len({label for label in labels if label != -1})

print(f"Number of clusters: {num_clusters}")

In [None]:
# Attach the DBSCAN label of every row to the scaled table (same row order as `labels`)
X_train_scaled.loc[:, 'cluster'] = labels

# Ensure 'Equipment' is of the same type in both DataFrames
X_train['Equipment'] = X_train['Equipment'].astype(int)
X_train_scaled['Equipment'] = X_train_scaled['Equipment'].astype(int)

# Bring the labels over to the unscaled table.
# - A 'cluster' column left over from an earlier run of this cell is dropped first;
#   otherwise the merge would create 'cluster_x' / 'cluster_y' and later cells
#   that read 'cluster' would fail.
# - validate='one_to_one' makes the merge raise instead of silently multiplying
#   rows if an equipment ID is ever duplicated.
X_train = (
    X_train
    .drop(columns='cluster', errors='ignore')
    .merge(X_train_scaled[['Equipment', 'cluster']], on='Equipment', validate='one_to_one')
)

In [None]:
# Split the scaled table into DBSCAN noise (label -1) and clustered points.
# NOTE: both subsets come from X_train_scaled, so the plotted lon / lat / incident
# values are the scaled features, not the raw coordinates and counts.
noise_data = X_train_scaled[X_train_scaled['cluster'] == -1]
non_noise_data = X_train_scaled[X_train_scaled['cluster'] != -1]

# Create a 3D scatter plot using Plotly Graph Objects
fig = go.Figure()

# Add trace for non-noise clusters
fig.add_trace(go.Scatter3d(
    x=non_noise_data['lon'],
    y=non_noise_data['lat'],
    z=non_noise_data['incident_count'],
    mode='markers',
    marker=dict(
        size=8,
        color=non_noise_data['cluster'],  # Color by cluster
        colorscale='Viridis',              # Color scale
        opacity=0.8,
        colorbar=dict(title='Cluster')
    ),
    text=non_noise_data['incident_count'],  # Hover text
    hovertemplate='<b>Incident count:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x} <br><b>Cluster:</b> %{marker.color}'
))

# Add trace for noise cluster (almost transparent so it does not hide the clusters)
fig.add_trace(go.Scatter3d(
    x=noise_data['lon'],
    y=noise_data['lat'],
    z=noise_data['incident_count'],
    mode='markers',
    marker=dict(
        size=8,
        color='purple',  # Color for noise points
        opacity=0.01,
    ),
    text=noise_data['incident_count'],  # Hover text
    hovertemplate='<b>Incident count:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x} <br><b>Cluster:</b> Noise'
))

# Update layout for better visualization
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Incident count'),
        aspectmode='cube'  # Draw all three axes with the same length
    )
)

fig.show()

# Save the plot as an HTML file
fig.write_html('../plots/clusters_incidents.html')

In [None]:

# 3D scatter of the clusters with the (scaled) maintenance cycle on the z axis.
# Uses `non_noise_data` / `noise_data` from the previous cell.
fig = go.Figure()

# Add trace for non-noise clusters
fig.add_trace(go.Scatter3d(
    x=non_noise_data['lon'],
    y=non_noise_data['lat'],
    z=non_noise_data['Zyklus Wartung'],
    mode='markers',
    marker=dict(
        size=8,
        color=non_noise_data['cluster'],  # Color by cluster
        colorscale='Viridis',              # Color scale
        opacity=0.8,
        colorbar=dict(title='Cluster')
    ),
    text=non_noise_data['Zyklus Wartung'],  # Hover text
    # The hover value is the maintenance cycle, so it is labelled as such
    # (it used to be labelled "Incident count").
    hovertemplate='<b>Zyklus Wartung:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x} <br><b>Cluster:</b> %{marker.color}'
))

# Add trace for noise cluster (almost transparent so it does not hide the clusters)
fig.add_trace(go.Scatter3d(
    x=noise_data['lon'],
    y=noise_data['lat'],
    z=noise_data['Zyklus Wartung'],
    mode='markers',
    marker=dict(
        size=8,
        color='purple',  # Color for noise points
        opacity=0.01,
    ),
    text=noise_data['Zyklus Wartung'],  # Hover text
    hovertemplate='<b>Zyklus Wartung:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x} <br><b>Cluster:</b> Noise'
))

# Update layout for better visualization
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Zyklus Wartung'),
        aspectmode='cube'  # Draw all three axes with the same length
    )
)

fig.show()

# Save the plot as an HTML file
fig.write_html('../plots/clusters_zyklus.html')

In [None]:
# Collectors filled by reapply_zyklus while it is applied to X_train:
# equipment IDs whose cycle was above / below the value of their cluster.
reduce_maintenance_list = []
increase_maintenance_list = []

In [None]:
# Per-cluster summary: mean position, mean incident count and a representative
# maintenance cycle (the noise label -1 gets a row too, but is never applied).
#
# The cycle uses the *lower* median (interpolation='lower') instead of the plain
# median: for a cluster with an even number of rows the plain median can fall
# halfway between two cycles (e.g. 547.5 between 365 and 730), a value that is no
# real cycle and has no entry in the cost table used later. The lower median is
# always one of the cycles that actually occur in the cluster.
cluster_centers = X_train.groupby('cluster').agg({
    'lat': 'mean',
    'lon': 'mean',
    'incident_count': 'mean',
    'Zyklus Wartung': lambda cycles: cycles.quantile(0.5, interpolation='lower'),
}).reset_index()
def reapply_zyklus(row):
    """Replace a row's maintenance cycle by the representative cycle of its cluster.

    Meant to be used with ``DataFrame.apply(..., axis=1)``. Reads the module-level
    ``cluster_centers`` table and appends to the module-level lists
    ``reduce_maintenance_list`` / ``increase_maintenance_list``.

    Args:
        row: One row with at least 'cluster', 'Zyklus Wartung' and 'Equipment'.

    Returns:
        The row unchanged for noise (cluster -1) or when the cycle already matches
        the cluster value; otherwise a copy of the row with 'Zyklus Wartung' set
        to the cluster value.
    """
    # int() also works when the value is a plain Python number (a scalar has no
    # reliable .astype method).
    cluster = int(row['cluster'])
    if cluster == -1:
        return row

    cluster_value = cluster_centers.loc[
        cluster_centers['cluster'] == cluster, 'Zyklus Wartung'
    ].iloc[0]

    if row['Zyklus Wartung'] == cluster_value:
        return row

    # NOTE(review): a row whose cycle is LONGER than the cluster value is filed
    # under "reduce" — i.e. the cycle length is reduced. Confirm this matches the
    # meaning of the 'Reduce Maintenance' label used in the plots.
    if row['Zyklus Wartung'] > cluster_value:
        reduce_maintenance_list.append(row['Equipment'])
    else:
        increase_maintenance_list.append(row['Equipment'])

    # Work on a copy: pandas documents that functions passed to apply() should
    # not mutate the object they are handed.
    updated_row = row.copy()
    updated_row['Zyklus Wartung'] = cluster_value
    return updated_row

In [None]:
# Apply the cluster cycle row by row. As a side effect the call fills
# reduce_maintenance_list and increase_maintenance_list.
# NOTE: X_train is overwritten, so from here on it holds the reassigned cycles.
X_train = X_train.apply(reapply_zyklus, axis=1)

In [None]:
# Rows (already carrying their new cycle) of the equipment collected in reduce_maintenance_list
print(X_train[X_train['Equipment'].isin(reduce_maintenance_list)])

In [None]:
# 3D scatter (lon, lat, incident count) of the equipment whose cycle was changed
fig = go.Figure()

# Copies, not views: a later cell adds a column to these two frames.
reduce_maintenance_equipment = X_train[X_train['Equipment'].isin(reduce_maintenance_list)].copy()
increase_maintenance_equipment = X_train[X_train['Equipment'].isin(increase_maintenance_list)].copy()

# Each trace has one constant colour, so no colour scale / colour bar is set;
# the trace name identifies the group in the legend and in the hover box.
fig.add_trace(go.Scatter3d(
    x=reduce_maintenance_equipment['lon'],
    y=reduce_maintenance_equipment['lat'],
    z=reduce_maintenance_equipment['incident_count'],
    mode='markers',
    name='Reduce Maintenance',
    marker=dict(
        size=8,
        color='green',
        opacity=0.8,
    ),
    text=reduce_maintenance_equipment['incident_count'],  # Hover text
    hovertemplate='<b>Incident count:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'
))

fig.add_trace(go.Scatter3d(
    x=increase_maintenance_equipment['lon'],
    y=increase_maintenance_equipment['lat'],
    z=increase_maintenance_equipment['incident_count'],
    mode='markers',
    name='Increase Maintenance',
    marker=dict(
        size=8,
        color='red',
        opacity=0.8,
    ),
    text=increase_maintenance_equipment['incident_count'],  # Hover text
    hovertemplate='<b>Incident count:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'
))


# Update layout for better visualization
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Incident count'),
        aspectmode='cube'  # Draw all three axes with the same length
    )
)

fig.show()

# Save the plot as an HTML file
fig.write_html('../plots/result_incidents.html')

In [None]:
# 3D scatter (lon, lat, new maintenance cycle) of the equipment whose cycle was changed.
# Uses reduce_maintenance_equipment / increase_maintenance_equipment from the previous cell.
fig = go.Figure()

# Each trace has one constant colour, so no colour scale / colour bar is set;
# the trace name identifies the group in the legend and in the hover box.
fig.add_trace(go.Scatter3d(
    x=reduce_maintenance_equipment['lon'],
    y=reduce_maintenance_equipment['lat'],
    z=reduce_maintenance_equipment['Zyklus Wartung'],
    mode='markers',
    name='Reduce Maintenance',
    marker=dict(
        size=8,
        color='green',
        opacity=0.8,
    ),
    text=reduce_maintenance_equipment['Zyklus Wartung'],  # Hover text
    hovertemplate='<b>Zyklus Wartung:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'
))

fig.add_trace(go.Scatter3d(
    x=increase_maintenance_equipment['lon'],
    y=increase_maintenance_equipment['lat'],
    z=increase_maintenance_equipment['Zyklus Wartung'],
    mode='markers',
    name='Increase Maintenance',
    marker=dict(
        size=8,
        color='red',
        opacity=0.8,
    ),
    text=increase_maintenance_equipment['Zyklus Wartung'],  # Hover text
    hovertemplate='<b>Zyklus Wartung:</b> %{text}<br><b>Lat:</b> %{y}<br><b>Lon:</b> %{x}'
))


# Update layout for better visualization
fig.update_layout(
    title='3D Scatter Plot of Equipment Clusters in Germany',
    scene=dict(
        xaxis=dict(title='Longitude'),
        yaxis=dict(title='Latitude'),
        zaxis=dict(title='Zyklus Wartung'),
        aspectmode='cube'  # Draw all three axes with the same length
    )
)

fig.show()

# Save the plot as an HTML file
fig.write_html('../plots/result_zyklus.html')

In [None]:
# Assign colors. assign() returns new frames, which avoids writing into frames
# that were produced by boolean filtering (SettingWithCopyWarning).
increase_maintenance_equipment = increase_maintenance_equipment.assign(color='red')
reduce_maintenance_equipment = reduce_maintenance_equipment.assign(color='green')

combined = pd.concat([reduce_maintenance_equipment, increase_maintenance_equipment])

# Legend text per colour. Binding the label to the colour (instead of passing a
# positional list to legend()) keeps the legend correct even when one of the
# two groups is empty.
legend_labels = {'green': 'Reduce Maintenance', 'red': 'Increase Maintenance'}

# Create the plot
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': ccrs.PlateCarree()})

# Add geographical features
ax.add_feature(cfeature.BORDERS, linestyle='-')
ax.add_feature(cfeature.LAND, facecolor='white')
ax.add_feature(cfeature.OCEAN, facecolor='lightblue')
ax.add_feature(cfeature.COASTLINE, zorder=5)

# Plot data points, one scatter call per colour group
for color, group in combined.groupby('color'):
    print(f"Plotting color: {color} with {len(group)} points")  # Debugging statement
    ax.scatter(group['lon'], group['lat'], color=color, s=100, alpha=0.7,
               transform=ccrs.PlateCarree(), label=legend_labels[color])

# Set plot title and extent
ax.set_title('Equipment Clusters in Germany')
ax.set_extent([5, 15, 47, 55], crs=ccrs.PlateCarree())

# Add legend (entries come from the `label` of each scatter call)
ax.legend(loc='upper left')

# Save the plot as an image
plt.savefig('../plots/cluster_plot_incidents.svg', bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

# Map of the equipment coloured by its maintenance cycle after the reassignment.
# `combined` is bound to the same DataFrame object as X_train; no copy is made.
combined = X_train


# Create the plot on a PlateCarree (plain lon/lat) map projection
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': ccrs.PlateCarree()})

# Add geographical features
ax.add_feature(cfeature.BORDERS, linestyle='-')
ax.add_feature(cfeature.LAND, facecolor='white')
ax.add_feature(cfeature.OCEAN, facecolor='lightblue')
ax.add_feature(cfeature.COASTLINE, zorder=5)

# Normalize the color scale to the range of 'Zyklus Wartung'
norm = Normalize(vmin=combined['Zyklus Wartung'].min(), vmax=combined['Zyklus Wartung'].max())

# Plot data points
scatter = ax.scatter(combined['lon'], combined['lat'], c=combined['Zyklus Wartung'], cmap='viridis', s=100, alpha=0.7, transform=ccrs.PlateCarree(), norm=norm)

# Set plot title and extent (longitude 5..15, latitude 47..55)
ax.set_title('Equipment Clusters in Germany')
ax.set_extent([5, 15, 47, 55], crs=ccrs.PlateCarree())

# Create a color bar from a separate ScalarMappable that shares the scatter's norm and colormap
cbar = plt.colorbar(ScalarMappable(norm=norm, cmap='viridis'), ax=ax, orientation='vertical', pad=0.02, aspect=50)
cbar.set_label('Zyklus Wartung')

# Save the plot as an SVG image
plt.savefig('../plots/cluster_plot_zyklus_after.svg', format='svg')

# Show the plot
plt.show()

In [None]:
# Locations of the equipment whose maintenance cycle equals 1460 after the reassignment
plt.figure(figsize=(5,11.8/2))
filtered = X_train[X_train["Zyklus Wartung"] == 1460]
plt.scatter(filtered["lon"], filtered["lat"], s=5)
plt.show()

In [None]:
# Per-cycle entropy of the binned coordinates after the reassignment.
# `calculate_entropy` is reused from the earlier entropy cell (this cell already
# depended on that cell for the scipy `entropy` import), so it is not redefined.
grouped = X_train.groupby('Zyklus Wartung')

# Calculate entropy for lat and lon within each group
entropy_results = []

for name, group in grouped:
    # Discretize lat and lon into 10000 equal-width bins; kept local instead of
    # being written into the groupby slice, which can raise SettingWithCopyWarning.
    lat_bins = pd.cut(group['lat'], bins=10000)
    lon_bins = pd.cut(group['lon'], bins=10000)

    # Calculate entropy for the binned lat and lon values
    lat_entropy = calculate_entropy(lat_bins)
    lon_entropy = calculate_entropy(lon_bins)

    entropy_results.append({
        'Zyklus_Wartung': name,
        'lat_entropy': lat_entropy,
        'lon_entropy': lon_entropy
    })

# Convert results to a DataFrame for better visualization
entropy_df_after = pd.DataFrame(entropy_results)
print(entropy_df_after)

In [None]:
# Print the "before" entropy table again to read it next to the "after" table above
print(entropy_df)

In [None]:
# Difference (after - before) of the per-cycle entropies. Both tables are indexed
# by 'Zyklus_Wartung' first, so rows are matched by cycle value rather than by
# position and the cycle column itself is not subtracted. A cycle that exists in
# only one of the tables shows up as a NaN row.
entropy_diff = (
    entropy_df_after.set_index('Zyklus_Wartung')
    .sub(entropy_df.set_index('Zyklus_Wartung'))
)

# Display the result
print(entropy_diff)

In [None]:
from scipy.stats import ttest_rel

# Requires entropy_df (before) and entropy_df_after from the cells above.

# A paired test needs one before / after pair per maintenance cycle. The tables
# are therefore indexed by cycle and cut down to the cycles present in both;
# pairing by row position would mix up cycles (or fail on unequal lengths) as
# soon as the reassignment removes or adds a cycle value.
entropy_before = entropy_df.set_index('Zyklus_Wartung')
entropy_after = entropy_df_after.set_index('Zyklus_Wartung')
shared_cycles = entropy_before.index.intersection(entropy_after.index)
entropy_before = entropy_before.loc[shared_cycles]
entropy_after = entropy_after.loc[shared_cycles]

# Calculate the differences (after - before) per shared cycle
entropy_diff = entropy_after.sub(entropy_before)

# Perform paired t-test for lat_entropy and lon_entropy
lat_ttest = ttest_rel(entropy_after['lat_entropy'], entropy_before['lat_entropy'])
lon_ttest = ttest_rel(entropy_after['lon_entropy'], entropy_before['lon_entropy'])

print(f'Paired t-test for lat_entropy: t-statistic = {lat_ttest.statistic}, p-value = {lat_ttest.pvalue}')
print(f'Paired t-test for lon_entropy: t-statistic = {lon_ttest.statistic}, p-value = {lon_ttest.pvalue}')

In [None]:
# Descriptive statistics of the maintenance cycles before the reassignment
# NOTE(review): `equipment` was not filtered with dropna like X_train, so the
# "before" and "after" figures may be computed over different sets of rows.
mean_before = equipment['Zyklus Wartung'].mean()
median_before = equipment['Zyklus Wartung'].median()
std_before = equipment['Zyklus Wartung'].std()

print("Bisherige Wartungszyklen:")
print(f"Mittelwert: {mean_before}")
print(f"Median: {median_before}")
print(f"Standardabweichung: {std_before}")

# Descriptive statistics of the maintenance cycles after the reassignment
mean_after = X_train['Zyklus Wartung'].mean()
median_after = X_train['Zyklus Wartung'].median()
std_after = X_train['Zyklus Wartung'].std()

print("\nNeue Wartungszyklen:")
print(f"Mittelwert: {mean_after}")
print(f"Median: {median_after}")
print(f"Standardabweichung: {std_after}")

# Comparison of the statistics (after minus before)
print("\nVergleich der Wartungszyklen:")
print(f"Änderung des Mittelwerts: {mean_after - mean_before}")
print(f"Änderung des Medians: {median_after - median_before}")
print(f"Änderung der Standardabweichung: {std_after - std_before}")


In [None]:
# Number of equipment rows per maintenance cycle before the reassignment, ordered by cycle.
# The cycle values are converted to strings so the bars are drawn as evenly spaced categories.
zyklen = equipment["Zyklus Wartung"].value_counts().sort_index()
print(zyklen)
plt.bar(zyklen.index.astype(str), zyklen.values, align='center')
plt.show()

In [None]:
# Number of equipment rows per maintenance cycle after the reassignment, ordered by cycle.
# The cycle values are converted to strings so the bars are drawn as evenly spaced categories.
zyklen = X_train["Zyklus Wartung"].value_counts().sort_index()
print(zyklen)
plt.bar(zyklen.index.astype(str), zyklen.values, align='center')
plt.show()

In [None]:
# Relative cost weight per maintenance cycle: every doubling of the cycle halves the weight.
costs = {
    180: 8,
    365: 4,
    730: 2,
    1460: 1
}

# Cost per equipment after the reassignment.
# NOTE: a cycle that is not a key of `costs` maps to NaN, and sum() skips NaN,
# so such rows silently drop out of the total.
X_train["cost"] = X_train["Zyklus Wartung"].map(costs)

total_cost = X_train["cost"].sum()
print(total_cost)

# Cost per equipment before the reassignment.
# NOTE(review): `equipment` was not filtered with dropna like X_train, so the two
# totals may cover different sets of rows — confirm before comparing them.
equipment["cost"] = equipment["Zyklus Wartung"].map(costs)

total_cost_before = equipment["cost"].sum()
print(total_cost_before)

In [None]:
# Function to calculate nearest neighbor distances
def nearest_neighbor_distances(data):
    """Return, for every point, the Euclidean distance to its nearest other point.

    Args:
        data: Array-like of shape (n_points, n_dims).

    Returns:
        1-D array with one distance per point. With fewer than two points there
        is no neighbour to measure against and an empty array is returned
        (a k=2 query on a single point would report ``inf``, which breaks the
        mean / histogram computed from the result).
    """
    if len(data) < 2:
        return np.empty(0)

    tree = KDTree(data)
    # k=2 because the closest hit of each point is the point itself (distance 0)
    distances, _ = tree.query(data, k=2)
    return distances[:, 1]  # second column: distance to the nearest other point

# Nearest-neighbour analysis on the data before the reassignment.
# `new_eq` is bound to the same DataFrame object as `equipment`; no copy is made.
new_eq = equipment

# Group by 'Zyklus Wartung'
grouped = new_eq.groupby('Zyklus Wartung')

# Calculate nearest neighbor distances for each group
# NOTE: distances are computed directly on the (lat, lon) values, i.e. in the
# units of those columns, not in kilometres.
nn_results = []

for name, group in grouped:
    data = group[['lat', 'lon']].values
    nn_distances = nearest_neighbor_distances(data)
    
    # Keep the raw distances (for the histogram below) plus summary statistics
    nn_results.append({
        'Zyklus Wartung': name,
        'nn_distances': nn_distances,
        'mean_nn_distance': np.mean(nn_distances),
        'median_nn_distance': np.median(nn_distances),
        'std_nn_distance': np.std(nn_distances)
    })

# Convert results to a DataFrame for better visualization
nn_df = pd.DataFrame(nn_results)
print(nn_df)

# Plotting the nearest neighbor distances: one semi-transparent histogram per cycle, overlaid
for result in nn_results:
    plt.hist(result['nn_distances'], bins=10, alpha=0.5, label=f"Zyklus Wartung {result['Zyklus Wartung']}")
plt.xlabel('Nearest Neighbor Distance')
plt.ylabel('Frequency')
plt.title('Distribution of Nearest Neighbor Distances')
plt.legend()
plt.show()

In [None]:
# Nearest-neighbour analysis on the data after the reassignment.
# `nearest_neighbor_distances` is reused from the previous cell; defining it a
# second time here would silently shadow that definition.
# `new_eq` is bound to the same DataFrame object as X_train; no copy is made.
new_eq = X_train

# Group by 'Zyklus Wartung'
grouped = new_eq.groupby('Zyklus Wartung')

# Calculate nearest neighbor distances for each group
# NOTE: distances are computed directly on the (lat, lon) values, i.e. in the
# units of those columns, not in kilometres.
nn_results = []

for name, group in grouped:
    data = group[['lat', 'lon']].values
    nn_distances = nearest_neighbor_distances(data)

    # Keep the raw distances (for the histogram below) plus summary statistics
    nn_results.append({
        'Zyklus Wartung': name,
        'nn_distances': nn_distances,
        'mean_nn_distance': np.mean(nn_distances),
        'median_nn_distance': np.median(nn_distances),
        'std_nn_distance': np.std(nn_distances)
    })

# Convert results to a DataFrame for better visualization
nn_df = pd.DataFrame(nn_results)
print(nn_df)

# Plotting the nearest neighbor distances: one semi-transparent histogram per cycle, overlaid
for result in nn_results:
    plt.hist(result['nn_distances'], bins=10, alpha=0.5, label=f"Zyklus Wartung {result['Zyklus Wartung']}")
plt.xlabel('Nearest Neighbor Distance')
plt.ylabel('Frequency')
plt.title('Distribution of Nearest Neighbor Distances')
plt.legend()
plt.show()