# Gaussian mixture model

This notebook will be investigating the proper clustering method based on the results from Feature_Selection notebook. This notebook show some potential that the dimension reduction by feature selection and PCA show some potential that this could have some potential even with visual clustering.
Therefore we will be investigating sophisticated method of clustering and it will improve based on this result.






In [3]:
import os
# Set the environment variable
os.environ['OMP_NUM_THREADS'] = '1'

import pandas as pd
from sklearn.mixture import GaussianMixture
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns


import matplotlib.pyplot as plt
import seaborn as sns
import random
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the data

In [4]:
# Read CSV files into DataFrames
df_mi_corr = pd.read_csv("../processed_data/lazy_corr_MI_10_FS.csv")
df_mi_corr_50 = pd.read_csv("../processed_data/lazy_corr_MI_50_FS.csv")
df_corr = pd.read_csv("../processed_data/lazy_corr_FS.csv")
df_mi = pd.read_csv("../processed_data/batch_corr_MI_10_FS.csv")
df = pd.read_csv("../processed_data/batch_corr_FS.csv")
df_mi_50= pd.read_csv("../processed_data/batch_corr_MI_50_FS.csv")


df_corr_scaled_mi_batch_50= pd.read_csv("../processed_data/batch_corr_batch_MI_50_FS.csv")

df_corr_scaled_nomiddle= pd.read_csv("../processed_data/batch_corr_MI_nomiddle_50_FS.csv")

df_corr_scaled_mi_nofreq_50= pd.read_csv("../processed_data/batch_corr_MI_nofreq_50_FS.csv")

First we will be investigating df_mi and df_mi_50 that had good visualization result using PCA. Following figures were the results:

<img src="Figures\MI-batch Correlation with top 10 features_2D.png" alt="sphx_glr_plot_cluster_comparison_001" width="1500"/>
<img src="Figures\MI-batch Correlation with top 10 features_3d.png" alt="sphx_glr_plot_cluster_comparison_001" width="1500"/>

Based on the analytics and the nature of the clustering method where we want to find anomalies Gaussian Mixture model will be first thing to be investigated. 

<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_001.png" alt="sphx_glr_plot_cluster_comparison_001" width="1200"/>


# Gaussian Mixture Model 

Gaussian Mixture Models (GMM) can be used for anomaly detection by identifying data points that do not fit well into any of the Gaussian distributions modeled by the GMM. These points are considered anomalies because they have a low probability of belonging to any of the clusters.



In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
import plotly.express as px

def tune_gmm(data, n_components_range=range(1, 15), criterion='AIC'):
    """
    Perform hyperparameter tuning for a Gaussian Mixture Model (GMM),
    store the results in a DataFrame, visualize the results, and return the best model.

    Parameters:
    - data: pd.DataFrame - The input data for GMM clustering.
    - n_components_range: range - The range of number of components to try.
    - criterion: str - The criterion to use for selecting the best model ('AIC' or 'BIC').

    Returns:
    - best_model: GaussianMixture - The best GMM model based on the chosen criterion.
    - results_df: pd.DataFrame - The DataFrame containing AIC and BIC values for each model.
    """
    
    # Standardize the features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(data)

    # Create lists to store the results
    aic_values = []
    bic_values = []
    n_components_list = []

    # Loop over the range of components
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components, random_state=42)
        gmm.fit(features_scaled)
        
        aic = gmm.aic(features_scaled)
        bic = gmm.bic(features_scaled)
        
        aic_values.append(aic)
        bic_values.append(bic)
        n_components_list.append(n_components)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({
        'n_components': n_components_list,
        'AIC': aic_values,
        'BIC': bic_values
    })

    # Plot AIC and BIC values
    fig = px.line(results_df, x='n_components', y=['AIC', 'BIC'], 
                  title='AIC and BIC values for different number of components',
                  labels={'value': 'Score', 'n_components': 'Number of Components'},
                  markers=True)
    fig.show()

    # Find the best model based on the chosen criterion
    if criterion == 'AIC':
        best_index = results_df['AIC'].idxmin()
    elif criterion == 'BIC':
        best_index = results_df['BIC'].idxmin()
    else:
        raise ValueError("Criterion must be either 'AIC' or 'BIC'")

    best_model = GaussianMixture(n_components=results_df.loc[best_index, 'n_components'], random_state=42)
    best_model.fit(features_scaled)

    return best_model, results_df


In [6]:
best_gmm_model, tuning_results = tune_gmm(df_mi, n_components_range=range(1, 15), criterion='AIC')
print(best_gmm_model)

GaussianMixture(n_components=14, random_state=42)


Picked best number of component with elbow method and it is seems to be 6

## Perform Cluster

Bsaed on the elbow method we will be pick number of component that seems to be well performing.

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px

def train_GMM(df, n_components=5, random_state=42, visualization_method='PCA', plot_3d=False):
    """
    Train a Gaussian Mixture Model (GMM) on the given dataframe, predict clusters, and visualize the results.

    Parameters:
    df (DataFrame): The DataFrame containing the features to cluster.
    n_components (int): The number of clusters/components for the GMM.
    random_state (int): Random state for reproducibility.
    visualization_method (str): The method for visualization ('PCA' or 'TSNE').
    plot_3d (bool): Whether to generate a 3D plot. If False, a 2D plot will be generated.

    Returns:
    DataFrame: The original DataFrame with an additional column for cluster labels.
    GaussianMixture: The fitted GMM model.
    """

    # Standardize the features
    df = df.copy()
    sensor_ids = df.index if 'Sensor ID' not in df.columns else df['Sensor ID']
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df.drop(columns=['Sensor ID']))

    # Fit a Gaussian Mixture Model
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    gmm.fit(features_scaled)

    # Predict cluster labels
    cluster_labels = gmm.predict(features_scaled)
    df['cluster'] = cluster_labels

    print("============ Distribution of Sensors in each Cluster ============")
    print(df.groupby(["cluster"])["Sensor ID"].count())

    # Evaluate the model using BIC and AIC
    bic = gmm.bic(features_scaled)
    aic = gmm.aic(features_scaled)

    print(f"BIC: {bic}")
    print(f"AIC: {aic}")

    # Visualize the clustering results using PCA or t-SNE
    if visualization_method.upper() == 'PCA':
        n_components = 3 if plot_3d else 2
        pca = PCA(n_components=n_components)
        components = pca.fit_transform(features_scaled)
        title = '3D Visualization using PCA' if plot_3d else '2D Visualization using PCA'
    elif visualization_method.upper() == 'TSNE':
        n_components = 3 if plot_3d else 2
        tsne = TSNE(n_components=n_components, random_state=random_state)
        components = tsne.fit_transform(features_scaled)
        title = '3D Visualization using t-SNE' if plot_3d else '2D Visualization using t-SNE'
    else:
        raise ValueError("visualization_method should be either 'PCA' or 'TSNE'.")

    # Create a DataFrame for the components
    components_df = pd.DataFrame(components, columns=[f'Component {i+1}' for i in range(n_components)])
    components_df['cluster'] = cluster_labels
    components_df['Sensor ID'] = sensor_ids

    # Plot the results
    if plot_3d:
        fig = px.scatter_3d(
            components_df, 
            x='Component 1', 
            y='Component 2', 
            z='Component 3', 
            color='cluster', 
            title=title,
            hover_name='Sensor ID'  # Display Sensor ID on hover
        )
    else:
        fig = px.scatter(
            components_df, 
            x='Component 1', 
            y='Component 2', 
            color='cluster', 
            title=title,
            hover_name='Sensor ID'  # Display Sensor ID on hover
        )
    
    fig.show()

    return df, gmm

# Example usage:
# df, gmm = train_GMM(df, n_components=5, random_state=42, visualization_method='PCA', plot_3d=True)


## Evaluation Methods
One of the hard thing about the clustering is how to evaluate the clustering. To do so we will leverage the nature of our dataset.

weighted average of average variability of pingtime: This will reveals the how much variability within the cluster. This does not compare each cluster's quality and having more cluster is always better but this doesn't account that either.


In [8]:
# Load the dataset
file_path = '../processed_data/all_data_v4-1-1_cleaned_sensor211.csv'
all_cleaned_df = pd.read_csv(file_path)
all_cleaned_df = all_cleaned_df.drop("Unnamed: 0",axis=1)

In [9]:
def identify_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ((series < lower_bound) | (series > upper_bound)).sum()

def average_variability_metrics(df_cluster, all_cleaned_df):
    """
    Calculate the average number of outliers and average standard deviation of ping time for each cluster,
    and the weighted averages of these values.

    Parameters:
    df_cluster (DataFrame): The DataFrame containing sensor ID and their respective clusters.
    all_cleaned_df (DataFrame): The DataFrame containing the cleaned data.

    Returns:
    DataFrame: A DataFrame with columns for cluster, average number of outliers, and average std ping time.
    float: The weighted average of the average number of outliers across clusters.
    float: The weighted average of the average standard deviation of ping time across clusters.
    """

    # List to store the results
    results = []

    # Iterate over each cluster
    for target in df_cluster["cluster"].unique():
        # Get the sensor IDs for the current cluster
        cluster_sensors = df_cluster[df_cluster["cluster"] == target]["Sensor ID"].unique()

        # Filter the all_cleaned_df for the current cluster sensors
        all_cleaned_df_target = all_cleaned_df[all_cleaned_df['Sensor ID'].isin(cluster_sensors)]

        # Group by 'Delay (us)' and 'Range (cm)', then calculate the number of outliers in 'Ping Time (us)'
        grouped_outliers = all_cleaned_df_target.groupby(['Delay (us)', 'Range (cm)'])['Ping Time (us)'].apply(identify_outliers).reset_index(name='outliers')
        
        # Calculate the average number of outliers for the current cluster
        avg_count_outliers = grouped_outliers['outliers'].mean()
        max_count_outliers = grouped_outliers['outliers'].max()

        # Group by sensor ID, delay, and range, then calculate the standard deviation of ping time
        grouped_std = all_cleaned_df_target.groupby(['Sensor ID', 'Delay (us)', 'Range (cm)']).agg(
            std_ping_time=('Ping Time (us)', 'std')
        ).reset_index()

        # Calculate the average std ping time for the current cluster
        avg_std_ping_time = grouped_std['std_ping_time'].mean()

        # Append the results to the list
        results.append({'cluster': target, 'max_count_outliers':max_count_outliers, 'avg_count_outliers': avg_count_outliers, 'avg_std_ping_time': avg_std_ping_time, 'count': len(cluster_sensors)})
    
    # Convert the results list to a DataFrame
    results_df = pd.DataFrame(results)
    
    # Calculate the weighted average of the average number of outliers

    results_df['weighted_avg_count_outliers'] =(results_df['max_count_outliers']-results_df['avg_count_outliers']) /results_df['count']
    weighted_avg_count_outliers_score = results_df['weighted_avg_count_outliers'].mean()

    # Calculate the weighted average of the average std ping time
    results_df['weighted_avg_std_ping_time'] = results_df['avg_std_ping_time'] /results_df['count']*(results_df['cluster']+1)
    weighted_avg_std_ping_time_score = results_df['weighted_avg_std_ping_time'].mean()

    return results_df, weighted_avg_count_outliers_score, weighted_avg_std_ping_time_score



In [10]:
df_mi_gmm_8,gmm_8 = train_GMM(df_mi, n_components=8, random_state=42,visualization_method='TSNE')

cluster
0    151
1     17
2     20
3      1
4      1
5      1
6      1
7     18
Name: Sensor ID, dtype: int64
BIC: 728.0219188083929
AIC: -1035.903749879713


In [11]:
results_df, weighted_avg_outliers_score, weighted_avg_std_ping_time_score = average_variability_metrics(df_mi_gmm_8,all_cleaned_df)

results_df

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,7,119,20.377778,19.304321,18,5.479012,8.579698
1,0,1128,552.355556,47.569719,151,3.812215,0.315031
2,6,23,5.4,26.583564,1,17.6,186.084948
3,2,323,53.4,305.171276,20,13.48,45.775691
4,1,140,28.577778,619.375891,17,6.554248,72.867752
5,4,19,4.644444,13.420619,1,14.355556,67.103096
6,5,22,2.622222,380.734807,1,19.377778,2284.408841
7,3,23,3.577778,16.225759,1,19.422222,64.903037


Based on this evaluation method we can do gridsearch on the n_component and also performs elbow method.

In [12]:
def search_gmm_weighted_avg(df,data, n_components_range=range(2, 20)):
    for i in n_components_range:
        # Standardize the features
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(df)

        # Fit a Gaussian Mixture Model
        gmm = GaussianMixture(n_components=i, random_state=42)
        gmm.fit(features_scaled)

        # Predict cluster labels
        cluster_labels = gmm.predict(features_scaled)
        df['cluster'] = cluster_labels
        
        results_df, weighted_avg_outliers_score, weighted_avg_std_ping_time_score = average_variability_metrics(df,data)
        #print(results_df)
        #print(f"============== {i} component(s) ================")
        print(f"Weighted average of variability score when {i} component(s): ", weighted_avg_std_ping_time_score,f" Outlier score when {i} component(s): ", weighted_avg_outliers_score)

search_gmm_weighted_avg(df_mi,all_cleaned_df)

Weighted average of variability score when 2 component(s):  9.816217937602742  Outlier score when 2 component(s):  3.9305555555555554
Weighted average of variability score when 3 component(s):  33.43590126862295  Outlier score when 3 component(s):  8.50195509097948
Weighted average of variability score when 4 component(s):  29.499380613489773  Outlier score when 4 component(s):  9.77376622450435
Weighted average of variability score when 5 component(s):  64.15166568778453  Outlier score when 5 component(s):  10.700771043771045
Weighted average of variability score when 6 component(s):  37.4106623063501  Outlier score when 6 component(s):  9.999339426006093
Weighted average of variability score when 7 component(s):  41.17142914236316  Outlier score when 7 component(s):  8.63906816525864
Weighted average of variability score when 8 component(s):  364.72501135425426  Outlier score when 8 component(s):  10.684411736811786
Weighted average of variability score when 9 component(s):  305.2390

Based on the result we will be investigating the when n component is 7 and 11.

### Visual Investigate Clustering performance

In [13]:
def visualize_lineplot_ping_time_with_variability(df, target = []):
    """
    Visualize the effect of range on ping time for each delay separately with variability.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    """
    # Group by sensor ID, delay, and range, then calculate the mean and standard deviation of ping time
    grouped_df = df.groupby(['Sensor ID', 'Delay (us)', 'Range (cm)']).agg(
        mean_ping_time=('Ping Time (us)', 'mean'),
        std_ping_time=('Ping Time (us)', 'std')
    ).reset_index()

    target_df = grouped_df[grouped_df['Sensor ID'].isin(target)]
    # Get unique delays
    unique_delays = target_df['Delay (us)'].unique()

    for delay in unique_delays:
        subset_df = target_df[target_df['Delay (us)'] == delay]

        fig = px.line(

        )

        # Adding error bars
        for sensor_id in subset_df['Sensor ID'].unique():
            sensor_data = subset_df[subset_df['Sensor ID'] == sensor_id]
            fig.add_trace(
                go.Scatter(
                    x=sensor_data['Range (cm)'],
                    y=sensor_data['mean_ping_time'],
                    mode='lines+markers',
                    name=f'Sensor {sensor_id}',
                    error_y=dict(
                        type='data',
                        array=sensor_data['std_ping_time'],
                        visible=True
                    )
                )
            )

        # Plot reference line
        ranges = np.linspace(subset_df['Range (cm)'].min(), subset_df['Range (cm)'].max(), 100)
        reference_ping_times = 57 * ranges
        fig.add_trace(
            go.Scatter(
                x=ranges,
                y=reference_ping_times,
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='Reference Line'
            )
        )

        fig.update_layout(
            xaxis_title='Range (cm)',
            yaxis_title='Mean Ping Time (us)',
            legend_title='Sensor ID',
            template='plotly_white'
        )

        fig.show()

def visualize_lineplot_ping_time_with_variability_simple(df, target=[]):
    """
    Visualize the effect of range on ping time for each delay separately with variability.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    target (list): List of target sensor IDs to visualize.
    """
    # Group by sensor ID, delay, and range, then calculate the mean and standard deviation of ping time
    grouped_df = df.groupby(['Sensor ID', 'Delay (us)', 'Range (cm)']).agg(
        mean_ping_time=('Ping Time (us)', 'mean'),
        std_ping_time=('Ping Time (us)', 'std')
    ).reset_index()

    target_df = grouped_df[grouped_df['Sensor ID'].isin(target)]
    # Get unique delays
    unique_delays = target_df['Delay (us)'].unique()

    # Define the number of columns for subplots
    num_columns = 2  # 2 columns for the grid
    num_rows = 3  # 3 rows for the grid

    # Create subplots
    fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=[f'Delay: {delay} us' for delay in unique_delays])

    for i, delay in enumerate(unique_delays):
        subset_df = target_df[target_df['Delay (us)'] == delay]

        # Get the row and column position for the subplot
        row = (i // num_columns) + 1
        col = (i % num_columns) + 1

        # Adding error bars
        for sensor_id in subset_df['Sensor ID'].unique():
            sensor_data = subset_df[subset_df['Sensor ID'] == sensor_id]
            fig.add_trace(
                go.Scatter(
                    x=sensor_data['Range (cm)'],
                    y=sensor_data['mean_ping_time'],
                    mode='lines+markers',
                    name=f'Sensor {sensor_id}',
                    error_y=dict(
                        type='data',
                        array=sensor_data['std_ping_time'],
                        visible=True
                    )
                ),
                row=row, col=col
            )

        # Plot reference line
        ranges = np.linspace(subset_df['Range (cm)'].min(), subset_df['Range (cm)'].max(), 100)
        reference_ping_times = 57 * ranges
        fig.add_trace(
            go.Scatter(
                x=ranges,
                y=reference_ping_times,
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='Reference Line'
            ),
            row=row, col=col
        )

    fig.update_layout(
        height=1500,  # Adjust height for the grid layout
        width=1500,  # Adjust width for the grid layout
        showlegend=False,
        title_text="Ping Time vs Range for Different Delays"
    )

    fig.show()

def visualize_cluster(df,cluster = 0, simple = True):
    # Load the dataset
    file_path = '../processed_data/all_data_v4-1-1_cleaned_sensor211.csv'
    all_cleaned_df = pd.read_csv(file_path)
    all_cleaned_df = all_cleaned_df.drop("Unnamed: 0",axis=1)
    
    cluster_sensors = df[df["cluster"]==cluster]["Sensor ID"].unique()
    if simple:
        visualize_lineplot_ping_time_with_variability_simple(all_cleaned_df,cluster_sensors)
    else:
        visualize_lineplot_ping_time_with_variability(all_cleaned_df,cluster_sensors)
    

## Test df_mi 

### Investigate df_mi with n_components 7 

Training the model

In [14]:
df_mi_gmm_7,gmm_7 = train_GMM(df_mi, n_components=7, random_state=42,visualization_method='TSNE')

cluster
0    56
1    98
2    13
3    25
4     1
5    16
6     1
Name: Sensor ID, dtype: int64
BIC: 2019.3547706175896
AIC: 195.1811663765693


In [15]:
search_gmm_weighted_avg(df_mi,all_cleaned_df)

Weighted average of variability score when 2 component(s):  9.816217937602742  Outlier score when 2 component(s):  3.9305555555555554
Weighted average of variability score when 3 component(s):  33.43590126862295  Outlier score when 3 component(s):  8.50195509097948
Weighted average of variability score when 4 component(s):  29.499380613489773  Outlier score when 4 component(s):  9.77376622450435
Weighted average of variability score when 5 component(s):  64.15166568778453  Outlier score when 5 component(s):  10.700771043771045
Weighted average of variability score when 6 component(s):  37.4106623063501  Outlier score when 6 component(s):  9.999339426006093
Weighted average of variability score when 7 component(s):  41.17142914236316  Outlier score when 7 component(s):  8.63906816525864
Weighted average of variability score when 8 component(s):  364.72501135425426  Outlier score when 8 component(s):  10.684411736811786
Weighted average of variability score when 9 component(s):  305.2390

In [16]:
results_df, weighted_avg_outliers_score, weighted_avg_std_ping_time_score = average_variability_metrics(df_mi_gmm_7,all_cleaned_df)

print(weighted_avg_outliers_score, weighted_avg_std_ping_time_score)
results_df

9.45483385661957 39.13950458686444


Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,5,119,16.177778,20.468185,16,6.426389,7.675569
1,2,173,54.355556,38.73182,13,9.126496,8.938112
2,1,665,130.822222,102.385542,98,5.450794,2.089501
3,0,261,70.466667,22.074261,56,3.402381,0.394183
4,3,244,44.0,498.78772,25,8.0,79.806035
5,6,19,4.644444,13.420619,1,14.355556,93.944334
6,4,23,3.577778,16.225759,1,19.422222,81.128797


#### Cluster 3: Majority cluster 



In [17]:
visualize_cluster(df_mi_gmm_7,3)

#### Cluster 0: another majority cluster

In [18]:
visualize_cluster(df_mi_gmm_7,0)

#### Cluster 2: decent sized cluster

In [19]:
visualize_cluster(df_mi_gmm_7,2)

From this visualziation it showed the potential where comparing the cluster 0 and 2's wehn delay 10000 cluster 0 seems to be cluster of over measuring and cluster 2 is under measuring 

Idea for feature engineering: we can have like a slope 


In [20]:
visualize_cluster(df_mi_gmm_7,1)

In [21]:
visualize_cluster(df_mi_gmm_7,5)

### Investigate df_mi with 11 n_components

Training the model

In [22]:
df_mi_gmm_11,gmm_11 = train_GMM(df_mi, n_components=11, random_state=42,visualization_method='TSNE')

cluster
0     58
1     87
2     13
3     13
4      1
5      1
6     16
7     18
8      1
9      1
10     1
Name: Sensor ID, dtype: int64
BIC: 2561.543730511781
AIC: -306.9274233130893


#### Cluster 3: majority cluster

In [23]:
visualize_cluster(df_mi_gmm_11,3)

#### Cluster 0: another majority cluster

In [24]:
visualize_cluster(df_mi_gmm_11,0)

#### Cluster 10: another majority cluster

In [25]:
visualize_cluster(df_mi_gmm_11,10)

## Test df_mi_50

Training the model

In [26]:
best_gmm_model, tuning_results = tune_gmm(df_mi_50, n_components_range=range(1, 15), criterion='AIC')
print(best_gmm_model)

GaussianMixture(n_components=13, random_state=42)


Test n_component 10 and 13

In [27]:
search_gmm_weighted_avg(df_mi_50,all_cleaned_df)

Weighted average of variability score when 2 component(s):  6.525658107988389  Outlier score when 2 component(s):  7.371708319809586
Weighted average of variability score when 3 component(s):  5.427907692365612  Outlier score when 3 component(s):  7.605090339080934
Weighted average of variability score when 4 component(s):  9.226755473478963  Outlier score when 4 component(s):  7.127887025745258
Weighted average of variability score when 5 component(s):  37.34016607288425  Outlier score when 5 component(s):  6.8433738777720805
Weighted average of variability score when 6 component(s):  12.901021326333996  Outlier score when 6 component(s):  6.368737725318591
Weighted average of variability score when 7 component(s):  19.673693119419834  Outlier score when 7 component(s):  6.761227376048223
Weighted average of variability score when 8 component(s):  23.14252900000961  Outlier score when 8 component(s):  7.914120036342259
Weighted average of variability score when 9 component(s):  30.738

### Investigate df_mi_50 with 13 n_components

In [28]:
df_mi_50_gmm_13,mi_50_gmm_13 = train_GMM(df_mi_50, n_components=13, random_state=42,visualization_method='TSNE',plot_3d=True)

cluster
0     12
1     16
2     24
3     10
4     21
5     42
6     17
7     17
8     14
9      1
10     3
11    22
12    11
Name: Sensor ID, dtype: int64
BIC: 32319.70231907729
AIC: -27637.034878664723


In [29]:
df_mi_50_gmm_13,mi_50_gmm_13 = train_GMM(df_mi_50, n_components=13, random_state=42,visualization_method='TSNE')

cluster
0     12
1     16
2     24
3     10
4     21
5     42
6     17
7     17
8     14
9      1
10     3
11    22
12    11
Name: Sensor ID, dtype: int64
BIC: 32319.70231907729
AIC: -27637.034878664723


In [30]:
results_df, _,_ = average_variability_metrics(df_mi_50_gmm_13,all_cleaned_df)

results_df

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,11,187,48.911111,22.425665,22,6.276768,12.232181
1,12,250,71.533333,44.642995,11,16.224242,52.759903
2,8,58,30.288889,26.009047,14,1.979365,16.720102
3,0,143,18.622222,14.504102,12,10.364815,1.208675
4,4,165,29.022222,136.252529,21,6.475132,32.441078
5,3,94,18.6,234.767292,10,7.54,93.906917
6,5,291,46.733333,71.415563,42,5.815873,10.202223
7,2,111,20.311111,59.173938,24,3.778704,7.396742
8,6,156,55.288889,11.747254,17,5.924183,4.837105
9,7,150,27.733333,161.71309,17,7.192157,76.100277


In [31]:
#visualize_cluster(df_mi_50_gmm_13,1)

From this visualization we can see they are grouped where it trouble detecting at delay 16800 range 13.

## Test df_corr_scaled_mi_batch_50

Training the model

In [32]:
best_gmm_model, tuning_results = tune_gmm(df_corr_scaled_mi_batch_50, n_components_range=range(1, 15), criterion='AIC')
print(best_gmm_model)

GaussianMixture(n_components=14, random_state=42)


In [33]:
search_gmm_weighted_avg(df_corr_scaled_mi_batch_50,all_cleaned_df)

Weighted average of variability score when 2 component(s):  5.646021899490216  Outlier score when 2 component(s):  6.8041338979633
Weighted average of variability score when 3 component(s):  6.306042138053255  Outlier score when 3 component(s):  7.143488200092352
Weighted average of variability score when 4 component(s):  6.3783610466039535  Outlier score when 4 component(s):  8.28623191802224
Weighted average of variability score when 5 component(s):  36.08830881375295  Outlier score when 5 component(s):  7.02294499062541
Weighted average of variability score when 6 component(s):  36.147365810988454  Outlier score when 6 component(s):  8.027664762550584
Weighted average of variability score when 7 component(s):  124.76944202286246  Outlier score when 7 component(s):  9.80185598277929
Weighted average of variability score when 8 component(s):  108.30502871252521  Outlier score when 8 component(s):  7.168141268034837
Weighted average of variability score when 9 component(s):  76.6863852

Test n_component 10 and 15 based on the score result

In [34]:
df_corr_scaled_mi_batch_50_gmm_10,mi_batch_gmm_10 = train_GMM(df_corr_scaled_mi_batch_50, n_components=10, random_state=42,visualization_method='TSNE')

cluster
0    17
1    15
2    20
3    30
4    16
5    32
6    20
7    39
8    20
9     1
Name: Sensor ID, dtype: int64
BIC: 21002.948950586986
AIC: -25116.84571516901


In [35]:
results_df, _,_ = average_variability_metrics(df_corr_scaled_mi_batch_50_gmm_10,all_cleaned_df)

results_df

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,6,149,22.444444,17.479789,20,6.327778,6.117926
1,2,314,44.822222,32.185802,20,13.458889,4.82787
2,0,143,32.511111,42.833711,17,6.499346,2.51963
3,8,165,24.333333,116.251328,20,7.033333,52.313098
4,5,245,41.644444,150.100719,32,6.354861,28.143885
5,4,110,29.422222,29.080173,16,5.036111,9.087554
6,3,201,34.822222,136.069175,30,5.539259,18.142557
7,7,219,35.4,41.848902,39,4.707692,8.58439
8,1,158,34.244444,636.990687,15,8.25037,84.932092
9,9,23,3.577778,16.225759,1,19.422222,162.257594


#### cluster 9: outliers

First thing that I want to look into is Cluster 9 where it has a individual score of 102.
Lower is better but this may not be always true for all since some cluster is just cluster of outliers

In [36]:
visualize_cluster(df_corr_scaled_mi_batch_50_gmm_10,9)

as we can see this cluster is compose of outlier that have outgoing on 16800 range 13

#### Cluster 5: var score 12.607090

In [37]:
visualize_cluster(df_corr_scaled_mi_batch_50_gmm_10,5)

Notice they all at range 13, 18, 33,and 38 have over measurement.

#### Cluter 7: majority cluster 

num: 31 score :19.081381

In [38]:
visualize_cluster(df_corr_scaled_mi_batch_50_gmm_10,7)

#### Cluster 0: with low score
0	44.284186	16	2.767762

In [39]:
visualize_cluster(df_corr_scaled_mi_batch_50_gmm_10,0)

more of constant over measurment but there are some noticable outliers

## Test df_corr_scaled_mi_nofreq_50

Training the model

In [40]:
best_gmm_model, tuning_results = tune_gmm(df_corr_scaled_mi_nofreq_50, n_components_range=range(1, 15), criterion='AIC')
print(best_gmm_model)

GaussianMixture(n_components=11, random_state=42)


In [41]:
search_gmm_weighted_avg(df_corr_scaled_mi_nofreq_50,all_cleaned_df)

Weighted average of variability score when 2 component(s):  6.29114368011365  Outlier score when 2 component(s):  7.426057231369092
Weighted average of variability score when 3 component(s):  13.585077059967409  Outlier score when 3 component(s):  8.050677717344383
Weighted average of variability score when 4 component(s):  11.854616483654862  Outlier score when 4 component(s):  6.969270809270809
Weighted average of variability score when 5 component(s):  16.320763792533562  Outlier score when 5 component(s):  7.2953004249763085
Weighted average of variability score when 6 component(s):  47.01838944597667  Outlier score when 6 component(s):  10.037273651788942
Weighted average of variability score when 7 component(s):  45.22727276490655  Outlier score when 7 component(s):  10.169548273077684
Weighted average of variability score when 8 component(s):  153.25778956308636  Outlier score when 8 component(s):  7.778520373596592
Weighted average of variability score when 9 component(s):  98.

Test n_component 10 and 15 based on the score result

Also test n_component = 9 and 16

### n_component = 10

In [57]:
df_nofreq_50_gmm_10,mi_nofreq_gmm_10 = train_GMM(df_corr_scaled_mi_nofreq_50, n_components=10, random_state=42, visualization_method='PCA', plot_3d=True)

cluster
0    20
1    22
2    24
3    47
4    10
5    20
6    32
7    12
8    15
9     8
Name: Sensor ID, dtype: int64
BIC: 22541.534038531354
AIC: -23578.26062722464


In [43]:
results_df_nofreq, _,_ = average_variability_metrics(df_nofreq_50_gmm_10,all_cleaned_df)

results_df_nofreq

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,8,233,49.822222,89.480497,15,12.211852,53.688298
1,7,196,65.755556,86.452427,12,10.853704,57.634952
2,5,253,60.711111,230.663512,20,9.614444,69.199054
3,0,109,30.155556,13.777298,20,3.942222,0.688865
4,3,314,67.933333,85.163121,47,5.235461,7.247925
5,2,231,23.288889,85.316253,24,8.65463,10.664532
6,6,237,78.555556,17.557155,32,4.951389,3.840628
7,1,196,52.755556,87.027326,22,6.511111,7.911575
8,4,59,10.888889,159.593444,10,4.811111,79.796722
9,9,88,10.266667,901.15145,8,9.716667,1126.439312


#### Cluter 2: majority cluster 

In [44]:
target_cluster = 2
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
5        2                 231           23.288889          85.316253     24   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
5                      8.65463                   10.664532  


#### Cluster 7: cluster of under measurement and outliers

In [45]:
target_cluster = 7
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
1        7                 196           65.755556          86.452427     12   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
1                    10.853704                   57.634952  


This cluster seems to be collection of under measurement and outliers

#### Cluster 6: over measurement and but with some outliers

In [46]:
target_cluster = 6
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
6        6                 237           78.555556          17.557155     32   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
6                     4.951389                    3.840628  


This cluster seems to be above measurement clusters

### Investigate other clusters 0,1,3,4,5,8,9

In [47]:
target_cluster = 0
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
3        0                 109           30.155556          13.777298     20   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
3                     3.942222                    0.688865  


Cluster 0 seems to be over measurements

In [48]:
target_cluster = 1
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
7        1                 196           52.755556          87.027326     22   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
7                     6.511111                    7.911575  


In [49]:
target_cluster = 3
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
4        3                 314           67.933333          85.163121     47   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
4                     5.235461                    7.247925  


In [50]:
target_cluster = 4
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
8        4                  59           10.888889         159.593444     10   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
8                     4.811111                   79.796722  


In [51]:
target_cluster = 5
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
2        5                 253           60.711111         230.663512     20   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
2                     9.614444                   69.199054  


In [52]:
target_cluster = 8
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
0        8                 233           49.822222          89.480497     15   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
0                    12.211852                   53.688298  


In [53]:
target_cluster = 9
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_10,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
9        9                  88           10.266667          901.15145      8   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
9                     9.716667                 1126.439312  


### n_component = 15

In [54]:
df_nofreq_50_gmm_15,mi_nofreq_gmm_15 = train_GMM(df_corr_scaled_mi_nofreq_50, n_components=15, random_state=42,visualization_method='TSNE')

cluster
0     16
1     23
2     18
3     30
4      9
5     12
6     15
7     11
8     16
9      7
10     3
11     1
12    15
13     1
14    33
Name: Sensor ID, dtype: int64
BIC: 41776.4378965934
AIC: -27404.92765580595


In [62]:
results_df_nofreq,_,_ = average_variability_metrics(df_nofreq_50_gmm_15,all_cleaned_df)

results_df_nofreq

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,8,233,50.644444,84.516848,16,11.397222,47.540727
1,7,250,71.533333,44.642995,11,16.224242,32.467633
2,14,265,47.511111,169.461251,33,6.590572,77.027841
3,0,105,15.4,14.935221,16,5.6,0.933451
4,3,194,34.422222,83.193282,30,5.319259,11.092438
5,2,152,20.933333,14.769418,18,7.281481,2.46157
6,12,90,14.288889,14.085522,15,5.047407,12.207452
7,6,177,39.466667,17.230979,15,9.168889,8.041124
8,1,279,58.511111,141.346355,23,9.586473,12.290987
9,10,59,5.733333,605.100547,3,17.755556,2218.702007


#### Cluter 0:low ping time score

In [63]:
target_cluster = 0
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
3        0                 105                15.4          14.935221     16   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
3                          5.6                    0.933451  


#### Cluter 14 and 3: compare aimilar count and similar ping time score

In [64]:
target_cluster = 14
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
2       14                 265           47.511111         169.461251     33   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
2                     6.590572                   77.027841  


In [65]:
target_cluster = 3
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
4        3                 194           34.422222          83.193282     30   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
4                     5.319259                   11.092438  


#### Cluter 2,6,and 5: compare majority cluster 

In [66]:
target_cluster = 6
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
7        6                 177           39.466667          17.230979     15   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
7                     9.168889                    8.041124  


In [67]:
target_cluster = 5
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

    cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
11        5                 109           39.866667          84.933148     12   

    weighted_avg_count_outliers  weighted_avg_std_ping_time  
11                     5.761111                   42.466574  


In [68]:
target_cluster = 2
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
5        2                 152           20.933333          14.769418     18   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
5                     7.281481                     2.46157  


As a result n_component 10 seems to be more promising so I will be investigating this further.

#### Investigate rest data points

In [69]:
target_cluster = 1
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
8        1                 279           58.511111         141.346355     23   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
8                     9.586473                   12.290987  


In [70]:
target_cluster = 4
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

    cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
10        4                  64           12.066667         176.329309      9   

    weighted_avg_count_outliers  weighted_avg_std_ping_time  
10                      5.77037                   97.960727  


In [71]:
target_cluster = 7
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
1        7                 250           71.533333          44.642995     11   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
1                    16.224242                   32.467633  


In [72]:
target_cluster = 8
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_15,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
0        8                 233           50.644444          84.516848     16   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
0                    11.397222                   47.540727  


### Test n_component 9

In [73]:
df_nofreq_50_gmm_9,mi_nofreq_gmm_9 = train_GMM(df_corr_scaled_mi_nofreq_50, n_components=9, random_state=42)

cluster
0    18
1    28
2    26
3    44
4    15
5    21
6    32
7    13
8    13
Name: Sensor ID, dtype: int64
BIC: 17767.974573209955
AIC: -23739.505915217378


In [74]:
results_df_nofreq,_,_ = average_variability_metrics(df_nofreq_50_gmm_9,all_cleaned_df)

results_df_nofreq

Unnamed: 0,cluster,max_count_outliers,avg_count_outliers,avg_std_ping_time,count,weighted_avg_count_outliers,weighted_avg_std_ping_time
0,8,106,41.311111,60.307581,13,4.976068,41.751402
1,7,196,59.933333,174.61196,13,10.466667,107.453514
2,5,273,47.688889,255.808665,21,10.729101,73.08819
3,0,105,30.244444,13.916496,18,4.153086,0.773139
4,3,361,78.666667,146.449398,44,6.416667,13.313582
5,2,248,21.444444,79.434252,26,8.713675,9.165491
6,6,237,78.555556,17.557155,32,4.951389,3.840628
7,1,164,61.711111,77.96119,28,3.653175,5.568656
8,4,203,26.244444,311.336431,15,11.783704,103.77881


#### Test major cluster 7 where it has low outlier score and moderate avg ping time score

In [75]:
target_cluster = 7
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_9,target_cluster)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
1        7                 196           59.933333          174.61196     13   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
1                    10.466667                  107.453514  


In [76]:
target_cluster = 0
print(results_df_nofreq[results_df_nofreq["cluster"]==target_cluster])
visualize_cluster(df_nofreq_50_gmm_9,target_cluster,simple=False)

   cluster  max_count_outliers  avg_count_outliers  avg_std_ping_time  count  \
3        0                 105           30.244444          13.916496     18   

   weighted_avg_count_outliers  weighted_avg_std_ping_time  
3                     4.153086                    0.773139  


## Visualize based on cluster

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def visualize_lineplot_ping_time_with_variability_by_cluster(df, cluster_sensors, delay=None):
    """
    Visualize the effect of range on ping time for each cluster with variability, optionally filtering by delay.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    cluster_sensors (dict): Dictionary where keys are cluster labels and values are lists of sensor IDs in each cluster.
    delay (int, optional): If specified, only data for this delay will be plotted. Otherwise, all delays are plotted.
    """
    # Group by sensor ID, range, and delay, then calculate the mean and standard deviation of ping time
    grouped_df = df.groupby(['Sensor ID', 'Range (cm)', 'Delay (us)']).agg(
        mean_ping_time=('Ping Time (us)', 'mean'),
        std_ping_time=('Ping Time (us)', 'std')
    ).reset_index()

    # Filter by the specified delay if provided
    if delay is not None:
        grouped_df = grouped_df[grouped_df['Delay (us)'] == delay]

    # Define the number of columns for subplots
    num_clusters = len(cluster_sensors)
    num_columns = 2  # Number of columns for the grid
    num_rows = (num_clusters + num_columns - 1) // num_columns  # Calculate rows based on the number of clusters

    # Create subplots
    fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=[f'Cluster {cluster}' for cluster in cluster_sensors.keys()])

    for i, (cluster, sensors) in enumerate(cluster_sensors.items()):
        cluster_df = grouped_df[grouped_df['Sensor ID'].isin(sensors)]

        # Get the row and column position for the subplot
        row = (i // num_columns) + 1
        col = (i % num_columns) + 1

        # Adding error bars
        for sensor_id in sensors:
            sensor_data = cluster_df[cluster_df['Sensor ID'] == sensor_id]
            fig.add_trace(
                go.Scatter(
                    x=sensor_data['Range (cm)'],
                    y=sensor_data['mean_ping_time'],
                    mode='lines+markers',
                    name=f'Sensor {sensor_id}',
                    error_y=dict(
                        type='data',
                        array=sensor_data['std_ping_time'],
                        visible=True
                    )
                ),
                row=row, col=col
            )

        # Plot reference line
        ranges = np.linspace(cluster_df['Range (cm)'].min(), cluster_df['Range (cm)'].max(), 100)
        reference_ping_times = 57 * ranges
        fig.add_trace(
            go.Scatter(
                x=ranges,
                y=reference_ping_times,
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='Reference Line'
            ),
            row=row, col=col
        )

    fig.update_layout(
        height=3000,  # Adjust height for the grid layout
        width=1500,  # Adjust width for the grid layout
        showlegend=False,
        title_text=f"Ping Time vs Range for Different Clusters {delay}us"
    )

    fig.show()

def visualize_cluster_delay(df, delay_pos=4):
    # Load the dataset
    file_path = '../processed_data/all_data_v4-1-1_cleaned_sensor211.csv'
    all_cleaned_df = pd.read_csv(file_path)
    all_cleaned_df = all_cleaned_df.drop("Unnamed: 0", axis=1)
    
    # Dictionary to store sensors grouped by cluster
    cluster_sensors = df.groupby("cluster")["Sensor ID"].apply(list).to_dict()

    delays = [3000,6000,8000,10000,16800]

    visualize_lineplot_ping_time_with_variability_by_cluster(all_cleaned_df, cluster_sensors, delays[delay_pos])



In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def visualize_lineplot_ping_time_with_variability_side_by_side(df, cluster_sensors, delays):
    """
    Visualize the effect of range on ping time for selected clusters and delays side-by-side with variability.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    cluster_sensors (dict): Dictionary where keys are cluster labels and values are lists of sensor IDs in each cluster.
    delays (list): List of delays to compare across clusters.
    """
    # Group by sensor ID, range, and delay, then calculate the mean and standard deviation of ping time
    grouped_df = df.groupby(['Sensor ID', 'Range (cm)', 'Delay (us)']).agg(
        mean_ping_time=('Ping Time (us)', 'mean'),
        std_ping_time=('Ping Time (us)', 'std')
    ).reset_index()

    # Determine number of rows and columns for subplots
    num_clusters = len(cluster_sensors)
    num_delays = len(delays)
    
    num_columns = num_clusters
    num_rows = num_delays
    
    # Create subplots
    fig = make_subplots(
        rows=num_rows, 
        cols=num_columns, 
        subplot_titles=[f'Cluster {cluster} - Delay {delay} us' for delay in delays for cluster in cluster_sensors.keys()]
    )

    for i, cluster in enumerate(cluster_sensors.keys()):
        sensors = cluster_sensors[cluster]
        cluster_df = grouped_df[grouped_df['Sensor ID'].isin(sensors)]
        
        for j, delay in enumerate(delays):
            delay_df = cluster_df[cluster_df['Delay (us)'] == delay]

            # Get the row and column position for the subplot
            row = j + 1
            col = i + 1

            # Adding error bars
            for sensor_id in sensors:
                sensor_data = delay_df[delay_df['Sensor ID'] == sensor_id]
                fig.add_trace(
                    go.Scatter(
                        x=sensor_data['Range (cm)'],
                        y=sensor_data['mean_ping_time'],
                        mode='lines+markers',
                        name=f'Sensor {sensor_id} (Delay {delay} us)',
                        error_y=dict(
                            type='data',
                            array=sensor_data['std_ping_time'],
                            visible=True
                        )
                    ),
                    row=row, col=col
                )

            # Plot reference line
            ranges = np.linspace(delay_df['Range (cm)'].min(), delay_df['Range (cm)'].max(), 100)
            reference_ping_times = 57 * ranges
            fig.add_trace(
                go.Scatter(
                    x=ranges,
                    y=reference_ping_times,
                    mode='lines',
                    line=dict(color='red', dash='dash'),
                    name='Reference Line'
                ),
                row=row, col=col
            )

    fig.update_layout(
        height=500 * num_rows,  # Adjust height based on the number of delays
        width=800 * num_columns,  # Adjust width based on the number of clusters
        showlegend=False,
        title_text="Side-by-Side Comparison of Clusters Across All Delays"
    )

    fig.show()

def visualize_cluster_delay_side_by_side(df, clusters_to_compare, delays=[3000, 6000, 8000, 10000, 16800]):
    """
    Compare multiple clusters across all specified delays side by side.

    Parameters:
    df (DataFrame): The DataFrame containing the data, including clustering information.
    clusters_to_compare (list): List of cluster labels to compare side-by-side.
    delays (list): List of delays to compare.
    """
    # Load the dataset
    file_path = '../processed_data/all_data_v4-1-1_cleaned_sensor211.csv'
    all_cleaned_df = pd.read_csv(file_path)
    all_cleaned_df = all_cleaned_df.drop("Unnamed: 0",axis=1)

    # Dictionary to store sensors grouped by cluster
    cluster_sensors = {cluster: df[df["cluster"] == cluster]["Sensor ID"].unique() for cluster in clusters_to_compare}

    # Visualize side-by-side comparisons for the selected clusters and delays
    visualize_lineplot_ping_time_with_variability_side_by_side(all_cleaned_df, cluster_sensors, delays)


In [61]:
visualize_cluster_delay(df_nofreq_50_gmm_15,delay_pos=0)

In [66]:
visualize_cluster_delay(df_nofreq_50_gmm_15,delay_pos=3)

In [64]:
visualize_cluster_delay(df_nofreq_50_gmm_15,delay_pos=4)

In [67]:
# Assuming df is your dataset containing the clustering information
visualize_cluster_delay_side_by_side(df_nofreq_50_gmm_10, clusters_to_compare=[0,2])


In [63]:
# Assuming df is your dataset containing the clustering information
visualize_cluster_delay_side_by_side(df_nofreq_50_gmm_10, clusters_to_compare=[4, 9])


In [65]:
# Assuming df is your dataset containing the clustering information
visualize_cluster_delay_side_by_side(df_nofreq_50_gmm_10, clusters_to_compare=[1, 2])
