In [282]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import plotly.express as px

In [283]:
# Merge the data

def get_all_files_in_directory(root_directory):
    """
    Collect the path of every file found under a directory tree.

    Parameters:
    root_directory (str): Directory that is walked recursively with os.walk.

    Returns:
    list of str: One path per file, built by joining the directory that
    contains the file with the file name, in os.walk traversal order.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(root_directory)
        for filename in filenames
    ]


def merge_csv_files(file_paths):
    """
    Read several CSV files and stack their rows into one DataFrame.

    Parameters:
    file_paths (list of str): Paths of the CSV files, each read with pd.read_csv.

    Returns:
    DataFrame: Rows of every file, concatenated in the order the paths were
    given, with a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(csv_path) for csv_path in file_paths]

    # ignore_index=True discards each file's own row labels.
    return pd.concat(frames, ignore_index=True)

In [284]:
def identify_and_remove_outliers(df, column):
    """
    Split a DataFrame using 1.5 * IQR fences computed on one column.

    Parameters:
    df (DataFrame): Rows to split.
    column (str): Name of the numeric column whose values are tested.

    Returns:
    tuple of DataFrame: (rows inside the fences, rows at or below the lower
    fence, rows at or above the upper fence).

    All three comparisons are inclusive, so a row whose value sits exactly on
    a fence is returned both as an inlier and as an outlier. When the IQR is 0
    both fences collapse onto the quartile value and every row equal to it
    appears in all three frames.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    fence_width = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    inside_fences = values.ge(lower_bound) & values.le(upper_bound)
    return df[inside_fences], df[values.le(lower_bound)], df[values.ge(upper_bound)]

def split_quartiles(df):
    """
    Apply the IQR-fence split to 'Ping Time (us)' separately for every
    ('Sensor ID', 'Delay (us)', 'Range (cm)') group and stack the results.

    Parameters:
    df (DataFrame): Measurements containing the three grouping columns and
        'Ping Time (us)'. The frame is copied before grouping.

    Returns:
    tuple of DataFrame: (df_middle_quartile, df_lower_quartile,
    df_upper_quartile) - the per-group inlier, lower-outlier and upper-outlier
    rows returned by identify_and_remove_outliers, concatenated in group
    order. Despite the names, these are fence-based splits, not quartiles.
    """
    per_group = df.copy().groupby(['Sensor ID', 'Delay (us)', 'Range (cm)'])

    # One (inside, below, above) triple per group.
    splits = [
        identify_and_remove_outliers(group_rows.copy(), 'Ping Time (us)')
        for _, group_rows in per_group
    ]

    # Stack each position of the triples back into a single DataFrame.
    df_middle_quartile = pd.concat([inside for inside, _, _ in splits])
    df_lower_quartile = pd.concat([below for _, below, _ in splits])
    df_upper_quartile = pd.concat([above for _, _, above in splits])

    return df_middle_quartile, df_lower_quartile, df_upper_quartile


def create_range_delay_feature(df_quartile,bound):
    """
    Build one wide row per sensor holding the mean ping time of every
    range/delay combination.

    Parameters:
    df_quartile (DataFrame): Rows with 'Sensor ID', 'Range (cm)',
        'Delay (us)' and 'Ping Time (us)' columns.
    bound (str): Suffix appended to every feature name.

    Returns:
    DataFrame: 'Sensor ID' plus one column per observed combination, named
    '<range>_<delay>_mean_<bound>'. A combination a sensor has no rows for
    is NaN in that sensor's row.
    """
    group_keys = ['Sensor ID', 'Range (cm)', 'Delay (us)']
    mean_ping = df_quartile.groupby(group_keys)['Ping Time (us)'].mean().reset_index()

    # Feature label, e.g. "13_3000_mean_middle".
    label_suffix = '_' + 'mean' + '_' + bound
    mean_ping['range_delay'] = (
        mean_ping['Range (cm)'].astype(str)
        + '_'
        + mean_ping['Delay (us)'].astype(str)
        + label_suffix
    )

    # Long -> wide: one row per sensor, one column per label.
    wide = mean_ping.pivot(index='Sensor ID', columns='range_delay', values='Ping Time (us)')
    return wide.reset_index()

In [285]:
def feature_engineering_quartile_means(df):
    """
    Turn raw ping measurements into the fixed per-sensor feature table.

    The input is restricted to ranges 13/18/23 cm and delays
    16800/10000/8000/6000/3000 us, split per group with split_quartiles, and
    the mean ping time of the inlier ("middle") and upper-outlier ("upper")
    rows is pivoted to one row per sensor.

    Parameters:
    df (DataFrame): Measurements with 'Sensor ID', 'Range (cm)',
        'Delay (us)' and 'Ping Time (us)' columns. Not modified.

    Returns:
    DataFrame: One row per sensor with exactly the columns of `column_list`,
    in that order. Missing means are 0; an expected feature column that the
    input never produced is added as all zeros instead of raising KeyError.
    """
    ranges_to_keep = [13, 18, 23]  # this is necessary features.
    delays_to_keep = [16800, 10000, 8000, 6000, 3000]
    df_selected = df[df["Range (cm)"].isin(ranges_to_keep)]
    df_selected = df_selected[df_selected["Delay (us)"].isin(delays_to_keep)]

    df_middle_quartile, _, df_upper_quartile = split_quartiles(df_selected)
    df_range_delay_middle = create_range_delay_feature(df_middle_quartile, "middle")
    df_range_delay_upper = create_range_delay_feature(df_upper_quartile, "upper")

    # NOTE(review): this is an inner join, so a sensor with no upper-fence rows
    # in any group is dropped from the result - confirm that this is intended.
    df_range_delay_all = df_range_delay_middle.merge(df_range_delay_upper, on='Sensor ID')

    # A sensor without rows for a given combination gets 0 for that feature.
    df_range_delay_all = df_range_delay_all.fillna(0)

    # Output columns, in this exact order.
    column_list = [
        '23_6000_mean_middle',
        '23_16800_mean_middle',
        '18_3000_mean_middle',
        '18_16800_mean_middle',
        '23_10000_mean_middle',
        '13_6000_mean_middle',
        '18_6000_mean_middle',
        '13_3000_mean_middle',
        '18_8000_mean_middle',
        '13_10000_mean_middle',
        'Sensor ID',
        '13_16800_mean_upper'
    ]

    # reindex selects and orders in one step; fill_value keeps the NaN -> 0
    # policy for a feature column that no sensor produced.
    return df_range_delay_all.reindex(columns=column_list, fill_value=0.0)
    


In [286]:
from joblib import load

def predict_KMeans(
    df,
    scaler_path="best_models/final/scaler_final.joblib",
    model_path="best_models/final/kmeans_model_final.joblib",
):
    """
    Assign each row to a cluster using the saved scaler and KMeans model.

    Parameters:
    df (DataFrame): Feature table. Every column except 'Sensor ID' is passed
        to the scaler. If 'Sensor ID' is not a column, the index is used as
        the sensor id. The caller's frame is not modified.
    scaler_path (str): joblib file holding the fitted scaler.
    model_path (str): joblib file holding the fitted KMeans model.

    Returns:
    DataFrame: The 'Sensor ID' and 'cluster' columns, with the input's index.

    Note: joblib.load unpickles the files, so only load artifacts you trust.
    """
    df = df.copy()

    # Make the index fallback real: without this, dropping 'Sensor ID' below
    # raises KeyError for frames that carry the sensor id in the index.
    if 'Sensor ID' not in df.columns:
        df.insert(0, 'Sensor ID', df.index)

    # Standardize the features
    scaler = load(scaler_path)
    features_scaled = scaler.transform(df.drop(columns=['Sensor ID']))

    kmeans = load(model_path)
    print("Loaded pre-trained KMeans model.")

    # Predict cluster labels
    df['cluster'] = kmeans.predict(features_scaled)

    # Keep only the identifier and the label, in the frame's column order.
    column_list = ['Sensor ID', 'cluster']
    return df[[col for col in df.columns if col in column_list]]

In [274]:
# Load the measurement CSV from disk.
df_data_v4_1_1 = pd.read_csv("../processed_data/all_data_v4-1-1_cleaned_sensor211.csv")

# Build the per-sensor feature table from the measurements.
df_range_delay_all = feature_engineering_quartile_means(df_data_v4_1_1)



In [275]:
# Display the engineered feature table.
df_range_delay_all

range_delay,23_6000_mean_middle,23_16800_mean_middle,18_3000_mean_middle,18_16800_mean_middle,23_10000_mean_middle,13_6000_mean_middle,18_6000_mean_middle,13_3000_mean_middle,18_8000_mean_middle,13_10000_mean_middle,Sensor ID,13_16800_mean_upper
0,1328.200000,1302.076923,1069.240000,1069.285714,1303.000000,827.000000,1069.400000,827.000000,1069.612245,827.000000,1,827.195122
1,1235.000000,1247.560000,967.260870,954.200000,1235.000000,679.000000,954.520000,679.000000,955.560000,679.640000,2,679.869565
2,1296.511111,1296.674419,1007.000000,1020.920000,1297.212766,717.240000,1005.960000,718.040000,1006.069767,717.880000,3,742.333333
3,1318.652174,1319.000000,1028.840000,1031.000000,1317.640000,736.920000,1087.000000,737.160000,1063.240000,737.080000,4,0.000000
4,1332.920000,1332.760000,1047.244898,1045.697674,1333.480000,743.000000,1046.288889,741.880000,1047.240000,743.000000,5,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
203,1307.000000,1307.170213,1025.960000,1026.760000,1307.000000,709.480000,1033.000000,709.122449,1033.320000,709.560000,207,0.000000
204,1304.920000,1296.444444,1009.720000,1009.883721,1296.727273,720.520000,1009.800000,720.120000,1009.640000,719.897959,208,0.000000
205,1335.833333,1345.080000,1064.440000,1053.875000,1334.707317,759.408163,1069.400000,759.000000,1078.500000,759.000000,209,815.000000
206,1303.750000,1309.000000,1018.520000,1018.744681,1303.000000,723.000000,1018.744681,723.000000,1018.755102,722.200000,210,0.000000


In [276]:
# Save the feature table. The row index is written as the first CSV column,
# so read it back with index_col=0 to avoid an extra "Unnamed: 0" column.
df_range_delay_all.to_csv("../processed_data/batch_corr_MI_10_custom_FS_revised.csv")

In [277]:
# Draw 100 random rows (unseeded); `random_rows` is not used again in this cell.
random_rows = df_range_delay_all.sample(n=100)
# NOTE(review): despite its name, this variable holds the row for Sensor ID 1.
sesnor139_cluster5 = df_range_delay_all[df_range_delay_all['Sensor ID'] == 1]

# Assign that sensor to a cluster with the saved scaler and KMeans model.
predict_KMeans(sesnor139_cluster5)

Loaded pre-trained KMeans model.


range_delay,Sensor ID,cluster
0,1,0


In [278]:
# Show the feature row selected for Sensor ID 1.
sesnor139_cluster5

range_delay,23_6000_mean_middle,23_16800_mean_middle,18_3000_mean_middle,18_16800_mean_middle,23_10000_mean_middle,13_6000_mean_middle,18_6000_mean_middle,13_3000_mean_middle,18_8000_mean_middle,13_10000_mean_middle,Sensor ID,13_16800_mean_upper
0,1328.2,1302.076923,1069.24,1069.285714,1303.0,827.0,1069.4,827.0,1069.612245,827.0,1,827.195122


In [279]:
# Load a previously saved clustering table and show its row for Sensor ID 1,
# for comparison with the freshly computed features and predicted cluster.
df_result = pd.read_csv("best_models/final/df_custom_cluster_16_final.csv")
df_result[df_result['Sensor ID'] == 1]


Unnamed: 0.1,Unnamed: 0,23_6000_mean_middle,23_16800_mean_middle,18_3000_mean_middle,18_16800_mean_middle,23_10000_mean_middle,13_6000_mean_middle,18_6000_mean_middle,13_3000_mean_middle,18_8000_mean_middle,13_10000_mean_middle,Sensor ID,13_16800_mean_upper,cluster
0,0,1328.2,1302.076923,1069.24,1069.285714,1303.0,827.0,1069.4,827.0,1069.612245,827.0,1,827.195122,0


In [281]:
# Load the non-revised feature export (first CSV column is the saved index)
# and show its row for Sensor ID 1.
# NOTE(review): in the recorded output, 13_16800_mean_upper is -0.339331 here
# but 827.195122 in the recomputed table - confirm which one the model expects.
df_custom = pd.read_csv("../processed_data/batch_corr_MI_10_custom_FS.csv",index_col=0)
df_custom[df_custom['Sensor ID'] == 1]

Unnamed: 0,23_6000_mean_middle,23_16800_mean_middle,18_3000_mean_middle,18_16800_mean_middle,23_10000_mean_middle,13_6000_mean_middle,18_6000_mean_middle,13_3000_mean_middle,18_8000_mean_middle,13_10000_mean_middle,Sensor ID,13_16800_mean_upper
0,1328.2,1302.076923,1069.24,1069.285714,1303.0,827.0,1069.4,827.0,1069.612245,827.0,1,-0.339331
