In [None]:
import os

# Set the environment variable before importing numpy/sklearn
os.environ['OMP_NUM_THREADS'] = '1'

import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from pathlib import Path
from sklearn.cluster import KMeans
from collections import Counter
from datetime import datetime
import shutil
import glob

In [None]:
input_folder_path = r"D:\IFCB_05_20"
output_folder_path = r"D:\test0"

number_of_clusters = 5
number_of_elements_in_clusters = 6

# length of depth_map should be equal to number_of_elements_in_clusters
folder_name_map = {
    0 : '2_m',
    1 : '6.5_m',
    2 : '7_m',
    3 : '8_m',
    4 : '10_m',
    5 : '16_m' # assuming 6 elements in total
}

In [None]:
import matplotlib.pyplot as plt

timing = list({file.stem.split("_")[0] for file in Path(input_folder_path).iterdir() if file.is_file()})

# Convert to Unix timestamps and sort
timing_unix = [convert_to_unix_time(time) for time in timing]
timing_unix = sorted([(time-min(timing_unix)) for time in timing_unix])

print(f"timing_unix: {timing_unix}")
plt.plot(timing_unix, 'o-')


In [None]:
points = [(i, timing_unix[i]) for i in range(len(timing_unix))]
slope = (timing_unix[1] - timing_unix[0])

distance_list = []
for point in points:
    dist = distance_point_to_line_slope_intercept(point, slope)
    distance_list.append(dist)

distance_list = sorted(distance_list)
print(f"distance_list: {distance_list}")

plt.plot(distance_list, 'o-')

#[(0, [2]), (1, [4]), (2, [3]), (4, [2, 3, 4, 5])]


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns

def kmeans_1d_clustering(data, k=3, plot=True, find_optimal_k=False, max_k=10):
    """
    Apply K-means clustering to 1D data and visualize results.
    
    Parameters:
    -----------
    data : list or array-like
        1D list or array of numbers
    k : int, default=3
        Number of clusters
    plot : bool, default=True
        Whether to create visualizations
    find_optimal_k : bool, default=False
        Whether to find optimal k using elbow method and silhouette score
    max_k : int, default=10
        Maximum k to test when finding optimal k
    
    Returns:
    --------
    dict : Dictionary containing clustering results
        - 'labels': cluster labels for each data point
        - 'centers': cluster centers
        - 'inertia': within-cluster sum of squares
        - 'silhouette_score': silhouette score (if k > 1)
        - 'optimal_k': optimal k (if find_optimal_k=True)
    """
    
    # Convert to numpy array and reshape for sklearn
    data = np.array(data)
    X = data.reshape(-1, 1)
    
    results = {}
    
    # Find optimal k if requested
    if find_optimal_k:
        optimal_k = find_optimal_clusters(X, max_k, plot)
        results['optimal_k'] = optimal_k
        k = optimal_k
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_.flatten()
    
    # Store results
    results.update({
        'labels': labels,
        'centers': centers,
        'inertia': kmeans.inertia_,
        'silhouette_score': silhouette_score(X, labels) if k > 1 else None
    })
    
    # Create visualizations
    if plot:
        create_visualizations(data, labels, centers, k)
    
    return results

def find_optimal_clusters(X, max_k, plot=True):
    """Find optimal number of clusters using elbow method and silhouette analysis."""
    
    inertias = []
    silhouette_scores = []
    k_range = range(2, max_k + 1)
    
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(X, labels))
    
    if plot:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
        
        # Elbow method plot
        ax1.plot(k_range, inertias, 'bo-')
        ax1.set_xlabel('Number of Clusters (k)')
        ax1.set_ylabel('Inertia (Within-cluster Sum of Squares)')
        ax1.set_title('Elbow Method for Optimal k')
        ax1.grid(True, alpha=0.3)
        
        # Silhouette score plot
        ax2.plot(k_range, silhouette_scores, 'ro-')
        ax2.set_xlabel('Number of Clusters (k)')
        ax2.set_ylabel('Silhouette Score')
        ax2.set_title('Silhouette Analysis for Optimal k')
        ax2.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
    
    # Find optimal k (highest silhouette score)
    optimal_k = k_range[np.argmax(silhouette_scores)]
    return optimal_k

def create_visualizations(data, labels, centers, k):
    """Create multiple visualizations of the clustering results."""
    
    # Set up colors
    colors = plt.cm.Set1(np.linspace(0, 1, k))
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # 1. Scatter plot with clusters
    ax1 = axes[0, 0]
    for i in range(k):
        cluster_data = data[labels == i]
        ax1.scatter(cluster_data, [i] * len(cluster_data), 
                   c=[colors[i]], label=f'Cluster {i}', alpha=0.7, s=50)
    
    # Plot centers
    for i, center in enumerate(centers):
        ax1.axvline(x=center, color=colors[i], linestyle='--', alpha=0.8, linewidth=2)
    
    ax1.set_xlabel('Data Values')
    ax1.set_ylabel('Cluster')
    ax1.set_title('K-means Clustering Results')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 2. Histogram with cluster colors
    ax2 = axes[0, 1]
    for i in range(k):
        cluster_data = data[labels == i]
        ax2.hist(cluster_data, bins=20, alpha=0.6, color=colors[i], 
                label=f'Cluster {i}', edgecolor='black', linewidth=0.5)
    
    for i, center in enumerate(centers):
        ax2.axvline(x=center, color=colors[i], linestyle='--', linewidth=2)
    
    ax2.set_xlabel('Data Values')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Distribution of Clusters')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # 3. Line plot showing data points in order
    ax3 = axes[1, 0]
    for i in range(k):
        cluster_indices = np.where(labels == i)[0]
        ax3.scatter(cluster_indices, data[cluster_indices], 
                   c=[colors[i]], label=f'Cluster {i}', alpha=0.7, s=30)
    
    ax3.plot(range(len(data)), data, 'k-', alpha=0.3, linewidth=1)
    ax3.set_xlabel('Data Point Index')
    ax3.set_ylabel('Data Value')
    ax3.set_title('Clusters by Original Order')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # 4. Box plot of clusters
    ax4 = axes[1, 1]
    cluster_data = [data[labels == i] for i in range(k)]
    bp = ax4.boxplot(cluster_data, patch_artist=True, labels=[f'Cluster {i}' for i in range(k)])
    
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    
    ax4.set_xlabel('Cluster')
    ax4.set_ylabel('Data Values')
    ax4.set_title('Box Plot of Clusters')
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print cluster statistics
    print("\nCluster Statistics:")
    print("-" * 50)
    for i in range(k):
        cluster_data = data[labels == i]
        print(f"Cluster {i}:")
        print(f"  Size: {len(cluster_data)}")
        print(f"  Center: {centers[i]:.3f}")
        print(f"  Mean: {np.mean(cluster_data):.3f}")
        print(f"  Std: {np.std(cluster_data):.3f}")
        print(f"  Range: [{np.min(cluster_data):.3f}, {np.max(cluster_data):.3f}]")
        print()

# Example usage
if __name__ == "__main__":
    # Generate sample data
    sample_data = distance_list
    
    print("Example 1: Basic clustering with k=3")
    results = kmeans_1d_clustering(sample_data, k=5)
    
    print(f"Inertia: {results['inertia']:.3f}")
    print(f"Silhouette Score: {results['silhouette_score']:.3f}")
    
    print("\n" + "="*60 + "\n")

In [None]:
def convert_to_unix_time(date_string):
    """
    Convert date string in format 'D20250520T000334' to Unix timestamp.
    
    Args:
        date_string (str): Date in format DYYYYMMDDTHHMMSS
    
    Returns:
        int: Unix timestamp
    """
    # Remove the 'D' prefix and split date and time parts
    clean_string = date_string[1:]  # Remove 'D'
    date_part = clean_string[:8]    # YYYYMMDD
    time_part = clean_string[9:]    # HHMMSS (skip the 'T')
    
    # Parse the datetime
    dt = datetime.strptime(date_part + time_part, '%Y%m%d%H%M%S')
    
    # Convert to Unix timestamp
    return int(dt.timestamp())

def unix_to_original_format(unix_timestamp):
    """
    Convert Unix timestamp back to original format 'DYYYYMMDDTHHMMSS'.
    
    Args:
        unix_timestamp (int or float): Unix timestamp
    
    Returns:
        str: Date string in format DYYYYMMDDTHHMMSS
    """
    # Convert Unix timestamp to datetime object
    dt = datetime.fromtimestamp(unix_timestamp)
    
    # Format as DYYYYMMDDTHHMMSS
    formatted_date = dt.strftime('D%Y%m%dT%H%M%S')
    
    return formatted_date

def distance_point_to_line_slope_intercept(point, slope):
    """
    Calculate distance from point to line y = mx + b.
    
    Args:
        point: (x, y) coordinates of the point
        slope: slope of the line (m)
        intercept: y-intercept of the line (b)
    
    Returns:
        Distance from point to line
    """
    x0, y0 = point
    
    # Convert y = mx + b to ax + by + c = 0 form: mx - y + b = 0
    a = slope
    b = -1
    
    # Distance formula: |ax0 + by0 + c| / sqrt(a^2 + b^2)
    distance = abs(a * x0 + b * y0 ) / np.sqrt(a**2 + b**2)
    
    return distance

def find_small_clusters(data, n_clusters=5, min_size=6):
    """
    Perform K-means clustering, sort clusters by center, and return small clusters info.
    
    Args:
        data: List or array of data points
        n_clusters: Number of clusters for K-means
        min_size: Minimum required cluster size
    
    Returns:
        List of tuples: [(sorted_cluster_index, actual_size), ...]
        Only includes clusters with size < min_size
    """
    # Convert to numpy array and perform K-means
    X = np.array(data).reshape(-1, 1)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_.flatten()
    
    # Count cluster sizes
    cluster_counts = Counter(labels)

    # print(f"Cluster counts: {cluster_counts}")
    
    # Create mapping of original cluster index to sorted index
    # Sort clusters by their center values
    cluster_center_pairs = [(i, centers[i]) for i in range(n_clusters)]

    # print(f"Cluster center pairs before sorting: {cluster_center_pairs}")

    sorted_clusters = sorted(cluster_center_pairs, key=lambda x: x[1])

    # print(f"Sorted cluster center pairs: {sorted_clusters}")
    
    # Create mapping: original_index -> sorted_index
    original_to_sorted = {original_idx: sorted_idx 
                         for sorted_idx, (original_idx, _) in enumerate(sorted_clusters)}
    
    # Find clusters with size < min_size and return with sorted indices
    small_clusters = []
    for original_idx, size in cluster_counts.items():
        if size < min_size:
            sorted_idx = original_to_sorted[original_idx]
            small_clusters.append((sorted_idx, size))
    # print(f"Small clusters before sorting: {small_clusters}")
    # Sort the result by sorted cluster index
    small_clusters.sort(key=lambda x: x[0])
    # print(f"Small clusters after sorting: {small_clusters}")
    
    return small_clusters

def gap_analysis(data, number_of_elements_in_clusters=6, tolerance=1.5):
    # print(f"Analyzing gap in data: {data}")
    gaps = [data[i + 1] - data[i] for i in range(len(data) - 1)]
    avg_gap = sum(gaps) / len(gaps)

    hole_found = 0

    # Identify gaps larger than average gap multiplied by tolerance
    for gap in gaps:
        if gap > (avg_gap * tolerance):
            #print(f"Holes of size {gap} found (average gap: {avg_gap})")
            data.insert((gaps.index(gap) + 1), 'missing_data')
            hole_found += 1

    # print(f"Number of holes found: {hole_found}")
    
    # If you don't find the hole in data then it must be at the end
    if hole_found != (number_of_elements_in_clusters - len(data)):
        for i in range(number_of_elements_in_clusters - len(data) - hole_found):
            data.append('missing_data')
            
    return [i for i, value in enumerate(data) if value == 'missing_data']

def find_missing_data_points(input_folder_path, number_of_clusters = 5, number_of_elements_in_clusters = 6):

    # Extract date/time
    timing = list({file.stem.split("_")[0] for file in Path(input_folder_path).iterdir() if file.is_file()})

    # Convert to Unix timestamps and sort
    timing_unix = [convert_to_unix_time(time) for time in timing]
    timing_unix = sorted([(time-min(timing_unix)) for time in timing_unix])

    # Convert to distance to make clustering more consistent
    points = [(i, timing_unix[i]) for i in range(len(timing_unix))]
    slope = (timing_unix[1] - timing_unix[0])

    distance_list = []
    for point in points:
        dist = distance_point_to_line_slope_intercept(point, slope)
        distance_list.append(dist)
    
    # Perform K-means clustering to find which cluster has less than expected number of elements
    clustering_results = find_small_clusters(distance_list, n_clusters=number_of_clusters, min_size=number_of_elements_in_clusters)

    print(f"Clustering results: {clustering_results}")
    
    # Analyze gap in each cluster for missing data points
    missing_data = []

    processed_data = 0
    for cluster_index, cluster_size in clustering_results:

        # print(f"Processing Cluster {cluster_index} with size {cluster_size}")
        start_idx = processed_data
        # print(f"start_idx: {start_idx}")
        end_idx = start_idx + cluster_size
        # print(f"end_idx: {end_idx}")

        cluster_data = timing_unix[start_idx:end_idx]
        missing_data_points_index = gap_analysis(cluster_data, number_of_elements_in_clusters)
        
        missing_data.append((cluster_index, missing_data_points_index))
        # print(f"Missing data point(s) at position {missing_data_points_index} in Cluster {cluster_index} ")
        processed_data += cluster_size
    
    return missing_data

print(find_missing_data_points(input_folder_path, number_of_clusters, number_of_elements_in_clusters))

# [0, 2352, 6453, 8606, 11282, 17982, 22344, 24413, 26561, 29215, 35985, 38321, 40344, 42413, 44562, 53977, 56318, 58342, 60412, 62563, 65225, 71979, 74325, 76348, 78426, 80575]

In [14]:
timing = list({file.stem.split("_")[0] for file in Path(input_folder_path).iterdir() if file.is_file()})

timing_unix = [convert_to_unix_time(time) for time in timing]

start_time = min(timing_unix)
sorted_timing_unix = sorted([(time-start_time) for time in timing_unix])

missing_data = find_missing_data_points(input_folder_path, number_of_clusters, number_of_elements_in_clusters)

missing_data_indices = []

for cluster_index, missing_indices in missing_data:
    absolute_indices_of_missing_data = [cluster_index * number_of_elements_in_clusters + idx for idx in missing_indices]
    missing_data_indices.extend(absolute_indices_of_missing_data)

print(f"Missing data indices: {missing_data_indices}")

shifting = 0
for idx in missing_data_indices:
    sorted_timing_unix.insert(idx + shifting, 'missing_data')
    shifting += 1

print(f"Sorted timing with missing data: {sorted_timing_unix}")
print(f"length: {len(sorted_timing_unix)}")

# Convert back to original format
sorted_timing_original = [unix_to_original_format(time+start_time) if time != 'missing_data' else 'missing_data' for time in sorted_timing_unix]
# print(f"Sorted timing in original format: {sorted_timing_original}")

for index, date_time in enumerate(sorted_timing_original):
    print(f"{index}: {date_time}")

    if date_time != 'missing_data':
        
        pattern = os.path.join(input_folder_path, f"{date_time}*")

        # Extract YYYYMMDD from the date string
        folder_name = date_time[1:9]    
    
        # Get all matching files
        matching_files = glob.glob(pattern)

        for file in matching_files:
            try:
                base_name = os.path.basename(file)

                depth = folder_name_map[int(index % number_of_elements_in_clusters)]
                desination_folder = os.path.join(output_folder_path, folder_name, depth)
                os.makedirs(desination_folder, exist_ok=True)

                shutil.copy2(file, os.path.join(desination_folder, base_name))
                print(f"Copy {file} to {os.path.join(desination_folder, base_name)}")
            except Exception as e:
                print(f"Error processing file {file}: {e}")
                continue

Clustering results: [(0, 5), (1, 5), (2, 5), (4, 5)]
Missing data indices: [5, 7, 17, 29]
Sorted timing with missing data: [0, 2352, 6453, 8606, 11282, 'missing_data', 17982, 22344, 'missing_data', 24413, 26561, 29215, 35985, 38321, 40344, 42413, 44562, 53977, 56318, 'missing_data', 58342, 60412, 62563, 65225, 71979, 74325, 76348, 78426, 80575, 'missing_data']
length: 30
0: D20250520T134448
Copy D:\IFCB_05_20\D20250520T134448_IFCB135.roi to D:\test0\20250520\2_m\D20250520T134448_IFCB135.roi
Copy D:\IFCB_05_20\D20250520T134448_IFCB135.hdr to D:\test0\20250520\2_m\D20250520T134448_IFCB135.hdr
Copy D:\IFCB_05_20\D20250520T134448_IFCB135.adc to D:\test0\20250520\2_m\D20250520T134448_IFCB135.adc
1: D20250520T142400
Copy D:\IFCB_05_20\D20250520T142400_IFCB135.hdr to D:\test0\20250520\6.5_m\D20250520T142400_IFCB135.hdr
Copy D:\IFCB_05_20\D20250520T142400_IFCB135.roi to D:\test0\20250520\6.5_m\D20250520T142400_IFCB135.roi
Copy D:\IFCB_05_20\D20250520T142400_IFCB135.adc to D:\test0\20250520\6.5