## Helper functions for processing signals

In [None]:
import numpy as np

def slice_iq(iq: np.ndarray, frame_len: int = 2048, hop_len: int | None = None):
    """
    iq: 1‑D complex128 array [I + 1jQ]  
    frame_len: samples per frame  
    hop_len:   overlap; default = frame_len (no overlap)
    """
    if hop_len is None:
        hop_len = frame_len
    num_frames = 1 + (len(iq) - frame_len) // hop_len
    frames = np.lib.stride_tricks.as_strided(
        iq,
        shape=(num_frames, frame_len),
        strides=(iq.strides[0]*hop_len, iq.strides[0]),
        writeable=False,
    )
    return frames.copy()          # make it C‑contiguous


In [None]:
def rms_norm(frames: np.ndarray, eps: float = 1e-12):
    """
    Divide every row of `frames` by its root-mean-square magnitude.

    frames: 2-D array, one frame per row
    eps:    added to each row's RMS so an all-zero row is divided by eps
            rather than by zero
    """
    magnitude = np.abs(frames)
    mean_square = (magnitude ** 2).mean(axis=1, keepdims=True)
    rms = np.sqrt(mean_square) + eps
    return frames / rms


In [None]:
from numpy.fft import fft

def log_mag_fft(frames: np.ndarray):
    """
    Return log(1 + |FFT|) of each row, restricted to bins 1 .. N/2 - 1.

    With N = frames.shape[1], bin 0 (DC) and every bin from N/2 (Nyquist)
    upwards are dropped, leaving N/2 - 1 columns per frame.
    """
    nyquist_bin = frames.shape[1] // 2
    spectrum = fft(frames, axis=1)
    kept_bins = spectrum[:, 1:nyquist_bin]
    return np.log1p(np.abs(kept_bins))          # log(1 + |X|)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def preprocess(X: np.ndarray, pca_energy: float | None = 0.95):
    """
    Standardise the columns of X and optionally reduce them with PCA.

    X:          feature matrix, one sample per row
    pca_energy: value handed to PCA as n_components; a falsy value
                (None or 0) skips PCA

    Returns (Xz, scaler, pca): the transformed data, the fitted
    StandardScaler, and the fitted PCA (None when PCA was skipped).
    """
    scaler = StandardScaler()
    scaler.fit(X)
    Xz = scaler.transform(X)

    if not pca_energy:
        return Xz, scaler, None

    pca = PCA(n_components=pca_energy, svd_solver="full")
    pca.fit(Xz)
    return pca.transform(Xz), scaler, pca


In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score

def sweep_k(X, ks=(2,3,4,5,6,8,10), minibatch=True, **kmeans_kw):
    """
    Fit k-means for every k in `ks` and collect two model-selection curves.

    X:          (n_samples, n_features) array to cluster
    ks:         iterable of cluster counts to try
    minibatch:  True -> MiniBatchKMeans, False -> full-batch KMeans
    kmeans_kw:  extra keyword arguments for the estimator; they override
                the defaults random_state=0 and n_init="auto"

    Returns (inertias, sils): two lists aligned with `ks`, holding the
    within-cluster sum of squares and the mean silhouette score.
    """
    # Honour the `minibatch` flag (it used to be ignored in favour of KMeans).
    estimator_cls = MiniBatchKMeans if minibatch else KMeans
    # Caller-supplied keywords win, so passing e.g. random_state no longer
    # collides with the defaults.
    params = {"random_state": 0, "n_init": "auto", **kmeans_kw}

    inertias, sils = [], []
    for k in ks:
        print(f"Fitting k={k}...X shape = {X.shape}")
        km = estimator_cls(n_clusters=k, **params).fit(X)
        inertias.append(km.inertia_)
        sils.append(silhouette_score(X, km.labels_))
    return inertias, sils


## Loading signal data from directory
This section loads signal data from a directory and processes it so that it is ready for clustering.

In [None]:
import pandas as pd
import pathlib
from tqdm import tqdm
import matplotlib.pyplot as plt

def load_and_process_directory(data_dir, frame_len=2048, hop_len=None, max_files=None,
                               sample_size=10000):
    """
    Load all CSV files from a directory, plot each I/Q constellation and
    estimate its number of clusters with a k-means sweep.

    Parameters:
    - data_dir: Path to directory containing CSV files
    - frame_len: Unused by this function; kept for call compatibility
    - hop_len: Unused by this function; kept for call compatibility
    - max_files: Maximum number of files to process (None = all files)
    - sample_size: Maximum number of (I, Q) points drawn at random from each
      file for the k-means sweep (None = use every point)

    Returns:
    - X: (n_points, 2) array of the sampled [I, Q] points from all files,
      or None if no file could be processed
    - file_info: List with one dict per processed file (keys: file_name,
      file_index, num_points, best_k, inertias, silhouette_scores),
      or None if no file could be processed
    """

    data_path = pathlib.Path(data_dir)
    csv_files = sorted(data_path.glob('*.csv'))

    if max_files:
        csv_files = csv_files[:max_files]

    print(f"Found {len(csv_files)} CSV files in {data_dir}")

    # Marker rows in column I that are labels rather than numeric samples
    non_numeric_rows = ['TimestampOffset', 'TriggerPosition', 'FastFrameID',
                        'IDInFastFrame', 'TotalInFastFrame']
    candidate_ks = (4, 8, 16, 64)

    all_points = []
    file_info = []

    for file_idx, csv_file in enumerate(tqdm(csv_files, desc="Processing files")):
        # Best-effort per file: report the problem and move on to the next one
        try:
            print(f"Loading {csv_file.name}...", end=" ")

            # Skip first 10 rows (metadata) and load I/Q data
            df = pd.read_csv(csv_file, skiprows=10, names=['I', 'Q'])

            # Remove any non-numeric rows
            df = df.loc[~df['I'].isin(non_numeric_rows)]
            i_vals = df['I'].astype('float').to_numpy()
            q_vals = df['Q'].astype('float').to_numpy()

            # One row per sample: [I_k, Q_k].  (vstack + reshape would pair
            # consecutive I values with each other instead of I with Q.)
            iq_signal = np.column_stack((i_vals, q_vals))
            print(f"Shape: {iq_signal.shape}")

            # Never ask for more points than the file holds
            total_points = iq_signal.shape[0]
            num_points = total_points if sample_size is None else min(sample_size, total_points)
            idx = np.random.choice(total_points, size=num_points, replace=False)
            sampled = iq_signal[idx]

            plt.plot(i_vals, q_vals, 'o', markersize=1, alpha=0.5)
            plt.show()

            # Identify number of clusters; the silhouette score needs
            # fewer clusters than samples
            ks = [k for k in candidate_ks if k < num_points]
            if not ks:
                print(f"Too few samples ({num_points}) for a k sweep, skipping")
                continue
            inertias, sils = sweep_k(sampled, ks)

            best_k = ks[int(np.argmax(sils))]
            print(f"Best k based on silhouette score: {best_k}")
            print(f"Silhouette scores: {dict(zip(ks, [f'{s:.3f}' for s in sils]))}")

            all_points.append(sampled)
            file_info.append({
                'file_name': csv_file.name,
                'file_index': file_idx,
                'num_points': num_points,
                'best_k': best_k,
                'inertias': {k: float(v) for k, v in zip(ks, inertias)},
                'silhouette_scores': {k: float(s) for k, s in zip(ks, sils)},
            })

        except Exception as e:
            print(f"Error processing {csv_file.name}: {e}")
            continue

    # Return what the docstring promises so callers can unpack two values
    if not all_points:
        print("No features extracted!")
        return None, None

    X = np.vstack(all_points)
    print(f"\nTotal sampled points shape: {X.shape}")
    return X, file_info


# Folder holding the CSV captures to analyse -- edit this for your machine
data_directory = "C:/Users/UserAdmin/Desktop/Jason - Signal Classification/AI Models/Data/synthetic/visual"

# Run the loader over every CSV file in that folder.
# hop_len equals frame_len, i.e. no overlap; max_files=None means all files.
X_raw, file_info = load_and_process_directory(
    data_dir=data_directory, frame_len=20000, hop_len=20000, max_files=None
)

In [None]:
import pandas as pd
import pathlib
from tqdm import tqdm

def load_and_process_directory(data_dir, frame_len=2048, hop_len=None, max_files=None):
    """
    Load all CSV files from directory and process them for clustering

    Each file is read as a complex I + 1jQ signal, cut into frames,
    RMS-normalised and converted to log-magnitude FFT features.

    Parameters:
    - data_dir: Path to directory containing CSV files
    - frame_len: Frame length for slicing (default: 2048)
    - hop_len: Step between frame starts (default: frame_len, no overlap)
    - max_files: Maximum number of files to process (None = all files)

    Returns:
    - X: Real-valued feature matrix with one row per frame, or None if no
      frame could be extracted
    - file_info: List with one dict per row of X (keys: file_name,
      file_index, frame_index, total_frames), or None if no frame could
      be extracted
    """

    data_path = pathlib.Path(data_dir)
    csv_files = sorted(data_path.glob('*.csv'))

    if max_files:
        csv_files = csv_files[:max_files]

    print(f"Found {len(csv_files)} CSV files in {data_dir}")

    # Marker rows in column I that are labels rather than numeric samples
    non_numeric_rows = ['TimestampOffset', 'TriggerPosition', 'FastFrameID',
                        'IDInFastFrame', 'TotalInFastFrame']

    all_features = []
    file_info = []

    for file_idx, csv_file in enumerate(tqdm(csv_files, desc="Processing files")):
        # Best-effort per file: report the problem and move on to the next one
        try:
            print(f"Loading {csv_file.name}...", end=" ")

            # Skip first 10 rows (metadata) and load I/Q data
            df = pd.read_csv(csv_file, skiprows=10, names=['I', 'Q'])

            # Remove any non-numeric rows
            df = df.loc[~df['I'].isin(non_numeric_rows)]
            i_vals = df['I'].astype('float').to_numpy()
            q_vals = df['Q'].astype('float').to_numpy()

            # Convert to a 1-D complex IQ signal, which is what slice_iq
            # walks.  (A reshaped vstack mis-pairs the samples and only its
            # first column would be framed.)
            iq_signal = i_vals + 1j * q_vals

            print(f"Shape: {iq_signal.shape}")

            if iq_signal.shape[0] < frame_len:
                print(f"  -> shorter than one frame ({frame_len} samples), skipping")
                continue

            # Slice into frames
            frames = slice_iq(iq_signal, frame_len=frame_len, hop_len=hop_len)
            print(f"  -> {frames.shape[0]} frames")

            # Normalize frames
            frames_norm = rms_norm(frames)

            # Extract FFT features.  These are real-valued, which the
            # scaler / PCA / k-means steps applied to X afterwards require.
            features = log_mag_fft(frames_norm)

            # Store features and track which file they came from
            all_features.append(features)

            # Track file info for each frame
            num_frames = features.shape[0]
            file_info.extend(
                {
                    'file_name': csv_file.name,
                    'file_index': file_idx,
                    'frame_index': frame_idx,
                    'total_frames': num_frames,
                }
                for frame_idx in range(num_frames)
            )

        except Exception as e:
            print(f"Error processing {csv_file.name}: {e}")
            continue

    # Combine all features
    if not all_features:
        print("No features extracted!")
        return None, None

    X = np.vstack(all_features)
    print(f"\nTotal features shape: {X.shape}")
    print(f"Total frames from {len(csv_files)} files: {len(file_info)}")
    return X, file_info

# Folder holding the CSV captures to cluster -- edit this for your machine
data_directory = "C:/Users/UserAdmin/Desktop/Jason - Signal Classification/AI Models/Data/synthetic/synthetic_set0"

# Build the feature matrix from every CSV file in that folder.
# hop_len equals frame_len, i.e. no overlap; max_files=None means all files.
X_raw, file_info = load_and_process_directory(
    data_dir=data_directory, frame_len=20000, hop_len=20000, max_files=None
)

In [None]:
# Display the dimensions of the feature matrix returned by the loader above.
# NOTE(review): the loader returns None when nothing was extracted, in which
# case this line raises AttributeError.
X_raw.shape

In [None]:
# Standardise the raw features (and apply PCA) ahead of clustering
if X_raw is None:
    print("No data loaded. Please check your directory path and file format.")
else:
    print("Preprocessing features...")
    Xz, scaler, pca = preprocess(X_raw, pca_energy=0.95)

    print(f"Original features shape: {X_raw.shape}")
    print(f"After preprocessing: {Xz.shape}")
    if pca is not None:
        print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
        print(f"Number of components: {pca.n_components_}")

    # Tally how many rows of the feature matrix each source file contributed
    file_counts = {}
    for info in file_info:
        file_name = info['file_name']
        file_counts[file_name] = file_counts.get(file_name, 0) + 1

    print(f"\nFrames per file:")
    for file_name, count in file_counts.items():
        print(f"  {file_name}: {count} frames")

## Clustering

In [None]:
import matplotlib.pyplot as plt

# Sweep k over 2..10 and plot the elbow and silhouette curves side by side
print("Running K-means clustering analysis...")

ks = range(2, 11)
inertias, sils = sweep_k(Xz, ks)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# (axes, curve, title, y-axis label) for the left and right panels
panels = (
    (ax[0], inertias, "Elbow Method (WCSS)", "Within-Cluster Sum of Squares"),
    (ax[1], sils, "Average Silhouette Score", "Silhouette Score"),
)
for panel_ax, curve, panel_title, y_label in panels:
    panel_ax.plot(ks, curve, "o-")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Number of Clusters (k)")
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Pick the k whose mean silhouette score is highest
best_k = ks[int(np.argmax(sils))]
print(f"Best k based on silhouette score: {best_k}")
score_table = {k: f'{s:.3f}' for k, s in zip(ks, sils)}
print(f"Silhouette scores: {score_table}")



In [None]:
# Fit the final k-means model with the k chosen by the silhouette sweep

print(f"Performing final clustering with k={best_k}...")

km = KMeans(n_clusters=best_k, random_state=0, n_init="auto")
km.fit(Xz)
labels = km.labels_
centroids = km.cluster_centers_

print(f"Clustering completed!")
print(f"Cluster assignments:")
unique_labels, counts = np.unique(labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    percentage = count / len(labels) * 100
    print(f"  Cluster {label}: {count} frames ({percentage:.1f}%)")

# Count, for every source file, how many of its frames fell in each cluster
print(f"\nCluster distribution by file:")
cluster_by_file = {}
for i, info in enumerate(file_info):
    cluster = labels[i]
    per_file = cluster_by_file.setdefault(info['file_name'], {})
    per_file[cluster] = per_file.get(cluster, 0) + 1

for file_name, clusters in cluster_by_file.items():
    total_frames = sum(clusters.values())
    print(f"\n  {file_name} ({total_frames} frames):")
    for cluster, count in sorted(clusters.items()):
        percentage = count / total_frames * 100
        print(f"    Cluster {cluster}: {count} frames ({percentage:.1f}%)")


In [None]:
import umap

# Embed the preprocessed features in 2-D with UMAP and draw them by cluster
print("Creating UMAP visualization...")

reducer = umap.UMAP(n_neighbors=50, min_dist=0.1, metric="euclidean",
                    random_state=0)
embedding = reducer.fit_transform(Xz)

plt.figure(figsize=(12, 6))

# Scatter plot: one tab10 colour per cluster label
colors = plt.cm.tab10(labels)
plt.scatter(embedding[:, 0], embedding[:, 1], c=colors, s=3, alpha=0.7)
plt.title(f"UMAP: Colored by Cluster (k={best_k})")
plt.axis("off")

# Legend: one marker per cluster, annotated with the cluster's size
unique_labels = np.unique(labels)
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w',
               markerfacecolor=plt.cm.tab10(label), markersize=8,
               label=f'Cluster {label} ({np.sum(labels == label)} points)')
    for label in unique_labels
]
plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))

# Text listing that maps each file index to its file name
print("File index legend:")
unique_files = {info['file_index']: info['file_name'] for info in file_info}

for idx in sorted(unique_files):
    print(f"  Index {idx}: {unique_files[idx]}")
        

In [None]:
import joblib, pathlib, json

# Persist the fitted models, run metadata and per-frame results under models/
print("Saving clustering model and results...")

pathlib.Path("models").mkdir(exist_ok=True)

files_processed = len({info['file_name'] for info in file_info})

# Save the trained models
model_data = {
    "scaler": scaler,
    "pca": pca,
    "kmeans": km,
    "best_k": best_k,
    "feature_shape": X_raw.shape,
    "processed_shape": Xz.shape
}
joblib.dump(model_data, "models/clustering_model.pkl")

# Save metadata
meta_data = {
    # Framing parameters passed to load_and_process_directory when X_raw was
    # built (20000 / 20000, not the function defaults) -- keep in sync with
    # that call.
    "frame_len": 20000,
    "hop_len": 20000,
    # int(): n_components_ may be a NumPy integer, which json cannot serialise
    "pca_components": int(pca.n_components_) if pca is not None else None,
    "pca_variance_explained": float(pca.explained_variance_ratio_.sum()) if pca is not None else None,
    "num_clusters": best_k,
    "total_frames": len(labels),
    "files_processed": files_processed
}
# Context managers close the files, so the JSON is flushed to disk right away
with open("models/clustering_meta.json", "w") as meta_file:
    json.dump(meta_data, meta_file, indent=2)

# Save detailed results
results_data = {
    "file_info": file_info,
    "cluster_labels": labels.tolist(),
    "cluster_counts": {int(k): int(v) for k, v in zip(*np.unique(labels, return_counts=True))}
}
with open("models/clustering_results.json", "w") as results_file:
    json.dump(results_data, results_file, indent=2)

print(f"✓ Saved clustering model to: models/clustering_model.pkl")
print(f"✓ Saved metadata to: models/clustering_meta.json")
print(f"✓ Saved results to: models/clustering_results.json")
print(f"✓ Processed {files_processed} files")
print(f"✓ Generated {len(labels)} feature vectors")
print(f"✓ Found {best_k} optimal clusters")
