This cell is used to create the ground truth for the data.

In [None]:
"""
Process original PPM data and generate ground truth
Read raw data following the csv_conversion.ipynb method, but resample to 256 GHz instead of 32 GHz
No noise added, used for generating ground truth
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# Set global font
matplotlib.rcParams['font.family'] = 'Arial'

# Define input filename (original PPM data)
input_filename = "../csv/8PPM_500Mbps.csv"

# Define target resampling frequency as 256G
target_frequency_hz = 256 * 1e9  # 256 GHz
resampling_interval_s = 1 / target_frequency_hz

print(f"Target resampling frequency: {target_frequency_hz/1e9} GHz")
print(f"Resampling interval: {resampling_interval_s:.2e} s")


def uniform_time_axis(duration_s, interval_s):
    """Return the sample instants ``0, dt, 2*dt, ...`` lying strictly below ``duration_s``.

    ``np.arange(0, duration_s, interval_s)`` derives its length from
    ``ceil(duration_s / interval_s)``; with a floating-point step that ratio
    can land a hair above or below an integer, so the last sample appears or
    disappears depending on rounding. Here the sample count is computed
    explicitly: a ratio that is an integer up to floating-point noise is
    snapped to that integer, anything else is rounded up.

    Args:
        duration_s: Length of the half-open interval ``[0, duration_s)``.
        interval_s: Spacing between consecutive samples.

    Returns:
        1-D float ``numpy`` array of sample times; empty when either argument
        is not strictly positive.
    """
    if not (duration_s > 0 and interval_s > 0):
        return np.empty(0, dtype=float)

    ratio = float(duration_s) / float(interval_s)
    nearest = float(np.rint(ratio))
    # Relative tolerance far above float rounding noise, far below one sample
    if abs(ratio - nearest) <= 1e-10 * max(1.0, nearest):
        n_samples = int(nearest)
    else:
        n_samples = int(np.ceil(ratio))

    # Integer index times step yields the same values arange would produce
    return np.arange(n_samples) * interval_s


try:
    # Read CSV file (first row is the header)
    df = pd.read_csv(input_filename, header=0)

    # Rename columns for easier processing
    df.columns = ['original_time', 'original_data']

    # Convert to numeric type; unparsable cells become NaN
    df['original_time'] = pd.to_numeric(df['original_time'], errors='coerce')
    df['original_data'] = pd.to_numeric(df['original_data'], errors='coerce')

    # Remove rows containing NaN
    df.dropna(subset=['original_time', 'original_data'], inplace=True)

    if df.empty:
        print(f"Error: No valid numeric data found in file {input_filename}")
    else:
        # Adjust time axis to start from 0
        min_time = df['original_time'].min()
        df['time'] = df['original_time'] - min_time

        # Prepare for resampling
        start_resample_time = 0
        end_resample_time = df['time'].max()

        # Uniform grid over [0, end) with a deterministic number of samples
        new_time_axis = start_resample_time + uniform_time_axis(
            end_resample_time - start_resample_time, resampling_interval_s
        )

        # Perform linear interpolation (np.interp needs increasing x values)
        df_sorted = df.sort_values(by='time')
        resampled_data_values = np.interp(new_time_axis, df_sorted['time'], df_sorted['original_data'])

        # Create resampled DataFrame
        df_256ghz = pd.DataFrame({'time': new_time_axis, 'data': resampled_data_values})

        print(f"Processing complete")
        print(f"Original data shape: {df.shape}")
        print(f"Data shape after 256G resampling: {df_256ghz.shape}")
        print(f"Data time range: 0 to {df_256ghz['time'].max():.2e} s")
        print("First few rows of data:")
        print(df_256ghz.head())

        # Visualize first 10ns of data
        plt.figure(figsize=(12, 6))
        time_limit = 1e-8
        mask = df_256ghz['time'] <= time_limit
        plt.plot(df_256ghz.loc[mask, 'time'], df_256ghz.loc[mask, 'data'])
        plt.xlabel('Time (s)')
        plt.ylabel('Data')
        plt.title('PPM Data after 256G Resampling (First 10ns)')
        plt.grid(True)
        plt.xlim(0, time_limit)
        plt.show()

except FileNotFoundError:
    print(f"Error: File {input_filename} not found")
except Exception as e:
    # Best-effort notebook cell: report the problem instead of raising.
    # NOTE: df_256ghz is not (re)defined when this branch runs.
    print(f"Error occurred during processing: {e}")

In [None]:
# Time alignment processing (following the method in csv_conversion.ipynb)
# Remove the initial data so that the symbol aligns to the correct time
#
# Method: fold the signal power onto one symbol period, locate the centre of
# the region where the folded power exceeds a threshold (circular mean, so a
# region that wraps around the period edge is handled), then drop the leading
# samples so that this centre ends up half a period after the new time origin.

# Calculate the square of the data for power analysis
df_256ghz['Data_Squared'] = df_256ghz['data'] ** 2

# Define folding period (assume 500 Mbps symbol rate -> 2 ns period)
period = 2e-9  # 2 ns

# Use squared data for folding analysis
data_to_fold = df_256ghz['Data_Squared'].dropna()
time_to_fold = df_256ghz.loc[data_to_fold.index, 'time']

# Calculate time modulo period
folded_time = time_to_fold % period

# Determine time resolution (mean sample spacing)
time_resolution = time_to_fold.diff().mean()
if pd.isna(time_resolution):
    time_resolution = (df_256ghz['time'].iloc[1] - df_256ghz['time'].iloc[0]) if len(df_256ghz['time']) > 1 else 1e-12

# Create bins in the range 0-2ns, one bin per sample spacing.
# The ratio is rounded rather than truncated: float noise can turn an exact
# 512 into 511.9999..., and truncation would then give 511 bins that no
# longer line up with the sample grid.
num_bins = max(1, int(round(period / time_resolution)))
bins = np.linspace(0, period, num_bins + 1)

# Create DataFrame for folding analysis
fold_df = pd.DataFrame({'time': folded_time, 'data': data_to_fold})

# Assign folded time to bins
fold_df['time_bin'] = pd.cut(fold_df['time'], bins=bins, labels=False, include_lowest=True)

# Group by bin and sum data
summed_data = fold_df.groupby('time_bin')['data'].sum()

# Create bin center time axis
bin_centers = (bins[:-1] + bins[1:]) / 2

# Reindex summed data to match bins, fill missing bins with 0
summed_data = summed_data.reindex(range(len(bin_centers)), fill_value=0)

# Normalize summed data to 0-1 range (skipped when the profile is flat)
if summed_data.max() > summed_data.min():
    summed_data = (summed_data - summed_data.min()) / (summed_data.max() - summed_data.min())

# Visualize folding result
plt.figure(figsize=(12, 6))
plt.plot(bin_centers, summed_data)
plt.xlabel('Time (s) within 2ns period')
plt.ylabel('Summed Data (Normalized)')
plt.title('Folding and Summing Data within 2ns Period')
plt.grid(True)
plt.show()

# Find the midpoint of intervals above the threshold
threshold = 0.2

# Find positions where data is above the threshold
above_threshold = summed_data > threshold

if above_threshold.any():
    # Get the time values of these points
    time_above_threshold = bin_centers[above_threshold]

    # Calculate the circular mean of the time points to handle wrap-around
    # Convert time to angle (radians)
    angles = (time_above_threshold / period) * 2 * np.pi

    # Calculate mean sine and cosine of the angles
    mean_sin = np.mean(np.sin(angles))
    mean_cos = np.mean(np.cos(angles))

    # Calculate mean angle from mean sine and cosine
    mean_angle = np.arctan2(mean_sin, mean_cos)

    # Convert mean angle back to time
    midpoint_time = (mean_angle / (2 * np.pi)) * period

    # arctan2 returns (-pi, pi]; shift negative results into [0, period]
    if midpoint_time < 0:
        midpoint_time += period

    print(f"Midpoint position of interval where data > {threshold}: {midpoint_time:.4e} s")
    print(f"Need to delay signal by {period - midpoint_time:.4e} s to align to end of period")
    print(f"Corresponds to {int((period - midpoint_time) * 256e9)} samples at 256 GHz sampling rate")

    # Visualize midpoint
    plt.figure(figsize=(12, 6))
    plt.plot(bin_centers, summed_data, label='Summed Data')
    plt.axhline(y=threshold, color='r', linestyle='--', label=f'Threshold ({threshold})')
    plt.axvline(x=midpoint_time, color='g', linestyle='-', label=f'Midpoint ({midpoint_time:.2e} s)')
    plt.xlabel('Time (s) within 2ns period')
    plt.ylabel('Summed Data (Normalized)')
    plt.title('Folding and Summing Data within 2ns Period with Midpoint Marked')
    plt.grid(True)
    plt.legend()
    plt.show()

    # Remove initial data for time alignment
    # The new origin sits half a period after the midpoint (1 ns for the
    # 2 ns period); derived from `period` so the two cannot drift apart.
    time_threshold = midpoint_time + period / 2

    # Find indices to remove
    indices_to_remove = df_256ghz['time'] < time_threshold
    num_points_to_remove = indices_to_remove.sum()

    print(f"Time threshold: {time_threshold:.4e} s")
    print(f"Number of data points to remove: {num_points_to_remove}")

    # Create trimmed dataframe
    df_256ghz_aligned = df_256ghz[~indices_to_remove].copy()

    # Reset time axis to start from 0
    df_256ghz_aligned['time'] = df_256ghz_aligned['time'] - df_256ghz_aligned['time'].min()

    # Reset index so that row labels equal row positions again
    df_256ghz_aligned.reset_index(drop=True, inplace=True)

    print(f"Original data shape: {df_256ghz.shape}")
    print(f"Aligned data shape: {df_256ghz_aligned.shape}")
    print(f"Number of data points removed: {df_256ghz.shape[0] - df_256ghz_aligned.shape[0]}")
    print("First few rows of aligned data:")
    print(df_256ghz_aligned.head())

    # Visualize aligned data
    plt.figure(figsize=(12, 6))
    time_limit = 3e-8
    mask = df_256ghz_aligned['time'] <= time_limit
    plt.plot(df_256ghz_aligned.loc[mask, 'time'], df_256ghz_aligned.loc[mask, 'data'])
    plt.xlabel('Time (s)')
    plt.ylabel('Data')
    plt.title('Aligned Data (First 30ns)')
    plt.grid(True)
    plt.xlim(0, time_limit)
    # Set x-axis ticks every 2ns
    plt.xticks(np.arange(0, time_limit + 2e-9, 2e-9))
    plt.show()

    # Update main DataFrame to use aligned data
    df_256ghz = df_256ghz_aligned.copy()
    print("\nTime alignment complete, subsequent processing will use aligned data")

else:
    print(f"No data points found above threshold {threshold}, skipping time alignment")


In [None]:
# PPM data decoding processing (refer to analyze.ipynb)
from scipy.signal import butter, filtfilt
from scipy import signal
from sklearn.cluster import KMeans

# Instantaneous power: square every sample of the waveform
df_256ghz['Data_Squared'] = df_256ghz['data'].pow(2)

# Quick look at the squared waveform (x-axis limited to 0..2e-8 s)
plt.figure(figsize=(10, 6))
plt.plot(df_256ghz['time'], df_256ghz['Data_Squared'])
plt.title('Data Squared vs Time')
plt.xlabel('Time (s)')
plt.ylabel('Data Squared')
plt.xlim(0, 2e-8)
plt.grid(True)
plt.show()

# Low-pass design parameters; the cutoff is expressed as a fraction of the
# Nyquist frequency, which is the form `butter` expects here
N = 4  # Filter order
cutoff = 4e9  # 4 GHz cutoff frequency
native_sampling_rate = 256e9  # 256 GHz sampling rate
nyq = native_sampling_rate / 2
cutoff_norm = cutoff / nyq

# Butterworth coefficients, applied forwards and backwards (filtfilt) so the
# filtered signal has no phase shift
b, a = signal.butter(N, cutoff_norm, btype='low')
df_256ghz['Data_MA'] = signal.filtfilt(b, a, df_256ghz['Data_Squared'])

# Overlay the raw power and the filtered envelope
plt.figure(figsize=(10, 6))
plt.plot(df_256ghz['time'], df_256ghz['Data_Squared'], label='Data Squared', alpha=0.7)
plt.plot(df_256ghz['time'], df_256ghz['Data_MA'], label=f'Butterworth Low-pass Filter (N={N})', linewidth=2)
plt.title('Butterworth Low-pass Filter Effect')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.xlim(0, 2e-8)
plt.grid(True)
plt.legend()
plt.show()

filtered_min = df_256ghz['Data_MA'].min()
filtered_max = df_256ghz['Data_MA'].max()
print("Filtering complete")
print(f"Filter parameters: Cutoff frequency={cutoff/1e9}G Hz, Order={N}")
print(f"Filtered data range: {filtered_min:.6f} to {filtered_max:.6f}")


In [None]:
# Pulse detection and edge identification
# A pulse is a run of filtered samples above `threshold`; its position is the
# sample index halfway between the rising and the falling threshold crossing.
threshold = 0.002
symbol_rate = 500e6  # 500 MHz

# Filtered envelope as a plain array
filtered_data = df_256ghz["Data_MA"].values

# +1 marks a below->above crossing, -1 an above->below crossing; the +1
# offset points at the first sample after the crossing
above = filtered_data > threshold
crossings = np.diff(above.astype(int))
rising_edges = np.flatnonzero(crossings == 1) + 1  # Rising edge
falling_edges = np.flatnonzero(crossings == -1) + 1  # Falling edge

if rising_edges.size == 0 or falling_edges.size == 0:
    mid_indices = np.array([])
    mid_times = np.array([])
    print("No valid rising/falling edge pairs detected")
else:
    # Pair every rising edge with the falling edge that follows it:
    # discard a leading falling edge and a trailing unmatched rising edge
    if falling_edges[0] < rising_edges[0]:
        falling_edges = falling_edges[1:]
    if len(rising_edges) > len(falling_edges):
        rising_edges = rising_edges[:-1]

    # Midpoint index of each pair (indices are non-negative, so floor == truncate)
    mid_indices = ((rising_edges + falling_edges) // 2).astype(int)
    mid_times = df_256ghz.loc[mid_indices, 'time'].values

    print(f"Detected {len(mid_times)} pulse midpoints")
    print(f"Midpoint times: {mid_times}")

# Visualize edge detection results
plt.figure(figsize=(12, 6))
plt.plot(df_256ghz['time'], df_256ghz['Data_MA'], label='Filtered Data (Data_MA)')
for marker_idx, marker_fmt, marker_label in (
    (rising_edges, 'g^', 'Rising Edge'),
    (falling_edges, 'rv', 'Falling Edge'),
    (mid_indices, 'ko', 'Midpoint'),
):
    # Skip empty sets so that no empty legend entry is drawn
    if marker_idx.size > 0:
        plt.plot(df_256ghz.loc[marker_idx, 'time'], filtered_data[marker_idx], marker_fmt, label=marker_label)
plt.axhline(y=threshold, color='r', linestyle='--', label=f'Threshold ({threshold})')
plt.title('Rising/Falling Edge and Midpoint Detection')
plt.xlabel('Time (s)')
plt.ylabel('Filtered Data')
plt.xlim(0, 2e-8)
plt.grid(True)
plt.legend()
plt.show()

print(f"Detection parameters: Threshold={threshold}, Symbol rate={symbol_rate/1e6}M Hz")


In [None]:
# K-means clustering analysis for PPM symbol classification
# Pulse midpoints are folded onto the 2 ns symbol period and grouped into
# 8 clusters; clusters are then renumbered by increasing position in the period.


def relabel_clusters_by_center(labels, centers):
    """Renumber cluster labels so that label ``k`` has the ``k``-th smallest centre.

    Args:
        labels: 1-D sequence of integer cluster ids, each indexing ``centers``.
        centers: 1-D sequence with one centre value per cluster id.

    Returns:
        Tuple ``(new_labels, sorted_centers)``: ``new_labels`` is an integer
        ``numpy`` array with the same length as ``labels``; ``sorted_centers``
        is a list of the centre values in ascending order.
    """
    centers = np.asarray(centers, dtype=float).ravel()
    # Stable sort keeps the original id order for (unlikely) equal centres
    order = np.argsort(centers, kind='stable')
    new_id_of = np.empty(order.size, dtype=int)
    new_id_of[order] = np.arange(order.size)
    new_labels = new_id_of[np.asarray(labels, dtype=int)]
    return new_labels, centers[order].tolist()


if 'mid_indices' in locals() and 'mid_times' in locals() and len(mid_times) > 0:
    # Extract list of midpoint positions
    mid_indices_list = mid_indices.tolist()
    mid_positions = mid_times.tolist()

    print("Midpoint analysis result:")
    print(f"Number of midpoints found: {len(mid_positions)}")
    print(f"Midpoint indices: {mid_indices_list[:10]}...")  # Show only first 10
    print(f"Midpoint time positions (seconds): {mid_positions[:10]}...")  # Show only first 10

    # Convert to nanoseconds for display
    mid_positions_ns = [pos * 1e9 for pos in mid_positions]
    print(f"Midpoint time positions (ns): {mid_positions_ns[:10]}...")

    # Calculate positions relative to the first midpoint
    if len(mid_positions) > 1:
        relative_positions = [(pos - mid_positions[0]) * 1e9 for pos in mid_positions]
        print(f"Relative positions to first midpoint (ns): {relative_positions[:10]}...")

        # Calculate intervals between consecutive midpoints
        intervals_ns = [pos * 1e9 for pos in np.diff(mid_positions)]
        print(f"Intervals between consecutive midpoints (ns): {intervals_ns[:10]}...")

    # Use K-means clustering to analyze midpoint positions (modulo 2ns period)
    n_clusters = 8
    if len(mid_positions_ns) > n_clusters:
        # Column vector: scikit-learn expects shape (n_samples, n_features)
        mid_positions_mod = np.array([pos % 2 for pos in mid_positions_ns]).reshape(-1, 1)

        # n_init is pinned because its default differs between scikit-learn
        # releases; a fixed value keeps the result the same across versions
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        cluster_labels_original = kmeans.fit_predict(mid_positions_mod)
        cluster_centers = kmeans.cluster_centers_.flatten()

        # K-means numbers clusters arbitrarily; renumber them by time position
        cluster_labels, sorted_centers = relabel_clusters_by_center(
            cluster_labels_original, cluster_centers
        )

        print("\n=== K-means Clustering Result (8 clusters) - Sorted by Time ===")
        for i, center in enumerate(sorted_centers):
            cluster_size = np.sum(cluster_labels == i)
            print(f"Cluster {i}: Time = {center:.4f} ns (mod 2), Points = {cluster_size}")

        # Convert cluster centers to picoseconds and display as list
        cluster_centers_ps = [int(round(center * 1000)) for center in sorted_centers]
        print(f"\nCluster centers (ps, sorted by time): {cluster_centers_ps}")

        # Print nanosecond list
        centers_ns_text = " ".join(f"{center:.4f}" for center in sorted_centers)
        print(f"\nCluster centers (ns, sorted by time): [{centers_ns_text}]")

        # Visualize clustering result
        plt.figure(figsize=(12, 8))
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']

        # Plot cluster centers (now sorted by time)
        plt.scatter(sorted_centers, range(n_clusters), color='red', marker='x', s=150,
                    linewidths=3, label='Cluster Center')

        # Plot cluster points, one horizontal row per cluster id
        for i in range(n_clusters):
            cluster_points = mid_positions_mod[cluster_labels == i]
            plt.scatter(cluster_points, np.ones(len(cluster_points)) * i,
                        color=colors[i], alpha=0.7, s=30, label=f'Cluster {i}')

        plt.xlabel('Midpoint Position (ns, mod 2)')
        plt.ylabel('Cluster ID (Sorted by Time)')
        plt.title('K-means Clustering of Midpoint Positions (8 Clusters, Sorted by Time)')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print(f"\nClustering analysis complete, variables 'cluster_labels' and 'sorted_centers' created")

    else:
        print(f"Not enough midpoints for K-means clustering (found {len(mid_positions_ns)}, need > 8)")
        cluster_labels = None

else:
    print("Please run the midpoint detection cell first to generate midpoint data")
    # Clear any labels left over from an earlier run so that later cells do
    # not build a ground truth from stale data
    cluster_labels = None


In [None]:
# Symbol classification and Ground Truth generation
# Symbol rate: 500MHz, each symbol period is 2ns, aligned to [0, 2, 4, 6...] ns

symbol_rate = 500e6  # 500 MHz
symbol_period_ns = 1e9 / symbol_rate  # 2 ns


def symbol_transition_counts(sequence, n_symbols=8):
    """Count how often each symbol is directly followed by each other symbol.

    Args:
        sequence: 1-D sequence of integer symbol values in ``[0, n_symbols)``.
        n_symbols: Size of the symbol alphabet.

    Returns:
        Float array of shape ``(n_symbols, n_symbols)``; entry ``[a, b]`` is
        the number of positions where ``a`` is immediately followed by ``b``.
        All zeros when the sequence has fewer than two elements.
    """
    seq = np.asarray(sequence, dtype=int)
    counts = np.zeros((n_symbols, n_symbols))
    if seq.size > 1:
        # add.at accumulates correctly when the same (from, to) pair repeats
        np.add.at(counts, (seq[:-1], seq[1:]), 1)
    return counts


# Get results from previous clustering analysis
if 'cluster_labels' in locals() and 'mid_positions_ns' in locals() and cluster_labels is not None:
    n_symbol_types = 8  # 8-PPM: cluster ids 0-7 are the symbol values

    # One row per detected pulse midpoint: running index plus its cluster id
    # (0-7, representing 8 different symbols).
    # NOTE(review): `index` is the detection order, not the 2 ns slot number,
    # so a missed pulse would shift every later index - confirm that this is
    # what the consumers of the CSV expect.
    symbol_values = np.asarray(cluster_labels, dtype=int)
    n_detected = min(len(mid_positions_ns), len(symbol_values))
    ground_truth_df = pd.DataFrame({
        'index': np.arange(n_detected),
        'data': symbol_values[:n_detected],
    })

    print("=== Ground Truth Symbol Classification Result ===")
    print(f"Total number of classified symbols: {len(ground_truth_df)}")
    print(f"Symbol period: {symbol_period_ns:.1f} ns")
    print("\nFirst 20 symbol classification results:")
    print(ground_truth_df.head(20))

    print("\nStatistics for each symbol type:")
    data_counts = ground_truth_df['data'].value_counts().sort_index()
    for data_val, count in data_counts.items():
        print(f"Symbol type {data_val}: {count} occurrences")

    # Visualize symbol classification
    plt.figure(figsize=(15, 10))

    # Subplot 1: Symbol sequence
    plt.subplot(2, 2, 1)
    plt.plot(ground_truth_df['index'], ground_truth_df['data'], 'o-', markersize=3, linewidth=0.5)
    plt.xlabel('Symbol Index')
    plt.ylabel('Symbol Type')
    plt.title('Ground Truth Symbol Sequence')
    plt.grid(True, alpha=0.3)
    plt.xlim(0, min(100, len(ground_truth_df)))  # Show first 100 symbols

    # Subplot 2: Symbol type distribution
    plt.subplot(2, 2, 2)
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    # Colour by symbol value so a symbol keeps its colour even when another
    # symbol type is absent from the data
    bar_colors = [colors[int(value) % len(colors)] for value in data_counts.index]
    data_counts.plot(kind='bar', color=bar_colors)
    plt.xlabel('Symbol Type')
    plt.ylabel('Occurrences')
    plt.title('Symbol Type Distribution')
    plt.xticks(rotation=0)
    plt.grid(True, alpha=0.3)

    # Subplot 3: Histogram of symbol sequence
    plt.subplot(2, 2, 3)
    # Explicit edges at k - 0.5 give one bin centred on every integer symbol
    # value; bins=8 would stretch 8 bins over the observed min..max instead
    symbol_bin_edges = np.arange(n_symbol_types + 1) - 0.5
    plt.hist(ground_truth_df['data'], bins=symbol_bin_edges, alpha=0.7, edgecolor='black', color='skyblue')
    plt.xlabel('Symbol Type')
    plt.ylabel('Frequency')
    plt.title('Symbol Type Frequency Distribution')
    plt.grid(True, alpha=0.3)

    # Subplot 4: Symbol transition matrix (simplified)
    plt.subplot(2, 2, 4)
    if len(ground_truth_df) > 1:
        transitions = symbol_transition_counts(ground_truth_df['data'].to_numpy(), n_symbols=n_symbol_types)

        plt.imshow(transitions, cmap='Blues', interpolation='nearest')
        plt.xlabel('To Symbol')
        plt.ylabel('From Symbol')
        plt.title('Symbol Transition Matrix')
        plt.colorbar()
        # Write the count into every cell of the matrix
        for i in range(n_symbol_types):
            for j in range(n_symbol_types):
                plt.text(j, i, str(int(transitions[i, j])), ha='center', va='center')

    plt.tight_layout()
    plt.show()

    # Save Ground Truth DataFrame as CSV file
    output_csv_path = '../csv/ground_truth_ppm.csv'
    ground_truth_df.to_csv(output_csv_path, index=False)
    print(f"\nGround Truth data saved to: {output_csv_path}")
    print(f"Saved DataFrame shape: {ground_truth_df.shape}")
    print(f"Column names: {list(ground_truth_df.columns)}")

    # Show first few rows of saved file for verification
    print("\nSaved file verification (first 10 rows):")
    saved_df = pd.read_csv(output_csv_path)
    print(saved_df.head(10))

else:
    print("Error: Please run the previous clustering analysis cell to generate clustering results")


In [None]:
# Check if Ground Truth data repeats with a period of 127
# Verify periodic characteristics of the PPM sequence


def period_match_percentages(sequence, period_length):
    """Compare every complete period of ``sequence`` with its first period.

    Args:
        sequence: 1-D sequence of symbol values.
        period_length: Number of symbols per period.

    Returns:
        Tuple ``(match_pct, mismatch)``: ``match_pct`` is a float array with
        one entry per complete period giving the percentage of positions equal
        to the first period (entry 0 is the first period itself, so 100);
        ``mismatch`` is a boolean array of shape
        ``(n_periods, period_length)`` that is True where a period differs
        from the first one. Both are empty when the sequence is shorter than
        one period.
    """
    seq = np.asarray(sequence)
    n_periods = seq.size // period_length
    if n_periods == 0:
        return np.empty(0), np.zeros((0, period_length), dtype=bool)
    # One row per complete period; trailing partial period is ignored
    folded = seq[:n_periods * period_length].reshape(n_periods, period_length)
    mismatch = folded != folded[0]
    matches = period_length - mismatch.sum(axis=1)
    return matches / period_length * 100, mismatch


if 'ground_truth_df' in locals() and len(ground_truth_df) > 0:

    check_period = 127  # period length under test

    # Get data sequence
    data_sequence = ground_truth_df['data'].values
    total_length = len(data_sequence)

    print(f"=== Ground Truth Periodicity Analysis ===")
    print(f"Total data length: {total_length}")
    print(f"Check period: {check_period}")

    if total_length >= check_period:
        complete_periods = total_length // check_period
        remaining_samples = total_length % check_period

        print(f"Number of complete periods: {complete_periods}")
        print(f"Number of remaining samples: {remaining_samples}")

        if complete_periods >= 2:  # Need at least two complete periods to compare

            # The first period is the reference pattern
            reference_pattern = data_sequence[:check_period]

            # Per-period match against the reference, computed once
            match_pct, difference_mask = period_match_percentages(data_sequence, check_period)
            period_matches = match_pct.tolist()
            period_differences = difference_mask.sum(axis=1).tolist()

            for period_idx, (match_percentage, n_differences) in enumerate(zip(period_matches, period_differences)):
                matches = check_period - n_differences
                print(f"Period {period_idx + 1}: Match {match_percentage:.1f}% ({matches}/{check_period}), Differences {n_differences}")

            # Overall statistics over periods 2..N only: period 1 is the
            # reference compared with itself (always 100%) and would bias
            # the average towards "periodic"
            compared_matches = period_matches[1:]
            avg_match = np.mean(compared_matches)
            min_match = np.min(compared_matches)
            max_match = np.max(compared_matches)

            print(f"\n=== Periodicity Statistics ===")
            print(f"Average match: {avg_match:.1f}%")
            print(f"Minimum match: {min_match:.1f}%")
            print(f"Maximum match: {max_match:.1f}%")

            # Determine if periodic
            is_periodic = avg_match > 99.0  # Consider periodic if match > 99%
            print(f"Is 127-periodic: {'Yes' if is_periodic else 'No'}")

            # Visualization analysis
            fig, axes = plt.subplots(2, 2, figsize=(15, 10))

            # Subplot 1: Comparison of first few periods
            ax1 = axes[0, 0]
            periods_to_show = min(4, complete_periods)
            for i in range(periods_to_show):
                period_data = data_sequence[i * check_period:(i + 1) * check_period]
                ax1.plot(range(check_period), period_data, 'o-', markersize=2,
                         label=f'Period {i+1}', alpha=0.7)

            ax1.set_xlabel('Position within period')
            ax1.set_ylabel('Symbol Value')
            ax1.set_title(f'Comparison of First {periods_to_show} Periods')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Subplot 2: Match percentage bar chart
            ax2 = axes[0, 1]
            bars = ax2.bar(range(1, complete_periods + 1), period_matches,
                           color='skyblue', alpha=0.7, edgecolor='black')
            ax2.set_xlabel('Period Number')
            ax2.set_ylabel('Match (%)')
            ax2.set_title('Match Percentage of Each Period with Reference Pattern')
            ax2.grid(True, alpha=0.3)
            ax2.set_ylim(0, 105)

            # Annotate values on bar chart
            for bar in bars:
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                         f'{height:.1f}%', ha='center', va='bottom', fontsize=8)

            # Subplot 3: Difference position heatmap (1 = differs from reference)
            ax3 = axes[1, 0]
            difference_matrix = difference_mask.astype(float)
            im = ax3.imshow(difference_matrix, cmap='Reds', aspect='auto', interpolation='nearest')
            ax3.set_xlabel('Position within period')
            ax3.set_ylabel('Period Number')
            ax3.set_title('Difference Position Heatmap (Red=Difference)')
            plt.colorbar(im, ax=ax3)

            # Subplot 4: Reference pattern visualization
            ax4 = axes[1, 1]
            ax4.plot(range(check_period), reference_pattern, 'o-', markersize=3, linewidth=1.5, color='darkblue')
            ax4.set_xlabel('Position')
            ax4.set_ylabel('Symbol Value')
            ax4.set_title('Reference Pattern (First 127 Symbols)')
            ax4.grid(True, alpha=0.3)
            ax4.set_ylim(-0.5, 7.5)

            plt.tight_layout()
            plt.show()

            # Detailed periodicity analysis
            if not is_periodic:
                print(f"\n=== Detailed Non-periodicity Analysis ===")

                # Positions (within the period) of all mismatches, skipping
                # the reference period; row-major order, period by period
                all_diff_positions = np.nonzero(difference_mask[1:])[1].tolist()

                if all_diff_positions:
                    from collections import Counter
                    diff_counter = Counter(all_diff_positions)
                    most_common_diffs = diff_counter.most_common(10)

                    print("Most common difference positions:")
                    for pos, count in most_common_diffs:
                        print(f"  Position {pos}: {count} differences")

            # Save periodicity analysis result
            period_analysis = {
                'total_length': total_length,
                'complete_periods': complete_periods,
                'remaining_samples': remaining_samples,
                'average_match_percentage': avg_match,
                'is_periodic': is_periodic,
                'period_matches': period_matches
            }

            print(f"\nPeriodicity analysis complete, result saved in variable 'period_analysis'")

        else:
            print(f"Data length insufficient, need at least 2 complete {check_period} periods for comparison")

    else:
        print(f"Data length {total_length} is less than {check_period}, cannot perform periodicity check")

    # Extra: Check other possible period lengths
    print(f"\n=== Check Other Period Lengths ===")
    possible_periods = [31, 63, 127, 255, 511]  # Common PRBS period lengths

    for period_len in possible_periods:
        if total_length >= period_len * 2:  # Need at least two periods
            # Only the first two periods are compared for this quick check
            first_period = data_sequence[:period_len]
            second_period = data_sequence[period_len:2*period_len]
            matches = np.sum(first_period == second_period)
            match_rate = (matches / period_len) * 100
            print(f"Period length {period_len}: Match {match_rate:.1f}%")
        else:
            print(f"Period length {period_len}: Not enough data")

else:
    print("Error: ground_truth_df data not found, please run the previous symbol classification cell first")
