In [None]:
from google.colab import drive
import os

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Path to the data folder on the mounted Drive
folder_path = '/content/drive/MyDrive/Data_ptcloud'

# Report whether that folder is reachable
print("Le dossier existe." if os.path.isdir(folder_path) else "Le dossier n'existe pas.")


Mounted at /content/drive
Le dossier existe.


In [None]:
import os
import pandas as pd
import random

# Directory containing the .txt files, and the folder receiving one CSV per input file
folder_path = '/content/drive/MyDrive/Data_ptcloud'
output_folder_path = 'path_to_output_folder'

# Ensure the output folder exists
os.makedirs(output_folder_path, exist_ok=True)

# Column names, in the positional order of the comma-separated fields
column_names = ['centroid_x', 'centroid_y', 'meanVelocity', 'length', 'width', 'density', 'rcsEq', 'rcsStd', 'track_id', 'label']

# Set Pandas display options to show all columns
pd.set_option('display.max_columns', None)

# List of (file_name, DataFrame) pairs
dataframes = []

# Load all DataFrames from .txt files.
# sorted() makes the processing order (and the log below) reproducible;
# os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # NOTE: zero-byte files are permanently deleted from the source folder.
    if os.path.getsize(file_path) == 0:
        os.remove(file_path)
        print(f"Deleted empty file: {file_path}")
        continue  # Skip to the next file

    # A file that contains only whitespace/newlines has a non-zero size but
    # nothing to parse; skip it instead of aborting the whole run.
    try:
        df = pd.read_csv(file_path, header=None, delimiter=',')
    except pd.errors.EmptyDataError:
        print(f"Skipped file with no parsable data: {file_path}")
        continue

    if df.shape[1] > len(column_names):
        print(f"Warning: {file_name} has {df.shape[1]} columns; keeping the first {len(column_names)}")

    # Force exactly len(column_names) positional columns: missing ones are
    # added as NaN, surplus ones are dropped. This avoids the length-mismatch
    # error that assigning the names would otherwise raise on wide files.
    df = df.reindex(columns=range(len(column_names)))
    df.columns = column_names

    dataframes.append((file_name, df))

# The list is intentionally not shuffled: every file is written to its own
# CSV, so shuffling the list would only change the order of the log lines.

# Save each DataFrame to a new CSV file
for file_name, df in dataframes:
    # splitext swaps only the trailing extension (str.replace would also
    # rewrite a '.txt' occurring in the middle of the name).
    output_file_path = os.path.join(output_folder_path, f"processed_{os.path.splitext(file_name)[0]}.csv")
    df.to_csv(output_file_path, index=False)

    print(f"Processed and saved {file_name} to {output_file_path}")


Processed and saved 01361.txt to path_to_output_folder/processed_01361.csv
Processed and saved 00778.txt to path_to_output_folder/processed_00778.csv
Processed and saved 00565.txt to path_to_output_folder/processed_00565.csv
Processed and saved 00884.txt to path_to_output_folder/processed_00884.csv
Processed and saved 01678.txt to path_to_output_folder/processed_01678.csv
Processed and saved 00783.txt to path_to_output_folder/processed_00783.csv
Processed and saved 01095.txt to path_to_output_folder/processed_01095.csv
Processed and saved 00498.txt to path_to_output_folder/processed_00498.csv
Processed and saved 00480.txt to path_to_output_folder/processed_00480.csv
Processed and saved 00214.txt to path_to_output_folder/processed_00214.csv
Processed and saved 01948.txt to path_to_output_folder/processed_01948.csv
Processed and saved 00888.txt to path_to_output_folder/processed_00888.csv
Processed and saved 00916.txt to path_to_output_folder/processed_00916.csv
Processed and saved 00148

In [None]:
import os
import pandas as pd

# Directory containing the processed CSV files, and the combined output file
input_folder_path = 'path_to_output_folder'
combined_output_file_path = 'combined_data.csv'

# Column names expected in every processed CSV
column_names = ['centroid_x', 'centroid_y', 'meanVelocity', 'length', 'width', 'density', 'rcsEq', 'rcsStd', 'track_id', 'label']

# One DataFrame per processed CSV file
dataframes = []

# sorted() fixes the row order of the combined file; os.listdir() alone
# returns entries in arbitrary order, which made later seeded sampling
# non-reproducible.
for file_name in sorted(os.listdir(input_folder_path)):
    if file_name.endswith('.csv'):  # Process only .csv files
        file_path = os.path.join(input_folder_path, file_name)

        df = pd.read_csv(file_path)

        # Enforce the expected columns/order (absent columns become NaN)
        # and remember which file each row came from.
        df = df.reindex(columns=column_names)
        df['filename'] = file_name
        dataframes.append(df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not dataframes:
    raise ValueError(f"No .csv files found in {input_folder_path}")

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_output_file_path, index=False)

print(f"Combined data saved to {combined_output_file_path}")


Combined data saved to combined_data.csv


In [None]:
import pandas as pd

# Load the combined dataset
combined_file_path = '/content/combined_data.csv'
combined_df = pd.read_csv(combined_file_path)

# Check if DataFrame is empty
if combined_df.empty:
    print("The DataFrame is empty.")
else:
    # Show which columns were loaded
    print("Columns in DataFrame:", combined_df.columns)

    # Count occurrences of each label
    if 'label' in combined_df.columns:
        print("Counts for each label:")
        label_counts = combined_df['label'].value_counts()
        # The newline keeps the header from being glued to the first line of
        # the Series repr (previously printed as "...sampleslabel").
        print(f"Class: number of samples\n{label_counts}")
    else:
        print("Label column not found.")



Columns in DataFrame: Index(['centroid_x', 'centroid_y', 'meanVelocity', 'length', 'width',
       'density', 'rcsEq', 'rcsStd', 'track_id', 'label', 'filename'],
      dtype='object')
Counts for each label:
Class: number of sampleslabel
5      2436
100    1451
2      1255
0       430
1       213
6       160
4       116
3        88
Name: count, dtype: int64


In [None]:
import numpy as np
import pandas as pd

def add_jitter(data, noise_level=0.01, rng=None):
    """Return `data` plus standard-normal noise scaled by `noise_level`.

    Parameters:
        data: Array-like object exposing `.shape` and supporting `+` with a
            NumPy array of that shape (e.g. a NumPy array).
        noise_level (float): Multiplier applied to the standard-normal draws.
        rng (np.random.Generator, optional): Generator used for the draws.
            When omitted, NumPy's global random state is used, so calling
            `np.random.seed(...)` beforehand still controls the result.

    Returns:
        The result of `data + noise`; `data` itself is not modified.
    """
    if rng is None:
        noise = np.random.randn(*data.shape)
    else:
        noise = rng.standard_normal(data.shape)
    return data + noise_level * noise

# Load combined data
combined_file_path = '/content/combined_data.csv'
combined_df = pd.read_csv(combined_file_path)

# Ensure 'track_id' is treated as an integer
combined_df['track_id'] = combined_df['track_id'].astype(int)

# Exclude noise label (label == 100)
combined_df = combined_df[combined_df['label'] != 100]
if combined_df.empty:
    raise ValueError(f"No non-noise rows found in {combined_file_path}")

# Seed NumPy's global RNG so the jitter and the row sampling below are
# reproducible (pandas `sample` without `random_state` draws from it too).
np.random.seed(42)

# Calculate the count of each label before jittering
original_counts = combined_df['label'].value_counts()
print(f"Original counts before jittering:\n{original_counts}")

# Every minority class is grown to the size of the largest class. The target
# is derived from the counts computed in this cell rather than from a
# variable left behind by another cell.
target_count = int(original_counts.max())
minor_classes = original_counts[original_counts < target_count].index.tolist()

# Numeric feature columns that receive noise; identifiers and the label are
# left untouched, and non-numeric columns (e.g. 'filename') are skipped.
jitter_columns = [
    column for column in combined_df.columns
    if column not in ('track_id', 'label')
    and pd.api.types.is_numeric_dtype(combined_df[column])
]

# Offset applied to 'track_id' per augmentation round. It is at least 1000
# (the previous fixed offset) and larger than any existing id, so synthetic
# ids collide neither with the originals nor with other rounds.
track_id_stride = max(1000, int(combined_df['track_id'].max()) + 1)

# Track the number of jittered samples generated
jittered_counts = {label: 0 for label in minor_classes}

# Collect the pieces and concatenate once at the end (concatenating inside
# the loop re-copies the growing frame on every round).
augmented_parts = [combined_df]

for minor_class in minor_classes:
    class_data = combined_df[combined_df['label'] == minor_class]
    num_samples_needed = target_count - len(class_data)
    round_index = 0

    while num_samples_needed > 0:
        round_index += 1

        # Fresh noisy copy of the whole class for this round
        jittered_samples = class_data.copy()
        for column in jitter_columns:
            jittered_samples[column] = add_jitter(class_data[column].to_numpy(), noise_level=0.01)

        # Shift 'track_id' so each round gets its own id range
        jittered_samples['track_id'] += round_index * track_id_stride

        # Keep only as many rows as are still missing. num_to_add never
        # exceeds the class size, so rows are drawn WITHOUT replacement to
        # avoid emitting exact duplicates of a jittered row.
        num_to_add = min(num_samples_needed, len(class_data))
        jittered_samples = jittered_samples.sample(n=num_to_add, replace=False)

        augmented_parts.append(jittered_samples)

        # Update the count of jittered samples
        jittered_counts[minor_class] += num_to_add
        num_samples_needed -= num_to_add

oversampled_data = pd.concat(augmented_parts, ignore_index=True)

# Shuffle the rows
oversampled_data = oversampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced data to a new file
balanced_file_path = '/content/balanced_data.csv'
oversampled_data.to_csv(balanced_file_path, index=False)
print(f"Balanced data with jittering saved to {balanced_file_path}")

# Print jittered sample counts for each label
print("Jittered sample counts for each label:")
for label, count in jittered_counts.items():
    print(f"Label {label}: {count} samples")


Original counts before jittering:
label
5    2436
2    1255
0     430
1     213
6     160
4     116
3      88
Name: count, dtype: int64
Balanced data with jittering saved to /content/balanced_data.csv
Jittered sample counts for each label:
Label 2: 1181 samples
Label 0: 2006 samples
Label 1: 2223 samples
Label 6: 2276 samples
Label 4: 2320 samples
Label 3: 2348 samples


In [None]:
# Re-read the balanced dataset from disk
balanced_file_path = '/content/balanced_data.csv'
balanced_df = pd.read_csv(balanced_file_path)

# Per-label totals, printed under a header line
label_counts = balanced_df['label'].value_counts()
print("Total number of samples for each label after jittering:", label_counts, sep="\n")


Total number of samples for each label after jittering:
label
6    2436
3    2436
1    2436
4    2436
5    2436
2    2436
0    2436
Name: count, dtype: int64


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

def plot_histograms_for_features(original_df, jittered_df, output_path):
    """
    Save one side-by-side histogram figure per numeric column.

    For every column of `original_df` with a numeric dtype, a two-panel
    figure is drawn (left: values from `original_df`, right: values of the
    same column in `jittered_df`) and written to
    `<output_path>/<column>_histograms.png`.

    Parameters:
        original_df (pd.DataFrame): Dataset before jittering; its columns
            decide which features are plotted.
        jittered_df (pd.DataFrame): Dataset after jittering; must contain
            every numeric column of `original_df`.
        output_path (str): Directory for the PNG files (created if missing).
    """
    os.makedirs(output_path, exist_ok=True)

    for feature in original_df.columns:
        # Non-numeric columns are skipped
        if not pd.api.types.is_numeric_dtype(original_df[feature]):
            continue

        fig, axes = plt.subplots(1, 2, figsize=(12, 6))
        panels = (
            (original_df, f'Avant Jittering - {feature}'),
            (jittered_df, f'Après Jittering - {feature}'),
        )
        for ax, (frame, title) in zip(axes, panels):
            ax.hist(frame[feature], bins=30, edgecolor='black', alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel('Valeur')
            ax.set_ylabel('Fréquence')

        fig.tight_layout()

        plt_path = os.path.join(output_path, f'{feature}_histograms.png')
        fig.savefig(plt_path)
        # Release the figure so memory does not grow across features
        plt.close(fig)
        print(f"Histogramme pour {feature} sauvegardé à {plt_path}")

# Input datasets and the folder that receives the figures
original_file_path = '/content/combined_data.csv'
balanced_file_path = '/content/balanced_data.csv'
output_histogram_path = '/content/histograms'

# Read both datasets
original_df, df_balanced = (
    pd.read_csv(csv_path) for csv_path in (original_file_path, balanced_file_path)
)

# Write one before/after histogram figure per numeric column
plot_histograms_for_features(original_df, df_balanced, output_histogram_path)


Histogramme pour centroid_x sauvegardé à /content/histograms/centroid_x_histograms.png
Histogramme pour centroid_y sauvegardé à /content/histograms/centroid_y_histograms.png
Histogramme pour meanVelocity sauvegardé à /content/histograms/meanVelocity_histograms.png
Histogramme pour length sauvegardé à /content/histograms/length_histograms.png
Histogramme pour width sauvegardé à /content/histograms/width_histograms.png
Histogramme pour density sauvegardé à /content/histograms/density_histograms.png
Histogramme pour rcsEq sauvegardé à /content/histograms/rcsEq_histograms.png
Histogramme pour rcsStd sauvegardé à /content/histograms/rcsStd_histograms.png
Histogramme pour track_id sauvegardé à /content/histograms/track_id_histograms.png
Histogramme pour label sauvegardé à /content/histograms/label_histograms.png


In [None]:
import os
import pandas as pd
import random

# Define folder paths
folder_path = '/content/drive/MyDrive/Testing_data'  # Directory containing the .txt files
output_folder_path = '/content/original_data_labels'  # Folder to save processed CSV files
combined_output_file_path = 'TestingData_data.csv'  # Output for combined CSV file

# Create the output folder if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)

# Column names, in the positional order of the comma-separated fields
column_names = ['centroid_x', 'centroid_y', 'meanVelocity', 'length', 'width', 'density', 'rcsEq', 'rcsStd', 'track_id', 'label']

# Set Pandas display option to show all columns
pd.set_option('display.max_columns', None)

# List of (file_name, DataFrame) pairs
dataframes = []

# Load all .txt files from the folder.
# sorted() makes the processing order, the log and the row order of the
# combined file reproducible; os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # NOTE: zero-byte files are permanently deleted from the source folder.
    if os.path.getsize(file_path) == 0:
        os.remove(file_path)
        print(f"Deleted empty file: {file_path}")
        continue

    # A file that contains only whitespace/newlines has a non-zero size but
    # nothing to parse; skip it instead of aborting the whole run.
    try:
        df = pd.read_csv(file_path, header=None, delimiter=',')
    except pd.errors.EmptyDataError:
        print(f"Skipped file with no parsable data: {file_path}")
        continue

    if df.shape[1] > len(column_names):
        print(f"Warning: {file_name} has {df.shape[1]} columns; keeping the first {len(column_names)}")

    # Force exactly len(column_names) positional columns: missing ones are
    # added as NaN, surplus ones are dropped. This avoids the length-mismatch
    # error that assigning the names would otherwise raise on wide files.
    df = df.reindex(columns=range(len(column_names)))
    df.columns = column_names

    dataframes.append((file_name, df))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not dataframes:
    raise ValueError(f"No non-empty .txt files found in {folder_path}")

# The list is intentionally not shuffled: every file is written to its own
# CSV, so shuffling would only make the output order non-reproducible.

# Save each DataFrame to its own CSV and, in the same pass, collect the frames
# for the combined file. Combining the frames processed in THIS run (rather
# than re-listing the output folder) keeps stale CSVs left over from earlier
# runs out of the combined dataset.
combined_dataframes = []

for file_name, df in dataframes:
    # splitext swaps only the trailing extension (str.replace would also
    # rewrite a '.txt' occurring in the middle of the name).
    processed_name = f"processed_{os.path.splitext(file_name)[0]}.csv"
    output_file_path = os.path.join(output_folder_path, processed_name)
    df.to_csv(output_file_path, index=False)
    print(f"Processed and saved {file_name} to {output_file_path}")

    # Record the processed file each row came from
    combined_dataframes.append(df.assign(filename=processed_name))

# Concatenate all individual DataFrames into one
combined_df = pd.concat(combined_dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_output_file_path, index=False)
print(f"Combined data saved to {combined_output_file_path}")


Processed and saved 07345.txt to /content/original_data_labels/processed_07345.csv
Processed and saved 09880.txt to /content/original_data_labels/processed_09880.csv
Processed and saved 07313.txt to /content/original_data_labels/processed_07313.csv
Processed and saved 07306.txt to /content/original_data_labels/processed_07306.csv
Processed and saved 09906.txt to /content/original_data_labels/processed_09906.csv
Processed and saved 09903.txt to /content/original_data_labels/processed_09903.csv
Processed and saved 09752.txt to /content/original_data_labels/processed_09752.csv
Processed and saved 09800.txt to /content/original_data_labels/processed_09800.csv
Processed and saved 09911.txt to /content/original_data_labels/processed_09911.csv
Processed and saved 09777.txt to /content/original_data_labels/processed_09777.csv
Processed and saved 09853.txt to /content/original_data_labels/processed_09853.csv
Processed and saved 07309.txt to /content/original_data_labels/processed_07309.csv
Proc

In [None]:
import pandas as pd

# Read the combined testing data
combined_file_path = '/content/TestingData_data.csv'
combined_df = pd.read_csv(combined_file_path)

# Cast 'track_id' to int, then drop the rows carrying the noise label (100)
combined_df['track_id'] = combined_df['track_id'].astype(int)
combined_df = combined_df[combined_df['label'] != 100]

# Number of rows every label ends up with
target_sample_size = 300

# One resampled frame per label, in order of first appearance of the label
resampled_data = []

for label in combined_df['label'].unique():
    label_data = combined_df[combined_df['label'] == label]
    available = len(label_data)
    if available != target_sample_size:
        # Too many rows: draw a subset. Too few: draw with replacement.
        label_data = label_data.sample(
            n=target_sample_size,
            replace=available < target_sample_size,
            random_state=42,
        )
    resampled_data.append(label_data)

# Stack the per-label frames, then shuffle the rows with a fixed seed
balanced_df = (
    pd.concat(resampled_data, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the balanced dataset to disk
balanced_file_path = '/content/original_data_labels11.csv'
balanced_df.to_csv(balanced_file_path, index=False)
print(f"Balanced data with each label having {target_sample_size} samples saved to {balanced_file_path}")

# Report the per-label row counts of the balanced dataset
print("Sample counts for each label in the balanced dataset:")
print(balanced_df['label'].value_counts())


Balanced data with each label having 300 samples saved to /content/original_data_labels11.csv
Sample counts for each label in the balanced dataset:
label
4    300
0    300
2    300
5    300
3    300
Name: count, dtype: int64
