# Import Libraries

In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import pandas as pd 

# Data Ingestion from Kaggle

In [3]:
# Create the Kaggle API client and authenticate it before any dataset call.
# NOTE(review): authenticate() presumably picks up locally stored Kaggle
# credentials -- confirm against the Kaggle API documentation.
api = KaggleApi()
api.authenticate()

# Kaggle dataset identifier ("owner/dataset-name") and the local download folder
kaggle_dataset = 'fabriciotorquato/eeg-data-from-hands-movement'
output_dir = './kaggle2_downloads'

# Create output directory if it doesn't exist (exist_ok=True keeps re-runs from failing)
os.makedirs(output_dir, exist_ok=True)

# Download the complete dataset into output_dir and unzip it there.
# NOTE(review): force=True presumably re-downloads on every run even when the
# files are already present -- confirm, and consider dropping it so that
# "Restart & Run All" stays cheap.
print("Downloading and unzipping all files from the dataset...")
api.dataset_download_files(kaggle_dataset, path=output_dir, force=True, unzip=True)
print("All files downloaded and unzipped to:", output_dir)

Downloading and unzipping all files from the dataset...
Dataset URL: https://www.kaggle.com/datasets/fabriciotorquato/eeg-data-from-hands-movement
All files downloaded and unzipped to: ./kaggle2_downloads


# Read dataset


In [3]:
# Load user_b.csv and print a structural summary of the frame.
# NOTE(review): this absolute path points at the "kaggle_downloads" folder, not
# at the "./kaggle2_downloads" folder the download cell writes to, and it is
# specific to one machine -- confirm which copy of the data is intended.
b = pd.read_csv("/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_b.csv")
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Columns: 1024 entries, Class to AF3.77
dtypes: float64(1024)
memory usage: 22.5 MB


In [4]:
# Count the missing values in every column of the DataFrame
b.isna().sum()

Class        0
AF3          0
AF3.1        0
AF3.2        0
AF3.3        0
          ... 
AF3.73    2880
AF3.74    2880
AF3.75    2880
AF3.76    2880
AF3.77    2880
Length: 1024, dtype: int64

In [5]:
# Keep only the columns that hold at least one non-missing value
b = b.dropna(how='all', axis='columns')
print('Columns after removing those with all missing values:', b.columns)

Columns after removing those with all missing values: Index(['Class', 'AF3', 'AF3.1', 'AF3.2', 'AF3.3', 'AF3.4', 'AF3.5', 'AF3.6',
       'AF3.7', 'F7',
       ...
       'F8.6', 'F8.7', 'AF4', 'AF4.1', 'AF4.2', 'AF4.3', 'AF4.4', 'AF4.5',
       'AF4.6', 'AF4.7'],
      dtype='object', length=113)


In [6]:
# Write the current frame `b` to a CSV in the working directory;
# index=False leaves the row index out of the file.
b.to_csv("user_b_cleaned.csv", index=False)

In [8]:
# Read the cleaned CSV back from disk and display its (rows, columns) shape
# as a round-trip check of the file written above.
b_clean = pd.read_csv("user_b_cleaned.csv")
b_clean.shape

(2880, 113)

In [9]:
# Load user_b.csv from the "kaggle2_downloads/Dataset" folder and display its
# shape -- presumably to compare it with the cleaned frame; confirm the intent.
# NOTE(review): absolute, machine-specific path; a path relative to the
# download folder would make the notebook portable.
b2 = pd.read_csv("/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle2_downloads/Dataset/user_b.csv")
b2.shape

(2880, 113)

# Preprocess Data

In [10]:
import pandas as pd
import numpy as np

# Step 1: Remove columns with missing values
def preprocess_dataframe(df):
    """
    Return a copy of ``df`` without the columns that contain no data at all.

    A column is removed only when every one of its values is missing; a column
    with at least one value is kept as it is. The input frame is not modified.

    Raises:
        ValueError: if there is no 'Class' column once the all-missing columns
            have been removed.
    """
    # Work on a copy so the caller's DataFrame stays untouched
    cleaned = df.copy().dropna(axis=1, how='all')

    # The label column must still be present after the clean-up
    if 'Class' in cleaned.columns:
        return cleaned

    raise ValueError("Column 'Class' does not exist in the data after handling missing values")

# Step 2: Rename columns
def transform_dataframe(df):
    """
    Rename the feature columns to "<electrode> <attribute>" labels.

    Every column other than 'Class' is treated as a feature column. The
    feature columns are consumed in their existing order, in consecutive
    groups of ``len(attributes)`` (8) columns: the first group is assigned to
    the first entry of ``electrodes``, the second group to the second entry,
    and so on. Inside a group the columns are relabelled with the entries of
    ``attributes``. 'Class' is always the first column of the result.

    Groups are selected by the position of the feature columns, so the result
    does not depend on where 'Class' sits in ``df`` and duplicate feature
    labels cannot confuse the selection. ``df`` itself is not modified.

    A warning is printed when the number of feature columns differs from
    ``len(electrodes) * len(attributes)``. Feature columns beyond the last
    electrode, or an incomplete trailing group, are left out of the result.

    Args:
        df: DataFrame holding a 'Class' column plus the feature columns.

    Returns:
        A new DataFrame with 'Class' followed by the renamed feature columns.

    Raises:
        KeyError: if ``df`` has no 'Class' column.
    """
    # List of electrodes, in the order their column groups appear in the data
    electrodes = [
        'AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2',
        'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4'
    ]

    # Attributes for each electrode
    attributes = ['delta std', 'delta m', 'theta std', 'theta m',
                  'alpha std', 'alpha m', 'beta std', 'beta m']
    group_size = len(attributes)

    # Positions (not labels) of every feature column, in their original order.
    # Working with positions keeps 'Class' out of the groups wherever it is.
    feature_positions = [
        position for position, col in enumerate(df.columns) if col != 'Class'
    ]

    # Check if the number of columns matches expectation
    expected_electrode_columns = len(electrodes) * group_size
    if len(feature_positions) != expected_electrode_columns:
        print(f"Warning: Number of columns after processing ({len(feature_positions)}) "
              f"does not match expected ({expected_electrode_columns})")
        print("Continue processing with the current number of columns...")

    # Collect the pieces and concatenate once at the end instead of growing a
    # DataFrame inside the loop.
    pieces = [df[['Class']]]

    for electrode_index, electrode in enumerate(electrodes):
        start = electrode_index * group_size
        if start >= len(feature_positions):
            # All feature columns have been consumed
            break

        group = feature_positions[start:start + group_size]
        if len(group) < group_size:
            print(f"Warning: Not enough columns for electrode {electrode}")
            break

        electrode_data = df.iloc[:, group]
        electrode_data.columns = [f"{electrode} {attr}" for attr in attributes]
        pieces.append(electrode_data)

    return pd.concat(pieces, axis=1)

# Main function to process all steps
def process_eeg_data(input_df):
    """
    Run the two-step EEG clean-up on ``input_df`` and return the result.

    Step 1 hands the frame to ``preprocess_dataframe`` and step 2 hands that
    result to ``transform_dataframe``. The column count after each step is
    printed.
    """
    # Step 1: drop the columns that carry no data
    print("Step 1: Remove columns with missing values...")
    without_empty_columns = preprocess_dataframe(input_df)
    print(f"Number of columns after handling missing values: {without_empty_columns.shape[1]}")

    # Step 2: relabel the remaining feature columns
    print("Step 2: Rename columns...")
    renamed = transform_dataframe(without_empty_columns)
    print(f"Final number of columns: {renamed.shape[1]}")

    return renamed

# Usage:
# df = pd.read_csv('your_data.csv')
# result_df = process_eeg_data(df)
# result_df.to_csv('processed_data.csv', index=False)

# Or if you want to see information about the data:
# print("Original DataFrame info:")
# print(f"Number of columns: {len(df.columns)}")
# print(f"Columns: {list(df.columns)}")
# 
# result_df = process_eeg_data(df)
# print("\nDataFrame info after processing:")
# print(f"Number of columns: {len(result_df.columns)}")
# print(f"First 5 columns: {list(result_df.columns[:5])}")

In [13]:
# Load user_a.csv and run the preprocessing pipeline on it.
# NOTE(review): `b` held user_b.csv in the earlier cells and is overwritten
# here with user_a.csv; the path is absolute and machine-specific.
b = pd.read_csv("/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_a.csv")
b_cleaned = process_eeg_data(b)

# Summary of the frame as loaded
print("Original DataFrame info:")
print(f"Number of columns: {b.shape[1]}")
print(f"Columns: {b.columns.tolist()}")
print("Shape of original DataFrame:", b.shape)

# Summary of the frame after processing
print("\nDataFrame info after processing:")
print(f"Number of columns: {b_cleaned.shape[1]}")
print(f"First 5 columns: {b_cleaned.columns[:5].tolist()}")
print("Shape of processed DataFrame:", b_cleaned.shape)

Step 1: Remove columns with missing values...
Number of columns after handling missing values: 113
Step 2: Rename columns...
Final number of columns: 113
Original DataFrame info:
Number of columns: 113
Columns: ['Class', 'AF3', 'AF3.1', 'AF3.2', 'AF3.3', 'AF3.4', 'AF3.5', 'AF3.6', 'AF3.7', 'F7', 'F7.1', 'F7.2', 'F7.3', 'F7.4', 'F7.5', 'F7.6', 'F7.7', 'F3', 'F3.1', 'F3.2', 'F3.3', 'F3.4', 'F3.5', 'F3.6', 'F3.7', 'FC5', 'FC5.1', 'FC5.2', 'FC5.3', 'FC5.4', 'FC5.5', 'FC5.6', 'FC5.7', 'T7', 'T7.1', 'T7.2', 'T7.3', 'T7.4', 'T7.5', 'T7.6', 'T7.7', 'P7', 'P7.1', 'P7.2', 'P7.3', 'P7.4', 'P7.5', 'P7.6', 'P7.7', 'O1', 'O1.1', 'O1.2', 'O1.3', 'O1.4', 'O1.5', 'O1.6', 'O1.7', 'O2', 'O2.1', 'O2.2', 'O2.3', 'O2.4', 'O2.5', 'O2.6', 'O2.7', 'P8', 'P8.1', 'P8.2', 'P8.3', 'P8.4', 'P8.5', 'P8.6', 'P8.7', 'T8', 'T8.1', 'T8.2', 'T8.3', 'T8.4', 'T8.5', 'T8.6', 'T8.7', 'FC6', 'FC6.1', 'FC6.2', 'FC6.3', 'FC6.4', 'FC6.5', 'FC6.6', 'FC6.7', 'F4', 'F4.1', 'F4.2', 'F4.3', 'F4.4', 'F4.5', 'F4.6', 'F4.7', 'F8', 'F8.1

In [None]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path

# Electrode labels, in the order their column groups are consumed
electrodes = 'AF3 F7 F3 FC5 T7 P7 O1 O2 P8 T8 FC6 F4 F8 AF4'.split()

# Attribute labels applied to each electrode's columns: for every band a
# "std" entry followed by an "m" entry
attributes = [
    f"{band} {stat}"
    for band in ('delta', 'theta', 'alpha', 'beta')
    for stat in ('std', 'm')
]

def preprocess_dataframe(df):
    """
    Drop the columns of ``df`` whose values are all missing.

    The work is done on a copy, so the frame passed in is left unchanged.
    Columns with at least one value are kept.

    Note: this redefines ``preprocess_dataframe`` from the earlier
    preprocessing cell; once this cell runs, this version is the one in use.

    Raises:
        ValueError: if the 'Class' column is absent after the all-missing
            columns have been dropped.
    """
    # Copy first, then discard the columns that carry no data
    kept = df.copy().dropna(axis=1, how='all')

    # Return early when the label column survived the clean-up
    if 'Class' in kept.columns:
        return kept

    raise ValueError("'Class' column does not exist in data after missing values processing")

def transform_dataframe(df, electrode_names=None, attribute_names=None):
    """
    Transform column names to the "<electrode> <attribute>" structure.

    Every column other than 'Class' is treated as a feature column. The
    feature columns are consumed in their existing order, in consecutive
    groups of ``len(attribute_names)`` columns, one group per electrode in
    the order of ``electrode_names``. 'Class' is always the first column of
    the result.

    Groups are selected by the position of the feature columns, so the result
    does not depend on where 'Class' sits in ``df`` and duplicate feature
    labels cannot confuse the selection. ``df`` itself is not modified.

    A warning is printed when the number of feature columns differs from
    ``len(electrode_names) * len(attribute_names)``. Feature columns beyond
    the last electrode, or an incomplete trailing group, are left out of the
    result.

    Note: this redefines ``transform_dataframe`` from the earlier
    preprocessing cell; once this cell runs, this version is the one in use.

    Args:
        df: DataFrame holding a 'Class' column plus the feature columns.
        electrode_names: optional sequence of electrode labels. Defaults to
            the module-level ``electrodes`` list.
        attribute_names: optional sequence of per-electrode attribute labels.
            Defaults to the module-level ``attributes`` list.

    Returns:
        A new DataFrame with 'Class' followed by the renamed feature columns.

    Raises:
        KeyError: if ``df`` has no 'Class' column.
    """
    # Fall back to the notebook-level configuration when nothing is passed in
    if electrode_names is None:
        electrode_names = electrodes
    if attribute_names is None:
        attribute_names = attributes
    group_size = len(attribute_names)

    # Positions (not labels) of every feature column, in their original order.
    # Working with positions keeps 'Class' out of the groups wherever it is.
    feature_positions = [
        position for position, col in enumerate(df.columns) if col != 'Class'
    ]

    # Check if column count matches expected
    expected_electrode_columns = len(electrode_names) * group_size
    if len(feature_positions) != expected_electrode_columns:
        print(f"Warning: Number of columns after processing ({len(feature_positions)}) "
              f"does not match expected ({expected_electrode_columns})")
        print("Continuing processing with current column count...")

    # Collect the pieces and concatenate once at the end instead of growing a
    # DataFrame inside the loop.
    pieces = [df[['Class']]]

    for electrode_index, electrode in enumerate(electrode_names):
        start = electrode_index * group_size
        if start >= len(feature_positions):
            # All feature columns have been consumed
            break

        group = feature_positions[start:start + group_size]
        if len(group) < group_size:
            print(f"Warning: Not enough columns for electrode {electrode}")
            break

        electrode_data = df.iloc[:, group]
        electrode_data.columns = [f"{electrode} {attr}" for attr in attribute_names]
        pieces.append(electrode_data)

    return pd.concat(pieces, axis=1)

def process_eeg_data(input_df, file_name=""):
    """
    Run the two-step EEG clean-up on ``input_df`` and return the result.

    Step 1 hands the frame to ``preprocess_dataframe`` and step 2 hands that
    result to ``transform_dataframe``. The column count after each step is
    printed. When ``file_name`` is non-empty it is printed first, purely as a
    progress label.

    Note: this redefines ``process_eeg_data`` from the earlier preprocessing
    cell; once this cell runs, this version is the one in use.
    """
    # Announce the file only when a name was supplied
    if file_name:
        print(f"Processing file: {file_name}")

    # Step 1: drop the columns that carry no data
    print("Step 1: Removing columns with missing values...")
    without_empty_columns = preprocess_dataframe(input_df)
    print(f"Columns after missing values processing: {without_empty_columns.shape[1]}")

    # Step 2: relabel the remaining feature columns
    print("Step 2: Transforming column names...")
    renamed = transform_dataframe(without_empty_columns)
    print(f"Final number of columns: {renamed.shape[1]}")

    return renamed

def process_multiple_files(file_patterns, output_folder="processed_data"):
    """
    Process every CSV file matched by ``file_patterns`` and save the results.

    Each pattern is expanded with ``glob.glob``. Every matching file is read
    with pandas, passed through ``process_eeg_data`` and written to
    ``output_folder`` under its original file name. Files are handled one
    after another; a failure in one file is recorded and does not stop the
    remaining files.

    Only the base name of an input is used for its output file, so two inputs
    that share a file name in different directories overwrite each other in
    ``output_folder``.

    Args:
        file_patterns: a list of paths or glob patterns. A single string is
            accepted and treated as one pattern.
        output_folder: directory for the processed CSV files. It is created,
            including missing parent directories, when it does not exist.

    Returns:
        dict keyed by input file path (or by the pattern itself when the
        pattern matched no file). Successful entries hold 'status',
        'output_path', 'original_columns' and 'processed_columns'; failed
        entries hold 'status' and 'error_message'.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(file_patterns, str):
        file_patterns = [file_patterns]

    # parents=True so that a nested output folder can be created in one call
    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    for file_pattern in file_patterns:
        # glob returns matches in arbitrary order; sort for reproducible runs
        file_paths = sorted(glob.glob(file_pattern))

        if not file_paths:
            # Report the miss instead of skipping the pattern silently
            message = f"No files matched pattern: {file_pattern}"
            print(f"Warning: {message}")
            results[file_pattern] = {
                'status': 'error',
                'error_message': message
            }
            continue

        for file_path in file_paths:
            if file_path in results:
                # Already handled through an earlier pattern
                continue

            try:
                print(f"\n{'='*50}")
                print(f"Processing: {file_path}")

                # Read CSV file
                df = pd.read_csv(file_path)
                print(f"Original columns: {len(df.columns)}")

                # Process the data
                processed_df = process_eeg_data(df, Path(file_path).name)

                # Save processed file under the input's base name
                output_path = output_dir / Path(file_path).name
                processed_df.to_csv(output_path, index=False)

                results[file_path] = {
                    'status': 'success',
                    'output_path': str(output_path),
                    'original_columns': len(df.columns),
                    'processed_columns': len(processed_df.columns)
                }

                print(f"Successfully processed and saved to: {output_path}")

            except Exception as e:
                # Deliberately broad: one bad file must not abort the batch
                print(f"Error processing {file_path}: {str(e)}")
                results[file_path] = {
                    'status': 'error',
                    'error_message': str(e)
                }

    return results

# Example usage
if __name__ == "__main__":
    # Input files to process; every entry may also be a glob pattern (wildcards allowed)
    file_patterns = [
        "/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_a.csv",
        "/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_b.csv",
        "/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_c.csv",
        "/home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_d.csv"
    ]

    # Alternative: use wildcard pattern to match multiple files
    # file_patterns = ["data_*.csv"]  # This will process all files starting with "data_"

    # Run the batch; the processed CSV files go to the "cleaned_data" folder
    processing_results = process_multiple_files(file_patterns, output_folder="cleaned_data")

    # Summary header
    banner = '=' * 50
    print(f"\n{banner}")
    print("PROCESSING SUMMARY:")
    print(banner)

    # One line per input file: failures first as a guard, successes otherwise
    for file_path, result in processing_results.items():
        if result['status'] != 'success':
            print(f"{Path(file_path).name}: ERROR - {result['error_message']}")
            continue
        print(f"{Path(file_path).name}: SUCCESS "
              f"({result['original_columns']} → {result['processed_columns']} columns)")


Processing: /home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_a.csv
Original columns: 113
Processing file: user_a.csv
Step 1: Removing columns with missing values...
Columns after missing values processing: 113
Step 2: Transforming column names...
Final number of columns: 113
Successfully processed and saved to: data/user_a.csv

Processing: /home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_b.csv
Original columns: 1024
Processing file: user_b.csv
Step 1: Removing columns with missing values...
Columns after missing values processing: 113
Step 2: Transforming column names...
Final number of columns: 113
Successfully processed and saved to: data/user_a.csv

Processing: /home/quan/PROJECT/Machine Learning with Biomedical Signals/kaggle_downloads/user_b.csv
Original columns: 1024
Processing file: user_b.csv
Step 1: Removing columns with missing values...
Columns after missing values processing: 113
Step 2: Transforming column 