asymmetrische Septumhypertrophie (linksventrikulär): Wenn Verhältnis von Septumdicke zur Hinterwanddicke größer als 1,3 ist.  
Septumdicke: IVS (intraventrikuläres Septum)  
Hinterwanddicke: LVPW 

Create new feature: asym = IVS / LVPW

In [1]:
import csv

def new_feature(input_file, output_file):

    unique_file_names = set()
    final_rows = [['file', 'IVSd', 'LVPWd', 'fraction']]

    with open(input_file, mode='r') as infile:
        reader = csv.reader(infile)
        rows = list(reader)

    # Skip the header
    rows = rows[1:]

    # Loop through the rows in the input CSV file
    for row in rows:
        # Check if the file is already included
        unique_file_names.add(row[1])

    for file in unique_file_names:

        ivsd = 0
        lvpwd = 0
        new_row = [file]

        for row in rows:

            if row[1] == file and row[2] == 'IVSd':
                # If a match is found, retrieve the value from the third column
                ivsd = float(row[3])
                new_row.append(ivsd)
            if row[1] == file and row[2] == 'LVPWd':
                lvpwd = float(row[3])
                new_row.append(lvpwd)

        # calculate fraction
        if lvpwd != 0:
            new_row.append(ivsd/lvpwd)

        if len(new_row) == 4: # only append if all values could be found
            final_rows.append(new_row)

    # Open the CSV file in write mode
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
    
        # Write each inner list as a row in the CSV
        writer.writerows(final_rows)

    print("CSV file created successfully.")


input_file = "/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList.csv"
output_file = "/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean.csv"

file_names = new_feature(input_file, output_file)


CSV file created successfully.


Now when the IVS/LVPW fraction is greater than 1/1,3 we have asymmetric septal hypertrophy (ASH) (Class 1, if less: Class 0)

In [10]:
import csv

def append_column_based_on_4th_column(input_file, output_file):
    threshold = 1 / 1.3  # This is approximately 0.769

    # Read the CSV file
    with open(input_file, mode='r') as infile:
        reader = csv.reader(infile)
        rows = list(reader)  # Read all rows into memory

    # Add a new header for the additional column
    rows[0].append('ASH')

    # Loop through each row and apply the condition based on the 4th column
    for row in rows[1:]:  # Skip the header row
        # Ensure we are working with a numeric value in the 4th column
        try:
            col_value = float(row[3])  # Convert the 4th column value to float
            if col_value <= threshold:
                row.append(0)
            else:
                row.append(1)
        except ValueError:
            # If conversion fails, append a default value (e.g., 0 or handle the error)
            row.append(2)

    # Write the updated rows back to a new CSV file
    with open(output_file, mode='w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)

    print(f"New CSV file created with an additional column: {output_file}")

# Example usage
input_file = '/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean.csv'
output_file = '/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH.csv'
append_column_based_on_4th_column(input_file, output_file)

New CSV file created with an additional column: /vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH.csv


Count Categories

In [1]:
import csv

def count_train_test_val(input_file):
    # Initialize counters for "TRAIN", "TEST", and "VAL"
    train_count = 0
    test_count = 0
    val_count = 0

    # Open and read the CSV file
    with open(input_file, mode='r') as infile:
        reader = csv.reader(infile)
        rows = list(reader)
        number_of_rows = len(rows)
        # next(reader)  # Skip the header row

        # Loop through each row in the file
        # for row in rows:
        #     if row[4] == 'TRAIN':
        #         train_count += 1
        #     elif row[4] == 'TEST':
        #         test_count += 1
        #     elif row[4] == 'VAL':
        #         val_count += 1

        for row in rows:
            if row[4] == '0':
                train_count += 1
            elif row[4] == '1':
                test_count += 1
            elif row[4] == '2':
                val_count += 1

    # Print or return the counts
    print(f"healthy: {train_count}, {train_count/number_of_rows} %")
    print(f"sick: {test_count}, {test_count/number_of_rows} %")
    print(f"other: {val_count}, {val_count/number_of_rows} %")

# Example usage
input_file = '/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH.csv'
# input_file = '/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_reduced.csv'
count_train_test_val(input_file)

healthy: 247, 0.021348314606741574 %
sick: 11322, 0.9785652549697493 %
other: 0, 0.0 %


Now we want to have a balanced test and validation set but the training set can be imbalanced.  
247 / 3 = 82 Rest 1
Put 82 of category 0 into validation and test set each, and 83 into training set.
Put further 82 of category 1 into validation and test set, and the rest into training set. 

In [2]:
import pandas as pd

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH.csv')

# Sort the DataFrame by the specific column (replace 'status' with your column name)
sorted_df = df.sort_values(by='ASH')

# Save the sorted DataFrame back to a CSV file (or overwrite the original)
sorted_df.to_csv('/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_sorted.csv', index=False)

print("CSV file sorted successfully!")

CSV file sorted successfully!


sick: 11322 (category 1)  
11322 - (2 * 82)  
= 11158

In [3]:
set_column = []
set = ["set"]
val_0 = ["VAL"] * 82
test_0 = ["TEST"] * 82
train_0 = ["TRAIN"] * 83
val_1 = ["VAL"] * 82
test_1 = ["TEST"] * 82
train_1 = ["TRAIN"] * 11158

set_column = set + val_0 + test_0 + train_0 + val_1 + test_1 + train_1


Append set row

In [4]:
import pandas as pd

# Load the existing CSV file into a pandas DataFrame
df = pd.read_csv('/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_sorted.csv')

# Define your list (with header as the first element)
new_column = set_column # from previous block

# Extract the header and the data separately
column_name = new_column[0]  # First element is the header
column_data = new_column[1:]  # Rest of the list is the data

# Ensure the list has the same length as the DataFrame's rows
if len(column_data) != len(df):
    raise ValueError("Length of new column data does not match the number of rows in the CSV")

# Add the new column to the DataFrame
df[column_name] = column_data

# Save the updated DataFrame back to a CSV (or overwrite the original)
df.to_csv('/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_sorted_set.csv', index=False)

print("New column added successfully!")

New column added successfully!


We need to use the official train-val-test split. Use the split from MeasurementsList.csv and append it to MeasurementsList_new_clean_ASH.csv

In [None]:
import csv

def match_and_append(file1_path, file2_path, output_file_path):
    """
    Reads two CSV files and appends the last entry of a matching row 
    from file1 to the corresponding row in file2.
    
    Args:
        file1_path (str): Path to the first input CSV file.
        file2_path (str): Path to the second input CSV file.
        output_file_path (str): Path to save the updated second file.
    """
    # Read file1 into a dictionary for fast lookups
    file1_data = {}
    with open(file1_path, 'r', newline='', encoding='utf-8') as file1:
        reader = csv.reader(file1)
        for row in reader:
            if row:  # Skip empty rows
                file1_data[row[1]] = row[-1]  # Store video name and split name

    # Read file2, compare, and append matching data
    updated_rows = []
    with open(file2_path, 'r', newline='', encoding='utf-8') as file2:
        reader = csv.reader(file2)
        for row in reader:
            if row:  # Skip empty rows
                match_key = row[0]
                if match_key in file1_data:
                    row.append(file1_data[match_key])  # Append matching value
            updated_rows.append(row)

    # Write the updated rows to a new output file
    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        writer.writerows(updated_rows)

# Example usage:
file1 = "/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList.csv"
file2 = "/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH.csv"
output_file = "/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_official_split.csv"
# match_and_append('file1.csv', 'file2.csv', 'output.csv')   

Now we want to count the class distribution in train, test, val

In [None]:
import csv 
from collections import defaultdict

def count_classes_by_set(csv_file_path):
    """
    Reads a CSV file and counts how many instances of each class are in each set (train, test, val).

    Args:
        csv_file_path (str): Path to the CSV file.

    Returns:
        dict: A dictionary with counts of each class in each set.
    """
    # Dictionary to store counts: {set_name: {class_name: count}}
    counts = defaultdict(lambda: defaultdict(int))
    
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        
        # Iterate through the rows
        for row in reader:
            if len(row) < 2:
                continue  # Skip rows with insufficient columns
            
            # Extract class and set
            class_name = row[-2]  # Second last column
            set_name = row[-1]    # Last column
            
            # Increment the count
            counts[set_name][class_name] += 1
    
    return counts

# Example usage:
csv_file_path = '/vol/ideadata/ep56inew/EchoNet/EchoNet-LVH/MeasurementsList_new_clean_ASH_official_split_more_test.csv'  # Replace with your CSV file path
result = count_classes_by_set(csv_file_path)

# Print the results
for set_name, class_counts in result.items():
    print(f"Set: {set_name}")
    for class_name, count in class_counts.items():
        print(f"  Class {class_name}: {count}")