# Data preparation

The following code reads the data from the CSV files and prepares it for training. The data is split into training, testing, and validation sets. The training set is used to train the model, the validation set is used to tune the hyperparameters, and the testing set is used to evaluate the model.

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('../dataset.csv')

# Display the first 5 rows
df.head()

Unnamed: 0,image_id,hand,leg,hip,shoulder,mixed,hardware,multiscan,fractured,fracture_count,frontal,lateral,oblique
0,IMG0000000.jpg,0,1,0,0,0,0,1,0,0,1,1,0
1,IMG0000001.jpg,0,1,0,0,0,0,1,0,0,1,1,0
2,IMG0000002.jpg,0,1,0,0,0,0,1,0,0,1,1,0
3,IMG0000003.jpg,0,1,0,0,0,0,1,0,0,0,1,1
4,IMG0000004.jpg,0,1,0,0,0,0,1,0,0,0,1,1


In [2]:
# Count the number of rows for each class
df.groupby("fractured").size()

fractured
0    3366
1     717
dtype: int64

In [None]:
# Filter out the rows where only the hand is fractured
df_hands = df.loc[(df['hand'] == 1) & (df['mixed'] == 0)]

# Display information about the dataset
df_hands.info()

In [None]:
# Count the number of rows for each class
df_hands.groupby("fractured").size()

In [3]:
# Save the filtered dataset

df.to_csv('output/dataset/distribution/all_included_fractures.csv', index=False)  
#df_hands.to_csv('output/datasets/distribution/only_hands.csv', index=False) # Uncomment this line to save the filtered dataset

In [4]:
df_fractured = df.loc[(df['fractured'] == 1)]
#df_hands_fractured = df.loc[(df['fractured'] == 1)] # Uncomment this line to filter out the rows where only the hand is fractured
#df_hands_fractured.head() # Uncomment this line to display the first 5 rows if created the filtered dataset above

In [5]:
df_images_id = df_fractured['image_id']
#df_hands_images_id = df_hands_fractured['image_id'] # Uncomment this line to filter out the rows where only the hand is fractured
#df_hands_images_id.head() # Uncomment this line to display the first 5 rows if created the filtered dataset above

In [6]:
# !pip install scikit-learn # Uncomment if scikit-learn isn't installed

from sklearn.model_selection import train_test_split

# Split the dataset into train, validation and test sets
train, test = train_test_split(df_images_id, test_size=0.2) # Train
#train, test = train_test_split(df_hands_images_id, test_size=0.2) # Train
validation, test = train_test_split(test, test_size=0.4) # Test, Validation

In [7]:
# Display the number of rows for each set
test.count()
train.count()
validation.count()

58

In [8]:
# Save the train, validation and test sets to CSV files
train.to_csv('output/dataset/distribution/train.csv', index=False)  
test.to_csv('output/dataset/distribution/test.csv', index=False)  
validation.to_csv('output/dataset/distribution/validation.csv', index=False)  

In [11]:
import os
import shutil

# Current directory of the notebook
current_dir = os.getcwd()

# Directory one level up from the current directory
one_level_up = os.path.dirname(current_dir)

# Source and destination directories
current_dir = os.getcwd()
all_images_dir = os.path.join(one_level_up, 'images', 'all_images')
all_labels_dir = os.path.join(all_images_dir, 'labels')

test_dir = os.path.join(current_dir, 'output', 'dataset', 'test')
test_images_dir = os.path.join(current_dir, 'output', 'dataset', 'test', 'images')
test_labels_dir = os.path.join(current_dir, 'output', 'dataset', 'test', 'labels')

validation_dir = os.path.join(current_dir, 'output', 'dataset', 'val')
validation_images_dir = os.path.join(current_dir, 'output', 'dataset', 'val', 'images')
validation_labels_dir = os.path.join(current_dir, 'output', 'dataset', 'val', 'labels')

train_dir = os.path.join(current_dir, 'output', 'dataset', 'train')
train_images_dir = os.path.join(current_dir, 'output', 'dataset', 'train', 'images')
train_labels_dir = os.path.join(current_dir, 'output', 'dataset', 'train', 'labels')

# Create destination directories if they do not exist
os.makedirs(test_dir, exist_ok=True)
os.makedirs(test_images_dir, exist_ok=True)
os.makedirs(test_labels_dir, exist_ok=True)
os.makedirs(validation_dir, exist_ok=True)
os.makedirs(validation_images_dir, exist_ok=True)
os.makedirs(validation_labels_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

def copy_files(image_ids, dest_images_dir, dest_labels_dir):
    for image_id in image_ids:
        src_image_path = os.path.join(all_images_dir, image_id)
        dest_image_path = os.path.join(dest_images_dir, image_id)
        shutil.copy(src_image_path, dest_image_path)
        
        label_filename = image_id.replace('.jpg', '.txt')
        src_label_path = os.path.join(all_labels_dir, label_filename)
        dest_label_path = os.path.join(dest_labels_dir, label_filename)
        shutil.copy(src_label_path, dest_label_path)
            
# Copy files to the corresponding folders
copy_files(train, train_images_dir, train_labels_dir)
copy_files(test, test_images_dir, test_labels_dir)
copy_files(validation, validation_images_dir, validation_labels_dir)