In [1]:
pip install opencv-python


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Notebook 1: Data Loading and Preparation

import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Define the paths and classes.
# The dataset root defaults to the original local folder, but it can be
# overridden by setting the KNEE_DATASET_DIR environment variable so the
# notebook can run on another machine without editing this cell.
DATASET_ROOT = os.environ.get(
    "KNEE_DATASET_DIR",
    r"C:\Users\Admin8\Desktop\NTU\Project\kneedataset",
)
train_path = os.path.join(DATASET_ROOT, "train")
test_path = os.path.join(DATASET_ROOT, "test")
valid_path = os.path.join(DATASET_ROOT, "val")

# Class names, indexed by the integer that names each class sub-folder
# (the loading code converts the folder name with int() and indexes this list).
list_of_classes = ['Healthy', 'Doubtful', 'Minimal', 'Moderate', 'Severe']

# Load and organize the dataset
def build_image_dataframe(directory, class_names):
    """Index one dataset split as a DataFrame of file paths and class labels.

    ``directory`` is expected to contain one sub-folder per class, each named
    with the integer index of that class in ``class_names``.

    Parameters
    ----------
    directory : str
        Root folder of the split.
    class_names : list of str
        Human-readable labels, indexed by the integer folder name.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``filepaths`` and ``labels``, with one row per file.

    Raises
    ------
    IndexError
        If a numeric folder name is not a valid index into ``class_names``.
    """
    filepaths = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore any seeded sampling/splitting) reproducible.
    for klass in sorted(os.listdir(directory)):
        classpath = os.path.join(directory, klass)

        # Skip stray entries (e.g. Thumbs.db, .DS_Store) that are not numeric
        # class folders; int() on such a name would raise ValueError.
        if not klass.isdigit() or not os.path.isdir(classpath):
            continue

        label = class_names[int(klass)]

        for f in sorted(os.listdir(classpath)):
            fpath = os.path.join(classpath, f)
            # Only regular files are recorded; nested folders are ignored.
            if os.path.isfile(fpath):
                filepaths.append(fpath)
                labels.append(label)

    return pd.DataFrame({'filepaths': filepaths, 'labels': labels})


train_df = build_image_dataframe(train_path, list_of_classes)
test_df = build_image_dataframe(test_path, list_of_classes)
valid_df = build_image_dataframe(valid_path, list_of_classes)
  # Display class distribution and analyze images
# Report the number of classes and how the training images are spread over them.
print('The number of classes in the dataset is:', len(list_of_classes))

header_fmt = '{0:^30s} {1:^13s}'
row_fmt = '{0:^30s} {1:^13d}'
print(header_fmt.format('CLASS', 'IMAGE COUNT'))

# Group sizes come back ordered by label because groupby sorts its keys.
groups = train_df.groupby('labels')
class_sizes = groups.size()
classlist = list(class_sizes.index)
countlist = [int(size) for size in class_sizes]

# One table row per class: label and number of training images.
for class_name, image_count in zip(classlist, countlist):
    print(row_fmt.format(class_name, image_count))

# Locate the most and least populated classes (first occurrence wins on ties).
positions = range(len(countlist))
max_index = max(positions, key=countlist.__getitem__)
min_index = min(positions, key=countlist.__getitem__)
max_value, max_class = countlist[max_index], classlist[max_index]
min_value, min_class = countlist[min_index], classlist[min_index]

print(max_class, 'has the most images =', max_value)
print(min_class, 'has the least images =', min_value)

# Calculate the average height and width of a sample of the train images
total_height = 0
total_width = 0
num_samples = 100  # Requested number of images to analyze

# Never request more rows than exist: DataFrame.sample raises ValueError when
# n exceeds the number of rows (sampling is done without replacement).
sample_size = min(num_samples, len(train_df))
if sample_size == 0:
    raise ValueError('train_df is empty; cannot compute image statistics')

# Select a random, seeded sample of train_df for analysis
train_df_sample = train_df.sample(n=sample_size, random_state=123, axis=0)

for fpath in train_df_sample['filepaths']:
    img = plt.imread(fpath)
    # The first two entries of the array shape are used as height and width.
    height, width = img.shape[:2]
    total_height += height
    total_width += width

# Divide by the number of images actually read, not the number requested.
average_height = total_height // sample_size
average_width = total_width // sample_size
# Aspect ratio is reported as height divided by width.
aspect_ratio = total_height / total_width

print('Average height =', average_height, ', Average width =', average_width, ', Aspect ratio =', aspect_ratio)


# The train/validation/test split of the image filepaths and labels is performed
# in the next cell; the resulting DataFrames are written to CSV files in the cell after it.


The number of classes in the dataset is: 5
            CLASS               IMAGE COUNT 
           Doubtful                1046     
           Healthy                 2286     
           Minimal                 1516     
           Moderate                 757     
            Severe                  173     
Healthy has the most images = 2286
Severe has the least images = 173
Average height = 224 , Average width = 224 , Aspect ratio = 1.0


In [3]:
# Combine train_df and valid_df for initial split
train_valid_df = pd.concat([train_df, valid_df], ignore_index=True)

# Hold out 20% of the combined train+validation images.
# The held-out part gets its own names: it used to be bound to
# valid_filepaths/valid_labels and was then overwritten by the second split,
# which silently dropped those images from every output set.
train_filepaths, holdout_filepaths, train_labels, holdout_labels = train_test_split(
    train_valid_df['filepaths'], train_valid_df['labels'], test_size=0.2, random_state=123
)

# Split test_df in half: one half becomes the test set, the other half is
# added to the validation set.
extra_valid_filepaths, test_filepaths, extra_valid_labels, test_labels = train_test_split(
    test_df['filepaths'], test_df['labels'], test_size=0.5, random_state=123
)

# Validation set = the 20% hold-out plus the validation half of test_df.
# ignore_index=True gives both Series the same fresh 0..n-1 index, so they stay
# row-aligned when they are later combined into a single DataFrame.
valid_filepaths = pd.concat([holdout_filepaths, extra_valid_filepaths], ignore_index=True)
valid_labels = pd.concat([holdout_labels, extra_valid_labels], ignore_index=True)


In [4]:
# Assemble one DataFrame per split from its (filepaths, labels) pair.
def _to_split_frame(paths, classes):
    """Return a two-column DataFrame ('filepaths', 'labels') for one split."""
    return pd.DataFrame({'filepaths': paths, 'labels': classes})


train_split_df = _to_split_frame(train_filepaths, train_labels)
valid_split_df = _to_split_frame(valid_filepaths, valid_labels)
test_split_df = _to_split_frame(test_filepaths, test_labels)

# Write each split to its CSV file, leaving out the index column.
for csv_name, split_frame in (
    ('train_split_df.csv', train_split_df),
    ('valid_split_df.csv', valid_split_df),
    ('test_split_df.csv', test_split_df),
):
    split_frame.to_csv(csv_name, index=False)
