## Imports

In [1]:
import numpy as np
import os
import pandas as pd

from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

## Dataset Loading

In [2]:
labels_test = pd.read_csv('miccai2023_nih-cxr-lt_labels_test.csv')
labels_train = pd.read_csv('miccai2023_nih-cxr-lt_labels_train.csv')
labels_val = pd.read_csv('miccai2023_nih-cxr-lt_labels_val.csv')

combined = pd.concat([labels_test, labels_train, labels_val])
combined = combined.sort_values(by=['id'])
combined = combined.reset_index(drop = True)

## Dataset Generating

In [None]:
# Images were split into multiple folders when downloading
folder_paths = ["images", "images 2", "images 3", "images 4", "images 5", "images 6", "images 7", "images 8", "images 9", "images 10", "images 11", "images 12"]
image_data = []
disease_names = combined.columns[1:21]
image_names = []
diseases_found = []

# For every folder that contains images
for folder_path in folder_paths:

    # Get list of image files
    image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]

    # Initialize list to store image data
    for i in range(len(image_files)):
        img_path = os.path.join(folder_path, image_files[i])
        img = Image.open(img_path).convert('L')
        img.thumbnail((64, 64), Image.ANTIALIAS)
        img_array = np.array(img).flatten()
        for disease in disease_names:
            if int(combined[combined["id"]==image_files[i]][disease]) == 1:
                image_data.append(img_array)
                diseases_found.append(disease)
                image_names.append(image_files[i])
                
df = pd.DataFrame(image_data)
df["File Name"] = image_names
df["Disease"] = diseases_found
df = df.sort_values(by=['File Name'])

df.to_csv("all_testing_image_dataset_with_duplicates.csv", index=False)

In [None]:
# Binary Dataset
folder_paths = ["images", "images 2", "images 3", "images 4", "images 5", "images 6", "images 7", "images 8", "images 9", "images 10", "images 11", "images 12"]
image_data = []
disease_names = combined.columns[1:21]
image_names = []
diseases_found = []

for folder_path in folder_paths:

    # Get list of image files
    image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]

    # Initialize list to store image data
    for i in range(len(image_files)):
        img_path = os.path.join(folder_path, image_files[i])
        img = Image.open(img_path).convert('L')# Convert to grayscale
        img.thumbnail((64, 64), Image.ANTIALIAS)
        img_array = np.array(img).flatten()  # Flatten the image into 1D
        image_data.append(img_array)
        image_names.append(image_files[i])
        if int(combined[combined["id"]==image_files[i]]["No Finding"]) == 1:
            diseases_found.append(0)
        else:
            diseases_found.append(1)
                
df = pd.DataFrame(image_data)
df["File Name"] = image_names
df["Disease Found"] = diseases_found
df = df.sort_values(by=['File Name'])

df.to_csv("binary_dataset.csv", index=False)

# Random Forest Classifier Models

In [3]:
# Prepare the data
def prepare_data(df):
    
    X = df.iloc[:, 0:4096].values
    
    y = df['Disease'].apply(lambda x: x.split(',') if isinstance(x, str) else [x])
    
    mlb = MultiLabelBinarizer()
    y_encoded = mlb.fit_transform(y)
    
    disease_classes = mlb.classes_
    
    return X, y_encoded, disease_classes

# Build and train the model
def build_model(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    
    # Random Forest Classifier
    base_classifier = RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42, n_jobs = -1)
    
    # Multi-Label Classification
    model = OneVsRestClassifier(base_classifier)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    return model, X_test, y_test

df = pd.read_csv('all_testing_image_dataset_with_duplicates.csv')
X, y_encoded, disease_classes = prepare_data(df)
X = X / 255.0
model, X_test, y_test = build_model(X, y_encoded)

Accuracy: 0.22346542884855203

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      2320
           1       0.00      0.00      0.00        97
           2       0.00      0.00      0.00       584
           3       0.00      0.00      0.00       935
           4       0.00      0.00      0.00       426
           5       0.00      0.00      0.00      2669
           6       0.00      0.00      0.00       500
           7       0.00      0.00      0.00       378
           8       0.00      0.00      0.00        55
           9       0.00      0.00      0.00      3942
          10       0.00      0.00      0.00      1169
          11       0.67      0.55      0.60     11836
          12       0.00      0.00      0.00      1270
          13       0.00      0.00      0.00       630
          14       0.00      0.00      0.00        51
          15       0.00      0.00      0.00       311
          16       0.00   

In [3]:
# Prepare the data
def prepare_data(df):
    
    X = df.iloc[:, 0:4096].values  # Pixel values
    
    y = df['Disease Found'].apply(lambda x: x.split(',') if isinstance(x, str) else [x])
    
    mlb = MultiLabelBinarizer()
    y_encoded = mlb.fit_transform(y)
    
    disease_classes = mlb.classes_
    
    return X, y_encoded, disease_classes

# Build and train the model
def build_model(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    
    # Random Forest Classifier
    model = RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42, n_jobs = -1)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    return model, X_test, y_test

df = pd.read_csv('binary_dataset.csv')
X, y_encoded, disease_classes = prepare_data(df)
X = X / 255.0
model, X_test, y_test = build_model(X, y_encoded)

Accuracy: 0.6497056724937567

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.74      0.69     11974
           1       0.65      0.55      0.59     10450

   micro avg       0.65      0.65      0.65     22424
   macro avg       0.65      0.64      0.64     22424
weighted avg       0.65      0.65      0.65     22424
 samples avg       0.65      0.65      0.65     22424

