<a href="https://www.kaggle.com/code/mohdmuttalib/spine-fracture-detection-project-eda?scriptVersionId=143435628" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 1. Import libraries

In [None]:
# Import libraries
import gc
import os
import re
from glob import glob
from pprint import pprint

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import pydicom
import seaborn as sns
from pydicom.pixel_data_handlers.util import apply_modality_lut, apply_voi_lut

# 2. Load the CSV files in the dataset:

In [None]:
# Read the two competition tables: train.csv and train_bounding_boxes.csv
train_df = pd.read_csv(
    filepath_or_buffer='../input/rsna-2022-cervical-spine-fracture-detection/train.csv'
)
train_bbox = pd.read_csv(
    filepath_or_buffer='../input/rsna-2022-cervical-spine-fracture-detection/train_bounding_boxes.csv'
)

In [None]:
# Preview the first rows of train.csv
train_df.head()

In [None]:
# Preview the first rows of train_bounding_boxes.csv
train_bbox.head()

In [None]:
# Names of the vertebra columns: the last seven columns of train.csv
verterbraes = train_df.columns[-7:].tolist()
verterbraes

In [None]:
# Number of distinct StudyInstanceUID values in train.csv
print('Number of patients in the training dataset:',
      train_df['StudyInstanceUID'].nunique(dropna=False))

In [None]:
# Summary statistics of the numeric columns of train.csv
train_df.describe()

The dataset statistics show the following:

This dataset is nearly balanced (as nearly 50% of patients have fractures).
Fractures in C3 have the lowest percentage (about 3.6%), and fractures in C7 have the highest one (nearly 19.5%).

# 3. EDA for training dataset:

# a. Overview analysis of the training dataset:

In [None]:
# Number of patients per `patient_overall` value
plt.figure(figsize=(10, 5))
ax1 = sns.countplot(data=train_df, x='patient_overall')
# Write the count on top of every bar
for bar_group in ax1.containers:
    ax1.bar_label(bar_group)
ax1.set_title('Fractures distribution by patient')
ax1.set_ylim(0, 1200)
plt.show()

In [None]:
# Reshape to long format: one row per (study, vertebra column) pair
train_melt = train_df.melt(
    id_vars=['StudyInstanceUID', 'patient_overall'],
    value_vars=verterbraes,
    var_name='verterbrae',
    value_name='fractured',
)
train_melt.head(n=10)

In [None]:
# Per vertebra column, number of rows for each `fractured` value
plt.figure(figsize=(10, 5))
ax2 = sns.countplot(data=train_melt, x='verterbrae', hue='fractured')
# Write the count on top of every bar
for bar_group in ax2.containers:
    ax2.bar_label(bar_group)
ax2.set_title('Fractures distribution by verterbrae')
ax2.set_ylim(0, 2250)
plt.show()

In [None]:
# Number of patients per row-wise sum of the vertebra columns
plt.figure(figsize=(10, 5))
ax3 = sns.countplot(x=train_df.loc[:, verterbraes].sum(axis='columns'))
# Write the count on top of every bar
for bar_group in ax3.containers:
    ax3.bar_label(bar_group)
ax3.set_title('Number of fractures by patient')
ax3.set_ylim(0, 1200)
plt.show()

Most fractured patients have a fracture in only one vertebra.

In [None]:
# Heatmap of the pairwise correlations between the vertebra columns,
# with the colour scale fixed to the full [-1, 1] range
plt.figure(figsize=(6, 5))
sns.heatmap(
    train_df.loc[:, verterbraes].corr(),
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
)
plt.gca().set_title('Correlations')
plt.show()

The correlation matrix suggests that patients with a fracture in one vertebra are more likely to also have a fracture in an adjacent vertebra.

# b. Visualization of several image samples:

In [None]:
# Read one DICOM file as a sample: 15.dcm of study 1.2.826.0.1.3680043.10921
sample_path = '../input/rsna-2022-cervical-spine-fracture-detection/train_images/1.2.826.0.1.3680043.10921/15.dcm'
sample_dcm = pydicom.dcmread(sample_path)
# Bare expression so the notebook displays the loaded dataset
sample_dcm

In [None]:
# Turn the stored pixel values into display intensities. The rescale /
# modality LUT has to be applied before the VOI LUT (windowing), because the
# window in the DICOM header is expressed in rescaled units.
image = apply_voi_lut(apply_modality_lut(sample_dcm.pixel_array, sample_dcm), sample_dcm)

In [None]:
# Draw the sample slice on its own 5x5-inch figure using the 'bone' colormap
fig, axis = plt.subplots(figsize=(5, 5))
axis.imshow(image, cmap='bone')
axis.set_title(
    'ID: 1.2.826.0.1.3680043.10921, Slice: 15',
    weight='bold',
    size=12,
    y=1.05,
)
plt.show()

In [None]:
# Visualize 15 images of a case
def visualize_15_images(instance_uid, start_slice=100):
    """Plot 15 consecutive DICOM slices of one study in a 3x5 grid.

    Args:
        instance_uid: Name of the study folder under ``train_images``.
        start_slice: Number of the first slice file to show
            (``{start_slice}.dcm``); the 14 following numbers are shown after
            it. Defaults to 100, the value that used to be hard-coded.

    The function returns nothing; it shows the figure with ``plt.show()``.
    ``pydicom.dcmread`` fails if any of the 15 slice files is missing.
    """
    study_dir = f'../input/rsna-2022-cervical-spine-fracture-detection/train_images/{instance_uid}'
    slice_numbers = range(start_slice, start_slice + 15)

    # Read every slice before creating the figure so a missing file fails
    # before anything is drawn.
    images = []
    for slice_no in slice_numbers:
        dcm = pydicom.dcmread(f'{study_dir}/{slice_no}.dcm')
        # Rescale (modality LUT) first, then the VOI LUT / windowing step
        images.append(apply_voi_lut(apply_modality_lut(dcm.pixel_array, dcm), dcm))

    fig, ax = plt.subplots(3, 5, figsize=(20, 12))
    fig.suptitle(f'ID: {instance_uid}', weight='bold', size=20)
    # ax.flat walks the 3x5 grid row by row, matching the slice order
    for axis, slice_no, image in zip(ax.flat, slice_numbers, images):
        axis.imshow(image, cmap='bone')
        axis.set_title(f'Slice no: {slice_no}', size=14)
        axis.axis('off')
    plt.show()

In [None]:
# Show slice files 100.dcm to 114.dcm of the sample study
visualize_15_images('1.2.826.0.1.3680043.10921')

# c. Visualization of several segmented masks:

In [None]:
# Load the NIfTI segmentation file of the sample study
sample_mask_path = '../input/rsna-2022-cervical-spine-fracture-detection/segmentations/1.2.826.0.1.3680043.10921.nii'
sample_nii = nib.load(sample_mask_path)
# Bare expression so the notebook displays the loaded image object
sample_nii

In [None]:
# Flip the second and third axes, then reverse the axis order so that the
# original last axis becomes the first one
sample_seg = np.transpose(sample_nii.get_fdata()[:, ::-1, ::-1], axes=(2, 1, 0))
sample_seg.shape

In [None]:
# Visualize 15 masks of an instance (patient)
def visualize_15_masks(instance_uid, start_slice=100):
    """Plot 15 consecutive segmentation-mask slices of one study in a 3x5 grid.

    Args:
        instance_uid: Study identifier; ``{instance_uid}.nii`` is read from
            the ``segmentations`` folder.
        start_slice: Index (along the first axis of the reoriented volume) of
            the first mask to show; the 14 following indices are shown after
            it. Defaults to 100, the value that used to be hard-coded.

    Raises:
        IndexError: If ``start_slice`` is negative or the volume has fewer
            than ``start_slice + 15`` slices.

    The function returns nothing; it shows the figure with ``plt.show()``.
    """
    mask_path = f'../input/rsna-2022-cervical-spine-fracture-detection/segmentations/{instance_uid}.nii'
    # Flip the second and third axes, then reverse the axis order so the
    # original last axis becomes the first (slice) axis.
    seg_masks = nib.load(mask_path).get_fdata()[:, ::-1, ::-1].transpose([2, 1, 0])

    # Check the range up front: a negative index would silently wrap around,
    # and an index past the end would otherwise fail halfway through plotting.
    n_slices = seg_masks.shape[0]
    if start_slice < 0 or start_slice + 15 > n_slices:
        raise IndexError(
            f'Slices {start_slice}..{start_slice + 14} are out of range '
            f'for a volume with {n_slices} slices'
        )

    # NOTE(review): the mask index is used directly as the displayed slice
    # number - confirm it lines up with the DICOM file numbering.
    fig, ax = plt.subplots(3, 5, figsize=(20, 12))
    fig.suptitle(f'ID: {instance_uid}', weight='bold', size=20)
    # ax.flat walks the 3x5 grid row by row
    for axis, slice_no in zip(ax.flat, range(start_slice, start_slice + 15)):
        axis.imshow(seg_masks[slice_no], cmap='inferno')
        axis.set_title(f'Slice: {slice_no}', size=14)
        axis.axis('off')
    plt.show()

In [None]:
# Show mask slices 100 to 114 of the sample study
visualize_15_masks('1.2.826.0.1.3680043.10921')

In [None]:
# Count the entries directly inside the segmentations folder
print('Number of cases that have segmentations:', len(glob('../input/rsna-2022-cervical-spine-fracture-detection/segmentations/*')))

Number of cases that have segmentations: 87

# 4. EDA for training set with bounding boxes:

# a. Overview analysis of the training set with bounding boxes:

In [None]:
# Preview the first rows of the bounding-box table (train_bounding_boxes.csv)
train_bbox.head()

In [None]:
# Number of distinct StudyInstanceUID values in the bounding-box table
print('Number of cases with bounding boxes:',
      train_bbox['StudyInstanceUID'].nunique(dropna=False))

Only 235 cases (out of 2019 cases), equivalent to 11.64% of all cases, have bounding boxes on their CT images.

In [None]:
# Keep only the train.csv rows whose study also appears in the bounding-box table
train_df_bbox = train_df.loc[
    train_df['StudyInstanceUID'].isin(train_bbox['StudyInstanceUID'])
]
train_df_bbox.head(n=5)

In [None]:
# Summary statistics of the numeric columns, restricted to studies with bounding boxes
train_df_bbox.describe()

All cases that have bounding boxes are fractured.
C1 fractures have the lowest bounding-box rate (only 19.57%), and C6 fractures have the highest (just over 37%).

In [None]:
# Number of bounding-box patients per `patient_overall` value
plt.figure(figsize=(7, 5))
ax4 = sns.countplot(data=train_df_bbox, x='patient_overall')
# Write the count on top of every bar
for bar_group in ax4.containers:
    ax4.bar_label(bar_group)
ax4.set_title('Fractures overall (patients with bounding boxes)')
ax4.set_ylim(0, 270)
plt.show()

In [None]:
# Reshape the bounding-box subset to long format: one row per
# (study, vertebra column) pair. value_vars is passed explicitly, as in the
# melt of the full training table, so that only the vertebra columns are
# melted rather than every column that is not an id column.
train_df_bbox_melt = pd.melt(train_df_bbox, id_vars=['StudyInstanceUID', 'patient_overall'],
                             value_vars=verterbraes,
                             var_name='cervical_verterbrae', value_name='fractured')
train_df_bbox_melt.head(10)

In [None]:
# Per vertebra column, number of bounding-box patients for each `fractured` value
plt.figure(figsize=(10, 5))
ax5 = sns.countplot(data=train_df_bbox_melt, x='cervical_verterbrae', hue='fractured')
# Write the count on top of every bar
for bar_group in ax5.containers:
    ax5.bar_label(bar_group)
ax5.set_title('Fractures by verterbrae (patients with bounding boxes)')
ax5.set_ylim(0, 220)
plt.show()

In [None]:
# Histogram of the number of bounding-box rows per StudyInstanceUID
plt.figure(figsize=(15, 5))
sns.histplot(
    train_bbox['StudyInstanceUID'].value_counts().to_numpy(),
    bins=50,
    kde=True,
)
plt.gca().set_title('Number of slices with bounding boxes per patient')
plt.gca().set_xlabel('No. of bounding boxes')
plt.show()

Most cases with bounding-box annotations have 10–35 image slices that contain bounding boxes.

In [None]:
# Histogram of the number of bounding-box rows per (study, slice_number) pair
plt.figure(figsize=(10, 5))
sns.histplot(
    train_bbox.value_counts(subset=['StudyInstanceUID', 'slice_number']).to_numpy(),
    bins=10,
)
plt.gca().set_title('Number of bounding boxes per slice')
plt.gca().set_xlabel('No. of bounding boxes')
plt.show()

Every slice that has bounding-box annotations contains exactly one bounding box.