## MURA dataset pre-processing for Deep Learning
This notebook does the following things:

* 1. **Read** the images from the dataset.
* 2. **Match** each image with its label (along with the body part, patient number, study number, etc.).
* 3. **Reshape** the images to a standard size (here, $64\times 64$)

<h1 style="text-align:right">$\mathcal{ZLF}$ </h1>

## 1. Read the image path

In [1]:
# import packages
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import cv2
from skimage.transform import resize
from PIL import Image
from skimage.filters import sobel
from scipy import ndimage

%matplotlib inline

In [2]:
# get the data paths
# Both CSV files are read without a header row, so each table gets a single
# column named 0 that holds the image path. The *_labeled_studies.csv files are
# not loaded in this notebook.
train_img_path, valid_img_path = (
    pd.read_csv(csv_file, header=None)
    for csv_file in ("train_image_paths.csv", "valid_image_paths.csv")
)

In [3]:
# Preview the first rows of the training path table; its only column (named 0)
# holds the image path read from the CSV.
train_img_path.head()

Unnamed: 0,0
0,MURA-v1.1/train/XR_SHOULDER/patient00001/study...
1,MURA-v1.1/train/XR_SHOULDER/patient00001/study...
2,MURA-v1.1/train/XR_SHOULDER/patient00001/study...
3,MURA-v1.1/train/XR_SHOULDER/patient00002/study...
4,MURA-v1.1/train/XR_SHOULDER/patient00002/study...


## 2. Add labels to the data

In [4]:
# add labels to the data sets
# Column 0 holds paths that begin with the dataset root folder. Path2Img keeps
# everything after the first len(redundant_prefix) characters of each path.
redundant_prefix = 'MURA-v1.1/'
len_prefix = len(redundant_prefix)

for frame in (train_img_path, valid_img_path):
    frame['Path2Img'] = frame[0].apply(lambda raw_path: raw_path[len_prefix:])

def extract_label(string, d):
    """
    Extract the "positive"/"negative" label encoded in an image file path.

    The label is the text after the last underscore in the name of the
    directory that directly contains the file, e.g.
    ".../study1_positive/image1.png" -> "positive".

    Arguments:
    string -- path to an image file, using "/" as the separator
    d -- a dict mapping the label text to its code, e.g. {'positive': 1, 'negative': 0}

    Returns:
    d[label] when the label text is a key of d;
    the string 'No corresponding key value' when it is not;
    np.nan when no label can be parsed (the path is not a string, or it has
    no parent directory component).
    """
    try:
        pre_str = os.path.splitext(string)[0]
        label = pre_str.split("/")[-2].split("_")[-1]
    except (TypeError, AttributeError, IndexError):
        # Pure string parsing never raises IOError, so catch what it can
        # actually raise: TypeError/AttributeError for non-string input
        # (e.g. a NaN cell) and IndexError for a path without a parent folder.
        return(np.nan)
    if label in d:
        return(d[label])
    return('No corresponding key value')

# Code assigned to each label word that extract_label parses out of a path.
d = {'positive':1, 'negative':0}

for frame in (train_img_path, valid_img_path):
    frame['label'] = frame[0].apply(lambda raw_path: extract_label(raw_path, d=d))

In [5]:
# Check that the Path2Img and label columns were added to the training table.
train_img_path.head()

Unnamed: 0,0,Path2Img,label
0,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1
1,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1
2,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1
3,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,train/XR_SHOULDER/patient00002/study1_positive...,1
4,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,train/XR_SHOULDER/patient00002/study1_positive...,1


In [6]:
# Summarize row count, column dtypes and non-null counts of the training table.
train_img_path.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36808 entries, 0 to 36807
Data columns (total 3 columns):
0           36808 non-null object
Path2Img    36808 non-null object
label       36808 non-null int64
dtypes: int64(1), object(2)
memory usage: 862.8+ KB


In [7]:
# Check that the Path2Img and label columns were added to the validation table.
valid_img_path.head()

Unnamed: 0,0,Path2Img,label
0,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1
1,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1
2,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1
3,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1
4,MURA-v1.1/valid/XR_WRIST/patient11186/study1_p...,valid/XR_WRIST/patient11186/study1_positive/im...,1


In [8]:
# Summarize row count, column dtypes and non-null counts of the validation table.
valid_img_path.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3197 entries, 0 to 3196
Data columns (total 3 columns):
0           3197 non-null object
Path2Img    3197 non-null object
label       3197 non-null int64
dtypes: int64(1), object(2)
memory usage: 75.0+ KB


## 3. Read the images

In [9]:
def read_image(path, rescale = None, normalize = False):
    """
    Read an image file into an array ("tensor").

    Arguments:
    path -- path to the image file
    rescale -- None to keep the original size, or a tuple handed to cv2.resize
               as the target size, e.g.: rescale = (512, 512)
    normalize -- If True, divide every pixel value by 255

    Returns:
    img -- the array produced by cv2.imread, resized and/or scaled as requested

    Raises:
    IOError -- if cv2.imread could not load the file
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an unrelated error further down (or a None stored in the table).
        raise IOError("Could not read image: {}".format(path))
    if rescale is not None:
        img = cv2.resize(img, rescale, interpolation = cv2.INTER_CUBIC)
    if normalize:
        # Float divisor keeps this a true division even under Python 2, where
        # an integer array divided by an int would be floored.
        # NOTE(review): scaling to [0, 1] assumes 8-bit pixel data - confirm.
        img = img / 255.0
    return(img)

In [10]:
# Load every training image resized to 64x64, without normalization, and store
# the resulting array next to its row.
# NOTE(review): Path2Img has the 'MURA-v1.1/' prefix removed, so this presumably
# has to run from inside the dataset folder - confirm.
train_img_path['img_vector'] = train_img_path['Path2Img'].apply(
    lambda img_path: read_image(img_path, rescale=(64, 64), normalize=False)
)
train_img_path.head()

Unnamed: 0,0,Path2Img,label,img_vector
0,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1,"[[[6, 6, 6], [7, 7, 7], [8, 8, 8], [8, 8, 8], ..."
1,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1,"[[[0, 0, 0], [0, 0, 0], [129, 129, 129], [59, ..."
2,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,train/XR_SHOULDER/patient00001/study1_positive...,1,"[[[59, 59, 59], [45, 45, 45], [41, 41, 41], [4..."
3,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,train/XR_SHOULDER/patient00002/study1_positive...,1,"[[[58, 58, 58], [57, 57, 57], [57, 57, 57], [5..."
4,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,train/XR_SHOULDER/patient00002/study1_positive...,1,"[[[0, 0, 0], [0, 0, 0], [40, 40, 40], [54, 54,..."


In [11]:
# Load every validation image resized to 64x64, without normalization, and
# store the resulting array next to its row.
valid_img_path['img_vector'] = valid_img_path['Path2Img'].apply(
    lambda img_path: read_image(img_path, rescale=(64, 64), normalize=False)
)
valid_img_path.head()

Unnamed: 0,0,Path2Img,label,img_vector
0,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
1,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
2,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
3,MURA-v1.1/valid/XR_WRIST/patient11185/study1_p...,valid/XR_WRIST/patient11185/study1_positive/im...,1,"[[[147, 147, 147], [50, 50, 50], [33, 33, 33],..."
4,MURA-v1.1/valid/XR_WRIST/patient11186/study1_p...,valid/XR_WRIST/patient11186/study1_positive/im...,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."


## 4. Other info

In [12]:
### Prepare other labels that may be helpful
# Column 0 (the unstripped path) is redundant next to Path2Img, so remove it
# from both tables in place.
for frame in (train_img_path, valid_img_path):
    frame.drop(labels=0, axis=1, inplace=True)


def human_part(path):
    """
    Return the body-part name encoded in an image path.

    Takes the second "/"-separated component of the path and returns its
    second "_"-separated piece, e.g. "train/XR_SHOULDER/..." -> "SHOULDER".
    """
    folder = path.split('/')[1]
    pieces = folder.split("_")
    return pieces[1]

def patient_number(path):
    """
    Return the patient number encoded in an image path.

    Reads the last five characters of the third "/"-separated component,
    e.g. "train/XR_SHOULDER/patient00001/..." -> 1.

    Returns:
    The number as an int, or np.nan when those characters are not a number.
    """
    expected_output = path.split('/')[2][-5:]
    try:
        # Built-in int replaces np.int, an alias that newer NumPy releases
        # no longer provide.
        return(int(expected_output))
    except ValueError:
        # int() reports unparsable text with ValueError (never IOError).
        return(np.nan)

def study_number(path):
    """
    Return the study number encoded in an image path.

    Reads the fourth "/"-separated component, keeps the part before its first
    underscore and parses the digits at its end,
    e.g. ".../study1_positive/..." -> 1 and ".../study12_negative/..." -> 12.

    Returns:
    The number as an int, or np.nan when the study folder name does not end
    in digits.
    """
    study_dir = path.split('/')[3].split("_")[0]
    # Keep the whole run of trailing digits; taking only the last character
    # would turn "study12" into 2.
    digits = study_dir[len(study_dir.rstrip('0123456789')):]
    try:
        # Built-in int replaces np.int, an alias that newer NumPy releases
        # no longer provide.
        return(int(digits))
    except ValueError:
        # int() reports unparsable (here: empty) text with ValueError.
        return(np.nan)

# Derive the body part, patient number and study number from Path2Img for both
# tables, then drop Path2Img since nothing else is read from it afterwards.
for frame in (train_img_path, valid_img_path):
    frame['human_part'] = frame['Path2Img'].apply(human_part)
    frame['patient_number'] = frame['Path2Img'].apply(patient_number)
    frame['study_number'] = frame['Path2Img'].apply(study_number)
    frame.drop(labels='Path2Img', axis=1, inplace=True)

In [13]:
# Show the training table after the metadata columns were added and the path
# columns were dropped (first rows).
train_img_path.head()

Unnamed: 0,label,img_vector,human_part,patient_number,study_number
0,1,"[[[6, 6, 6], [7, 7, 7], [8, 8, 8], [8, 8, 8], ...",SHOULDER,1,1
1,1,"[[[0, 0, 0], [0, 0, 0], [129, 129, 129], [59, ...",SHOULDER,1,1
2,1,"[[[59, 59, 59], [45, 45, 45], [41, 41, 41], [4...",SHOULDER,1,1
3,1,"[[[58, 58, 58], [57, 57, 57], [57, 57, 57], [5...",SHOULDER,2,1
4,1,"[[[0, 0, 0], [0, 0, 0], [40, 40, 40], [54, 54,...",SHOULDER,2,1


In [14]:
# Show the validation table after the metadata columns were added and the path
# columns were dropped (last rows).
valid_img_path.tail()

Unnamed: 0,label,img_vector,human_part,patient_number,study_number
3192,0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",FINGER,11967,1
3193,0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",FINGER,11967,1
3194,0,"[[[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], ...",FINGER,11738,1
3195,0,"[[[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], ...",FINGER,11738,1
3196,0,"[[[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], ...",FINGER,11738,1


In [15]:
def array_to_str(array):
    """
    Serialize a numpy.ndarray as one space-separated string.

    Arguments:
    array -- an array of any shape

    Outputs:
    string -- the elements of the flattened array, each converted with str()
              and joined by single spaces
    """
    flat = array.reshape(-1) # collapse all dimensions into one
    return " ".join(map(str, flat))

In [16]:
# Store each training image as a space-separated string of its pixel values.
train_img_path['str_vector'] = train_img_path['img_vector'].apply(array_to_str)
train_img_path.head()

Unnamed: 0,label,img_vector,human_part,patient_number,study_number,str_vector
0,1,"[[[6, 6, 6], [7, 7, 7], [8, 8, 8], [8, 8, 8], ...",SHOULDER,1,1,6 6 6 7 7 7 8 8 8 8 8 8 8 8 8 7 7 7 9 9 9 7 7 ...
1,1,"[[[0, 0, 0], [0, 0, 0], [129, 129, 129], [59, ...",SHOULDER,1,1,0 0 0 0 0 0 129 129 129 59 59 59 1 1 1 5 5 5 4...
2,1,"[[[59, 59, 59], [45, 45, 45], [41, 41, 41], [4...",SHOULDER,1,1,59 59 59 45 45 45 41 41 41 42 42 42 41 41 41 4...
3,1,"[[[58, 58, 58], [57, 57, 57], [57, 57, 57], [5...",SHOULDER,2,1,58 58 58 57 57 57 57 57 57 57 57 57 57 57 57 5...
4,1,"[[[0, 0, 0], [0, 0, 0], [40, 40, 40], [54, 54,...",SHOULDER,2,1,0 0 0 0 0 0 40 40 40 54 54 54 53 53 53 52 52 5...


In [17]:
# Store each validation image as a space-separated string of its pixel values.
valid_img_path['str_vector'] = valid_img_path['img_vector'].apply(array_to_str)
valid_img_path.head()

Unnamed: 0,label,img_vector,human_part,patient_number,study_number,str_vector
0,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",WRIST,11185,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",WRIST,11185,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",WRIST,11185,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 96 96 96 6...
3,1,"[[[147, 147, 147], [50, 50, 50], [33, 33, 33],...",WRIST,11185,1,147 147 147 50 50 50 33 33 33 29 29 29 31 31 3...
4,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",WRIST,11186,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


## 5. Obtain the whole dataset

In [18]:
# concat the train and valid data
# data = pd.concat([train_img_path, valid_img_path], ignore_index=True)

In [19]:
# data.info()

In [20]:
# drop the un-needed columns
# The pixel data stays available in the str_vector column, so the array column
# is removed from both tables in place.
for frame in (train_img_path, valid_img_path):
    frame.drop(labels='img_vector', axis=1, inplace=True)

In [21]:
# save the data
# Note that the validation table is written to 'test.csv'.
for frame, csv_name in ((train_img_path, 'train.csv'), (valid_img_path, 'test.csv')):
    frame.to_csv(csv_name)

In [22]:
# split the data
# elbow = data[data['human_part'] == 'ELBOW']
# finger = data[data['human_part'] == 'FINGER']
# forearm = data[data['human_part'] == 'FOREARM']
# hand = data[data['human_part'] == 'HAND']
# humerus = data[data['human_part'] == 'HUMERUS']
# shoulder = data[data['human_part'] == 'SHOULDER']
# wrist = data[data['human_part'] == 'WRIST']

## <center>$\mathcal{FIN}$</center>