# Exploratory Data Analysis
-------

## Import all necessary packages

In [4]:
from itertools import chain
import numpy as np
import pandas as pd
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
from skimage import io
import seaborn as sns

## Examine the Dataset
Below is some code to read and view some observations in the dataset. Note that the filepaths below are relative to the
Udacity Workspace and you will need to change them to your local filepaths if you want to run this locally.
[Here](https://www.kaggle.com/nih-chest-xrays/data) is where the dataset is located.

In [5]:
# Read in the data sets: the full metadata table and the smaller sample-label table.
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
data_sample = pd.read_csv('sample_labels.csv')

# A notebook cell only renders its final bare expression, so the first preview
# must be shown explicitly. `display` is supplied by the IPython/Jupyter kernel
# this notebook already relies on (see `%matplotlib inline`); no import is needed.
display(all_xray_df.sample(3))
data_sample.sample(3)

### Explanation of Columns
Below is an explanation of all the columns present in the current dataset. Note that I do perform some data parsing in the next section
so the actual columns in the dataframe will be slightly different. Specifically `Finding Labels` will be parsed into one hot encoding for each
individual disease.

&emsp;**Image Index:** This is the filename for the image for this observation.

&emsp;**Finding Labels:** This is the disease type label for this observation. Note that there can be more than one and they are
separated by a '|' character.

&emsp;**Follow-up #:** This observation's follow-up number.

&emsp;**Patient ID:** Unique identifier for each Patient. The same patient appears more than once in this dataset.

&emsp;**Patient Age:** Age of the patient for this observation.

&emsp;**Patient Gender:** Gender of the patient for this observation.

&emsp;**View Position:** This observation's radiographic view of the image relative to the imaging subject's orientation.

&emsp;**Original Image\[Width Height]:** The original image width and height for this observation.

&emsp;**Original Image Pixel Spacing\[x y]:** All pixel spacing related attributes are encoded as the physical distance between the centers of each two-dimensional pixel,
specified by two numeric values. The first value is the row spacing in mm, that is the spacing between the centers of adjacent rows, or vertical spacing.
The second value is the column spacing in mm, that is the spacing between the centers of adjacent columns, or horizontal spacing.


### Parse `Image Index` to get correct image filepath
We need to parse the Image Index filepath to the correct location of the image files for this project.
The image files are found in the `/data/images_XXX/images` directories.

In [None]:
# Index every PNG found under /data/images_*/<subdir>/ by its bare filename so
# that each 'Image Index' value can be resolved to a full path on disk.
image_glob_pattern = os.path.join('/data', 'images_*', '*', '*.png')
all_image_paths = {}
for image_file in glob(image_glob_pattern):
    all_image_paths[os.path.basename(image_file)] = image_file

# `dict.get` returns None for filenames that were not found by the glob.
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(3)


### Parse `Finding Labels` by Disease
Below I parse the `Finding Labels` column to create a 0/1 encoding for each specific type of disease for each observation.
If the patient has the disease the column value for that disease will be 1 otherwise it will be 0.
This will make future operations more efficient.

In [None]:
# Split each pipe-delimited 'Finding Labels' string exactly once. Matching
# against the resulting token list is an exact comparison, whereas testing
# `label in raw_string` is a substring test that would also fire for any label
# that happens to be contained inside a longer label.
findings_per_image = all_xray_df['Finding Labels'].map(lambda x: x.split('|'))

# Sorted list of distinct, non-empty labels as plain `str` objects (so the
# printed list and the new column names are ordinary Python strings).
all_labels = sorted({label for label in chain.from_iterable(findings_per_image) if len(label)>0})

for label in all_labels:
    # Single-character tokens are skipped, as in the original parsing.
    if len(label)>1:
        # The lambda is consumed immediately by `.map`, so closing over the
        # loop variable is safe here.
        all_xray_df[label] = findings_per_image.map(lambda diseases: 1.0 if label in diseases else 0.0)

print(f'Number of Labels: {len(all_labels)}')
print(f'Labels: {all_labels}')

all_xray_df.sample(3)


### Demographic information
In this section I will look at patient age and gender for the entire dataset and for the population with pneumonia.
Upon inspection of the patient age I noticed an outlier of 414. Every observation with an age of 130 or more is removed from the dataset below.
As you can see below the distribution of age and the male/female split looks about the same between the two populations.

In [None]:
# Identify and remove outliers: keep only rows whose recorded age is below 130.
all_xray_df = all_xray_df.loc[all_xray_df['Patient Age'] < 130]

# Rows whose one-hot 'Pneumonia' column is set.
pneumonia_subset = all_xray_df.loc[all_xray_df['Pneumonia']==1]

# One row of axes per population: age histogram on the left, gender counts on
# the right. Explicit axes are required because `plt.hist` and
# `Series.plot(kind='bar')` otherwise all draw onto the same current axes and
# the four plots end up on top of each other.
populations = [('Dataset', all_xray_df), ('Pneumonia', pneumonia_subset)]
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
for (population_name, population_df), (age_ax, gender_ax) in zip(populations, axs):
    # Age demographics
    ages = population_df['Patient Age']
    min_age = ages.min()
    max_age = ages.max()
    mean_age = ages.mean()
    print(f'{population_name} min_age: {min_age}, max_age: {max_age}, mean: {round(mean_age,2)}')
    hist = age_ax.hist(ages, bins=20)
    age_ax.set_title(f'{population_name}: Patient Age')

    # Gender demographics
    population_df['Patient Gender'].value_counts().plot(kind='bar', ax=gender_ax)
    gender_ax.set_title(f'{population_name}: Patient Gender')

fig.tight_layout()


### Examination of X-Ray View Positions
Below we look at the different `View Position` available for X-Rays in this dataset. There are 2 different view positions.
`PA` is when the patient is standing facing the image receptor and the rays pass through their back and exit their front (Posterior -> Anterior),
`AP` is the opposite (Anterior -> Posterior). Below we look at how many of each type exist in this dataset and some examples
of each type to see if there is any noticeable difference. (I personally do not notice a difference but that is not saying much because
I am not a trained radiologist. We will treat these two categories as being the same thing unless I read or find something that
changes my mind in the future.)

In [None]:
# Count how many X-rays exist for each view position and plot the counts.
view_position_counts = all_xray_df['View Position'].value_counts()
print(view_position_counts)
view_position_counts.plot(kind='bar')

# Show three random X-rays for each view position. The file-path column is
# named 'path' (lowercase) where it is created, so indexing with 'Path' raises
# a KeyError.
for view_position in ('PA', 'AP'):
    is_position = all_xray_df['View Position']==view_position
    random_xray_paths = all_xray_df.loc[is_position].sample(3)['path']
    fig, axs = plt.subplots(1,3, figsize = (16,16))
    for path,ax in zip(random_xray_paths, axs):
        img = io.imread(path)
        ax.imshow(img, cmap='gray')
        # Label each image so the two rows of examples can be told apart.
        ax.set_title(view_position)


### Examination of Diseases (pneumonia cases, non-pneumonia cases, number of diseases per patient, intensity histograms (only on sample), change this to text explanation)

### Co-occurrence with Pneumonia

## Important Findings