# Detecting pneumonia

Data from Kaggle: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia

Notes:
- No Normal images in RGB format. Could be linked to pneumonia detection?
- RGB images seem to be smaller in height and width.
- Pneumonia images have "bacteria" or "virus" in the filename.
- Need to investigate for duplicates.

In [1]:
#Imports
import numpy as np
import pandas as pd
import os
import shutil
import glob
import itertools
import random
import matplotlib.pyplot as plt
import warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix
from PIL import Image

In [2]:
#Preferences
warnings.simplefilter(action="ignore", category=FutureWarning)
%matplotlib inline

### Data preparation

In [3]:
#Path variables
train_path = "data/train"
val_path = "data/val"
test_path = "data/test"

#### Checking image size:

In [4]:
image_dir_list = ["data/train/NORMAL/", "data/train/PNEUMONIA/"]

In [5]:
image_files = []
image_sizes = []
image_info = []
norm_files = []
norm_info = []
p_files = []
p_info = []

In [6]:
for image_dir in image_dir_list:
    for file in os.listdir(image_dir):
        if file.endswith(".jpeg"):
            image_files.append(f"{image_dir + file}")

In [7]:
for file in os.listdir(image_dir_list[0]):
    if file.endswith(".jpeg"):
        norm_files.append(f"{image_dir_list[0] + file}")

In [8]:
for file in os.listdir(image_dir_list[1]):
    if file.endswith(".jpeg"):
        p_files.append(f"{image_dir_list[1] + file}")

In [9]:
for image_file in image_files:
    img = Image.open(image_file)
    image_sizes.append(img.size)
    temp_dict = {"File":image_file,
                  "Height": img.size[0],
                  "Width": img.size[1],
                  "Mode": img.mode}
    image_info.append(temp_dict)
    img.close()

In [10]:
for image_file in norm_files:
    img = Image.open(image_file)
    image_sizes.append(img.size)
    temp_dict = {"File":image_file,
                  "Height": img.size[0],
                  "Width": img.size[1],
                  "Mode": img.mode}
    norm_info.append(temp_dict)
    img.close()

In [11]:
for image_file in p_files:
    img = Image.open(image_file)
    image_sizes.append(img.size)
    temp_dict = {"File":image_file,
                  "Height": img.size[0],
                  "Width": img.size[1],
                  "Mode": img.mode}
    p_info.append(temp_dict)
    img.close()

In [12]:
df = pd.DataFrame(image_info)

In [13]:
norm_df = pd.DataFrame(norm_info)

In [14]:
p_df = pd.DataFrame(p_info)

In [15]:
norm_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,1341.0,1667.734526,289.210512,912.0,1466.0,1640.0,1824.0,2916.0
Width,1341.0,1381.431022,326.320734,672.0,1152.0,1328.0,1542.0,2663.0


In [16]:
p_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,3875.0,1200.483613,291.305676,384.0,1000.0,1168.0,1368.0,2772.0
Width,3875.0,825.026839,277.073758,127.0,640.0,776.0,968.0,2304.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5216 entries, 0 to 5215
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   File    5216 non-null   object
 1   Height  5216 non-null   int64 
 2   Width   5216 non-null   int64 
 3   Mode    5216 non-null   object
dtypes: int64(2), object(2)
memory usage: 163.1+ KB


In [18]:
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   File    1341 non-null   object
 1   Height  1341 non-null   int64 
 2   Width   1341 non-null   int64 
 3   Mode    1341 non-null   object
dtypes: int64(2), object(2)
memory usage: 42.0+ KB


In [19]:
p_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3875 entries, 0 to 3874
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   File    3875 non-null   object
 1   Height  3875 non-null   int64 
 2   Width   3875 non-null   int64 
 3   Mode    3875 non-null   object
dtypes: int64(2), object(2)
memory usage: 121.2+ KB


In [20]:
df[df.Mode == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,283.0,735.519435,187.939335,384.0,553.5,759.0,863.5,1287.0
Width,283.0,434.24735,158.876515,127.0,284.0,467.0,553.5,720.0


In [21]:
df[df.Mode == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,4933.0,1354.176769,332.702747,664.0,1088.0,1310.0,1568.0,2916.0
Width,4933.0,998.699777,364.738819,360.0,720.0,920.0,1208.0,2663.0


In [22]:
norm_df[norm_df.Mode == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,0.0,,,,,,,
Width,0.0,,,,,,,


In [23]:
norm_df[norm_df.Mode == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,1341.0,1667.734526,289.210512,912.0,1466.0,1640.0,1824.0,2916.0
Width,1341.0,1381.431022,326.320734,672.0,1152.0,1328.0,1542.0,2663.0


In [24]:
p_df[p_df.Mode == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,283.0,735.519435,187.939335,384.0,553.5,759.0,863.5,1287.0
Width,283.0,434.24735,158.876515,127.0,284.0,467.0,553.5,720.0


In [25]:
p_df[p_df.Mode == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height,3592.0,1237.11637,265.316451,664.0,1032.0,1192.0,1392.0,2772.0
Width,3592.0,855.814866,260.489702,360.0,672.0,800.0,992.0,2304.0


In [26]:
df[df.Mode == "RGB"].head(15)

Unnamed: 0,File,Height,Width,Mode
1372,data/train/PNEUMONIA/person1719_bacteria_4542....,481,234,RGB
1378,data/train/PNEUMONIA/person1642_bacteria_4352....,724,286,RGB
1418,data/train/PNEUMONIA/person1751_bacteria_4592....,462,241,RGB
1428,data/train/PNEUMONIA/person466_bacteria_1984.jpeg,650,378,RGB
1429,data/train/PNEUMONIA/person746_virus_1369.jpeg,895,516,RGB
1436,data/train/PNEUMONIA/person913_bacteria_2838.jpeg,743,453,RGB
1452,data/train/PNEUMONIA/person1700_bacteria_4502....,550,276,RGB
1456,data/train/PNEUMONIA/person636_bacteria_2527.jpeg,689,366,RGB
1469,data/train/PNEUMONIA/person965_virus_1638.jpeg,870,591,RGB
1484,data/train/PNEUMONIA/person977_virus_1652.jpeg,768,552,RGB


In [27]:
df[df.Mode == "L"].head(15)

Unnamed: 0,File,Height,Width,Mode
0,data/train/NORMAL/NORMAL2-IM-0388-0001.jpeg,1388,1009,L
1,data/train/NORMAL/NORMAL2-IM-1110-0001.jpeg,1902,1665,L
2,data/train/NORMAL/NORMAL2-IM-1039-0001.jpeg,1384,1090,L
3,data/train/NORMAL/IM-0371-0001.jpeg,2024,2036,L
4,data/train/NORMAL/IM-0563-0001.jpeg,1302,970,L
5,data/train/NORMAL/IM-0494-0001.jpeg,1692,1302,L
6,data/train/NORMAL/NORMAL2-IM-1163-0001.jpeg,1616,1283,L
7,data/train/NORMAL/IM-0178-0001.jpeg,1646,1124,L
8,data/train/NORMAL/NORMAL2-IM-0603-0001.jpeg,2234,1843,L
9,data/train/NORMAL/NORMAL2-IM-1090-0001.jpeg,1740,1541,L
