# Detecting pneumonia

Data from Kaggle: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia

Notes:
- No Normal images in RGB format. Could be linked to pneumonia detection?
- RGB images seem to be smaller in height and width.
- Pneumonia images have "bacteria" or "virus" in the filename. Created label for better classification.
- Normal images seem to have 580 origional images. Data Augmentation?

In [1]:
#Imports
import numpy as np
import pandas as pd
import os
import shutil
import glob
import itertools
import random
import matplotlib.pyplot as plt
import warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix
from PIL import Image

In [2]:
#Preferences
warnings.simplefilter(action="ignore", category=FutureWarning)
%matplotlib inline

### Data preparation

In [3]:
#Path variables
train_path = "data/train"
val_path = "data/val"
test_path = "data/test"

#### Checking image size:

In [4]:
image_dir_list = ["data/train/NORMAL/", "data/train/PNEUMONIA/"]

In [5]:
image_files = []
image_sizes = []
image_info = []
norm_files = []
norm_info = []
p_files = []
p_info = []
status = ""

In [6]:
for image_dir in image_dir_list:
    for file in os.listdir(image_dir):
        if file.endswith(".jpeg"):
            image_files.append(f"{image_dir + file}")

In [7]:
for file in os.listdir(image_dir_list[0]):
    if file.endswith(".jpeg"):
        norm_files.append(f"{image_dir_list[0] + file}")

In [8]:
for file in os.listdir(image_dir_list[1]):
    if file.endswith(".jpeg"):
        p_files.append(f"{image_dir_list[1] + file}")

In [9]:
for image_file in image_files:
    img = Image.open(image_file)
    if (image_file.find("virus") == (-1)) and (image_file.find("bacteria") == (-1)):
        status = "normal"
    elif image_file.find("virus") != (-1):
        status = "virus"
    elif image_file.find("bacteria") != (-1):
        status = "bacteria"
    image_sizes.append(img.size)
    temp_dict = {"file":image_file,
                 "height": img.size[0],
                 "width": img.size[1],
                 "mode": img.mode,
                 "issue": status}
    image_info.append(temp_dict)
    img.close()

In [10]:
for image_file in norm_files:
    img = Image.open(image_file)
    if (image_file.find("virus") == (-1)) and (image_file.find("bacteria") == (-1)):
        status = "normal"
    else:
        status = "unknown"
    image_sizes.append(img.size)
    temp_dict = {"file":image_file,
                 "height": img.size[0],
                 "width": img.size[1],
                 "mode": img.mode,
                 "status": status}
    norm_info.append(temp_dict)
    img.close()

In [11]:
for image_file in p_files:
    img = Image.open(image_file)
    if image_file.find("virus") != (-1):
        status = "virus"
    elif image_file.find("bacteria") != (-1):
        status = "bacteria"
    image_sizes.append(img.size)
    temp_dict = {"file":image_file,
                 "height": img.size[0],
                 "width": img.size[1],
                 "mode": img.mode,
                 "issue": status}
    p_info.append(temp_dict)
    img.close()

In [12]:
df = pd.DataFrame(image_info)

In [13]:
norm_df = pd.DataFrame(norm_info)

In [14]:
p_df = pd.DataFrame(p_info)

In [15]:
p_df.head()

Unnamed: 0,file,height,width,mode,issue
0,data/train/PNEUMONIA/person1213_virus_2058.jpeg,1208,656,L,virus
1,data/train/PNEUMONIA/person1180_virus_2011.jpeg,936,592,L,virus
2,data/train/PNEUMONIA/person764_bacteria_2668.jpeg,1016,736,L,bacteria
3,data/train/PNEUMONIA/person558_bacteria_2328.jpeg,1280,905,L,bacteria
4,data/train/PNEUMONIA/person837_virus_1475.jpeg,1656,1320,L,virus


In [16]:
norm_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,1341.0,1667.734526,289.210512,912.0,1466.0,1640.0,1824.0,2916.0
width,1341.0,1381.431022,326.320734,672.0,1152.0,1328.0,1542.0,2663.0


In [17]:
p_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,3875.0,1200.483613,291.305676,384.0,1000.0,1168.0,1368.0,2772.0
width,3875.0,825.026839,277.073758,127.0,640.0,776.0,968.0,2304.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5216 entries, 0 to 5215
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    5216 non-null   object
 1   height  5216 non-null   int64 
 2   width   5216 non-null   int64 
 3   mode    5216 non-null   object
 4   issue   5216 non-null   object
dtypes: int64(2), object(3)
memory usage: 203.9+ KB


In [19]:
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    1341 non-null   object
 1   height  1341 non-null   int64 
 2   width   1341 non-null   int64 
 3   mode    1341 non-null   object
 4   status  1341 non-null   object
dtypes: int64(2), object(3)
memory usage: 52.5+ KB


In [20]:
p_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3875 entries, 0 to 3874
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    3875 non-null   object
 1   height  3875 non-null   int64 
 2   width   3875 non-null   int64 
 3   mode    3875 non-null   object
 4   issue   3875 non-null   object
dtypes: int64(2), object(3)
memory usage: 151.5+ KB


In [21]:
df[df["mode"] == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,283.0,735.519435,187.939335,384.0,553.5,759.0,863.5,1287.0
width,283.0,434.24735,158.876515,127.0,284.0,467.0,553.5,720.0


In [22]:
df[df["mode"] == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,4933.0,1354.176769,332.702747,664.0,1088.0,1310.0,1568.0,2916.0
width,4933.0,998.699777,364.738819,360.0,720.0,920.0,1208.0,2663.0


In [23]:
norm_df[norm_df["mode"] == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,0.0,,,,,,,
width,0.0,,,,,,,


In [24]:
norm_df[norm_df["mode"] == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,1341.0,1667.734526,289.210512,912.0,1466.0,1640.0,1824.0,2916.0
width,1341.0,1381.431022,326.320734,672.0,1152.0,1328.0,1542.0,2663.0


In [25]:
p_df[p_df["mode"] == "RGB"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,283.0,735.519435,187.939335,384.0,553.5,759.0,863.5,1287.0
width,283.0,434.24735,158.876515,127.0,284.0,467.0,553.5,720.0


In [26]:
p_df[p_df["mode"] == "L"].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,3592.0,1237.11637,265.316451,664.0,1032.0,1192.0,1392.0,2772.0
width,3592.0,855.814866,260.489702,360.0,672.0,800.0,992.0,2304.0


In [27]:
df[df["mode"] == "RGB"].head(15)

Unnamed: 0,file,height,width,mode,issue
1372,data/train/PNEUMONIA/person1719_bacteria_4542....,481,234,RGB,bacteria
1378,data/train/PNEUMONIA/person1642_bacteria_4352....,724,286,RGB,bacteria
1418,data/train/PNEUMONIA/person1751_bacteria_4592....,462,241,RGB,bacteria
1428,data/train/PNEUMONIA/person466_bacteria_1984.jpeg,650,378,RGB,bacteria
1429,data/train/PNEUMONIA/person746_virus_1369.jpeg,895,516,RGB,virus
1436,data/train/PNEUMONIA/person913_bacteria_2838.jpeg,743,453,RGB,bacteria
1452,data/train/PNEUMONIA/person1700_bacteria_4502....,550,276,RGB,bacteria
1456,data/train/PNEUMONIA/person636_bacteria_2527.jpeg,689,366,RGB,bacteria
1469,data/train/PNEUMONIA/person965_virus_1638.jpeg,870,591,RGB,virus
1484,data/train/PNEUMONIA/person977_virus_1652.jpeg,768,552,RGB,virus


In [28]:
p_df["issue"].value_counts()

bacteria    2530
virus       1345
Name: issue, dtype: int64

In [29]:
norm_df[~norm_df["file"].str.endswith("0001.jpeg")]

Unnamed: 0,file,height,width,mode,status
34,data/train/NORMAL/IM-0555-0001-0002.jpeg,1446,1171,L,normal
74,data/train/NORMAL/NORMAL2-IM-1094-0001-0002.jpeg,1674,1337,L,normal
96,data/train/NORMAL/IM-0551-0001-0002.jpeg,1624,1341,L,normal
106,data/train/NORMAL/NORMAL2-IM-1345-0001-0002.jpeg,1294,959,L,normal
113,data/train/NORMAL/NORMAL2-IM-0971-0001-0002.jpeg,1554,1353,L,normal
...,...,...,...,...,...
1280,data/train/NORMAL/IM-0491-0001-0002.jpeg,1604,1414,L,normal
1296,data/train/NORMAL/IM-0539-0001-0002.jpeg,1526,1195,L,normal
1315,data/train/NORMAL/IM-0429-0001-0002.jpeg,1858,1380,L,normal
1329,data/train/NORMAL/IM-0620-0001-0002.jpeg,1512,1302,L,normal


In [30]:
norm_df["stripped_filename"] = norm_df["file"].str.replace("-0002","").str.replace("-0001", "")

In [31]:
norm_df["stripped_filename"] = norm_df["stripped_filename"].str[18:]

In [32]:
norm_df["stripped_filename"] = norm_df["stripped_filename"].str.replace("^NORMAL2","").str.replace("^NORMAL","")

In [33]:
norm_df["stripped_filename"] = norm_df["stripped_filename"].str.replace("^-","")

In [34]:
duplicates = norm_df[norm_df["stripped_filename"].duplicated(keep=False)]

In [35]:
duplicates.head()

Unnamed: 0,file,height,width,mode,status,stripped_filename
0,data/train/NORMAL/NORMAL2-IM-0388-0001.jpeg,1388,1009,L,normal,IM-0388.jpeg
4,data/train/NORMAL/IM-0563-0001.jpeg,1302,970,L,normal,IM-0563.jpeg
14,data/train/NORMAL/NORMAL2-IM-0520-0001.jpeg,1554,1402,L,normal,IM-0520.jpeg
17,data/train/NORMAL/NORMAL2-IM-0474-0001.jpeg,1728,1619,L,normal,IM-0474.jpeg
20,data/train/NORMAL/IM-0476-0001.jpeg,1558,1048,L,normal,IM-0476.jpeg


In [36]:
duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   file               580 non-null    object
 1   height             580 non-null    int64 
 2   width              580 non-null    int64 
 3   mode               580 non-null    object
 4   status             580 non-null    object
 5   stripped_filename  580 non-null    object
dtypes: int64(2), object(4)
memory usage: 31.7+ KB
