In [None]:
import os
from getpass import getpass

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Sequential



In [None]:
# Extracting the data directly from Kaggle.
# SECURITY: API credentials must never be written into the notebook. Any key that was
# ever committed in this cell should be treated as compromised and revoked/regenerated
# from the Kaggle account settings.
# Credentials already present in the environment are reused; otherwise they are asked
# for interactively (the key is read with getpass so it is not echoed).
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ')
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ')

In [None]:
# Download the COVID-19 Radiography Database archive with the Kaggle CLI
# (the credentials come from the KAGGLE_USERNAME / KAGGLE_KEY environment variables set above).
!kaggle datasets download tawsifurrahman/covid19-radiography-database

In [None]:
# Now we've got the zip folder from the above cell so now we've to unzip it
!unzip covid19-radiography-database.zip

In [None]:
# Load the metadata spreadsheet that ships with the Normal class.
df_n = pd.read_excel('/content/COVID-19_Radiography_Dataset/Normal.metadata.xlsx')

In [None]:
# Preview the first rows of the Normal-class metadata.
# NOTE(review): the original note here spoke of image paths for COVID, but df_n is read
# from Normal.metadata.xlsx - confirm which columns are actually of interest.
df_n.head()

In [None]:
# For now we work only with the COVID and Normal folders;
# the other folders in the dataset could be added later.

In [None]:
# Number of entries directly inside the COVID folder (two sub-folders, per the original note).
len(os.listdir('/content/COVID-19_Radiography_Dataset/COVID/'))

In [None]:
# Number of image files available for the COVID class.
len(os.listdir('/content/COVID-19_Radiography_Dataset/COVID/images/'))

In [None]:
# Number of entries directly inside the Normal folder (two sub-folders, per the original note).
len(os.listdir('/content/COVID-19_Radiography_Dataset/Normal/'))

In [None]:
# Number of image files available for the Normal class.
len(os.listdir('/content/COVID-19_Radiography_Dataset/Normal/images/'))

In [None]:

# Count the image files available for each class.
normal_count = len(os.listdir('/content/COVID-19_Radiography_Dataset/Normal/images/'))
covid_count = len(os.listdir('/content/COVID-19_Radiography_Dataset/COVID/images/'))

# One row per class. The intermediate dict gets its own name so that `data` is not used
# for a dict here and for the image array later in the notebook.
class_counts = {'Class': ['Normal', 'Covid'], 'Count': [normal_count, covid_count]}
df = pd.DataFrame(class_counts)

# Bar chart of the class sizes, drawn on an explicit Axes object.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='Class', y='Count', data=df, palette=['#77C3EC', 'red'], ax=ax)

ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Dataset Balance')
plt.show()


In [None]:
# The chart above shows a problem: the training data is not balanced between the classes.
# A model trained on an imbalanced dataset is usually biased, so its accuracy can be low or misleading.

In [None]:
# Read one sample image with OpenCV to inspect it.
img = cv2.imread('/content/COVID-19_Radiography_Dataset/Normal/images/Normal-10074.png')
# cv2.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later with an obscure one.
if img is None:
    raise FileNotFoundError('Could not read the sample image Normal-10074.png')
sns.set(style="white")
plt.axis('off')
# OpenCV stores the channels as BGR while matplotlib expects RGB: convert a copy for
# display only, leaving `img` itself in BGR order for the following cells.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB));

In [None]:
# Basic properties of the sample image
print('Image Shape: {}'.format(img.shape))
print('Image Height: {}'.format(img.shape[0]))
print('Image Width: {}'.format(img.shape[1]))
print('Image Dimension: {}'.format(img.ndim))
# nbytes is the in-memory size in bytes; `size` only counts elements and matches the
# byte count only for 1-byte dtypes such as uint8.
print('Image Size: {}kb'.format(img.nbytes//1024))
print('Image Data Type: {}'.format(img.dtype))
print('Maximum RGB value of the image: {}'.format(img.max()))
print('Minimum RGB value of the image: {}'.format(img.min()))
# According to the dataset description on Kaggle, every image is 299*299 pixels,
# so no resizing is needed to make the images uniform.

In [None]:
# The image has 3 channels even though it is an X-ray. OpenCV orders them B, G, R,
# so index 0 on the last axis is the blue channel.
plt.imshow(img[..., 0])
plt.title('B channel', fontsize=14)
plt.axis('off')
plt.show()

In [None]:
def loadImages(path, urls, target, size=(100, 100)):
  """Load, resize and scale every image named in `urls` from directory `path`.

  Args:
    path: Directory that contains the image files.
    urls: File names (relative to `path`) of the images to load.
    target: Class label recorded once for every loaded image.
    size: Output size as (width, height), the order `cv2.resize` expects.
      Defaults to (100, 100).

  Returns:
    A tuple `(images, labels)`. `images` is a float32 array of shape
    `(len(urls), height, width, 3)` with pixel values scaled to [0, 1];
    `labels` is a list containing `target` once per image.

  Raises:
    ValueError: If `cv2.imread` cannot read one of the files.
  """
  width, height = size
  # Preallocate the result as float32: this avoids holding a Python list of arrays plus
  # a second full copy, and uses half the memory of the float64 produced by `/ 255.0`.
  images = np.empty((len(urls), height, width, 3), dtype=np.float32)
  for i, url in enumerate(urls):
    img_path = os.path.join(path, url)
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
      raise ValueError("Could not read image: {}".format(img_path))
    # Resize while the image is still small uint8 data, then scale to [0, 1];
    # the assignment casts the scaled values to float32.
    img = cv2.resize(img, (width, height))
    images[i] = img / 255.0
  labels = [target] * len(urls)
  return images, labels

In [None]:
covid_path = "COVID-19_Radiography_Dataset/COVID/images"
# os.listdir returns entries in arbitrary order; sorting makes the sample order
# identical on every run.
covidUrl = sorted(os.listdir(covid_path))
covidImages, covidTargets = loadImages(covid_path, covidUrl, 1)  # label 1 = COVID (positive class)

In [None]:
len(covidUrl), len(covidImages)

In [None]:
normal_path = "COVID-19_Radiography_Dataset/Normal/images"
# os.listdir returns entries in arbitrary order; sorting makes the sample order
# identical on every run.
normal_urls = sorted(os.listdir(normal_path))
normalImages, normalTargets = loadImages(normal_path, normal_urls, 0)  # label 0 = Normal (negative class)

In [None]:
len(normal_urls), len(normalImages)

In [None]:
# loadImages already returns the images as a NumPy array, so no extra cast is needed here.
# An array is required because a plain Python list cannot be reshaped or divided by 255 to normalize the images.

# covidImages = np.asarray(covidImages)
# normalImages = np.asarray(normalImages)

In [None]:
covidImages.shape

In [None]:
normalImages.shape

In [None]:
# Stack both classes along the first (sample) axis: COVID samples first, then Normal.
data = np.concatenate((covidImages, normalImages), axis=0)
targets = np.concatenate((covidTargets, normalTargets), axis=0)

In [None]:
data.shape

In [None]:
targets.shape

In [None]:
# data = data/255.0
# Normalizing the whole array here failed: the data is too large for the available RAM
# and the runtime disconnected.
# 1st solution: resize the images (done in loadImages, which also scales the pixels by 255).
# 2nd solution: use Keras data loading, which fetches the data in batches with almost no extra code.

In [None]:
# Hold out 25% of the samples for evaluation.
# random_state fixes the shuffle so the split is reproducible across runs, and stratify
# keeps the COVID/Normal ratio the same in both subsets, which matters because the
# classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    data, targets, test_size=0.25, random_state=42, stratify=targets
)

In [None]:
# Seed Python, NumPy and TensorFlow so the initial weights are the same on every run.
tf.keras.utils.set_random_seed(42)

# Small CNN for binary classification of 100x100 images with 3 channels.
model = Sequential([
    # Explicit input declaration (preferred over passing input_shape to the first layer).
    tf.keras.Input(shape=(100, 100, 3)),
    Conv2D(32, 3, activation='relu'),
    MaxPooling2D(),                     # default pool_size, strides and padding
    Conv2D(16, 3, activation='relu'),
    MaxPooling2D(),
    Conv2D(16, 3, activation='relu'),
    MaxPooling2D(),
    Flatten(),                          # flattens the feature maps into one vector for the dense layers
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')      # single output squashed to (0, 1)
])
# Conv2D(filters, kernel_size, activation=None, ...):
#   - relu is the usual activation choice: it sets the negative responses of a filter to zero
#   - padding is 'valid' (no padding) by default


In [None]:
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (the network ends in a
# single sigmoid unit) and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs in mini-batches of 32 and keep the returned History object so the
# per-epoch loss/accuracy can be inspected or plotted afterwards.
# NOTE(review): the test split doubles as validation data here, so no fully held-out set
# remains for a final evaluation.
history = model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=5,
    validation_data=(x_test, y_test),
)