### **Classification of CT scan images as Covid-19-positive or healthy**

Installations and imports

In [None]:
#installation of pillow for image resizing 
!pip install --upgrade pip
!pip install --upgrade Pillow

In [None]:
##USEFUL LIBRARIES
import numpy as np
import scipy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import image
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import glob
import os
from PIL import Image
import zipfile

Mounting Google Drive to access the data

In [None]:
##MOUNTING GOOGLE DRIVE TO THIS COLAB NOTEBOOK
# Use the public drive.mount() entry point; the underscore-prefixed
# drive._mount is a private helper and not part of the supported API.
from google.colab import drive
drive.mount('/content/drive')

Importing images and converting them into 1-D pixel vectors (only the first colour channel of each image is kept)

In [None]:
##UNZIPPING THE DATA
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile("/content/drive/MyDrive/HCML project/archive.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

In [None]:
##CONVERTS A PNG INTO AN RGB MATRIX
def image_to_matrix(image_file, grays=False):
    """Read an image file and return its pixel values as a NumPy array.

    The file is read with ``image.imread`` (matplotlib's image module).

    Parameters
    ----------
    image_file : path or file-like
        Passed straight to ``image.imread``.
    grays : bool, default False
        When True, a multi-channel image is reduced to its first channel
        (no luminance weighting is applied).

    Returns
    -------
    numpy.ndarray
        * A 3-D image with more than three channels is cut down to its first
          three channels (float64 copy).
        * With ``grays=True`` a 3-D image becomes a 2-D float64 array holding
          channel 0.
        * In any 2-D result, pixels that are exactly 0 are replaced by 1e-7
          (presumably so later processing never sees an exact zero -- TODO
          confirm the intent).
        * A 3-D image with three or fewer channels and ``grays=False`` is
          returned exactly as read.
    """
    img = image.imread(image_file)

    # Drop any channels beyond the first three (e.g. an alpha channel).
    if img.ndim == 3 and img.shape[2] > 3:
        img = img[:, :, :3].astype(np.float64)

    # Grayscale request: keep only the first channel.
    if grays and img.ndim == 3:
        img = img[:, :, 0].astype(np.float64)

    if img.ndim == 2:
        # An integer image cannot hold 1e-7; promote it to float first.
        if not np.issubdtype(img.dtype, np.floating):
            img = img.astype(np.float64)
        # Boolean mask touches only the zero-valued pixels. Indexing with the
        # row indices from np.where(...)[0] would shift every pixel of any row
        # that contains a zero.
        img[img == 0] += 1e-7
    return img

In [None]:
def flatten(image_matrix):
    """Flatten an image into a 1-D float64 vector of first-channel values.

    Parameters
    ----------
    image_matrix : array-like
        Either a 2-D (height, width) or a 3-D (height, width, depth) image.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length height * width in row-major order.
        For a 3-D input only channel 0 of each pixel is kept; the remaining
        channels are discarded.

    Raises
    ------
    ValueError
        If the input is neither 2-D nor 3-D.
    """
    pixels = np.asarray(image_matrix)
    if pixels.ndim == 3:
        # Only the first channel contributes to the output vector.
        pixels = pixels[:, :, 0]
    elif pixels.ndim != 2:
        raise ValueError(
            "flatten expects a 2-D or 3-D image, got %d dimension(s)" % pixels.ndim
        )
    # reshape(-1) walks the pixels row by row, i.e. index = row * width + col.
    return pixels.astype(np.float64).reshape(-1)

In [None]:
#Resize image function
def resize_image(image_file, width, height):
  """Resize an image file to (width, height) and overwrite it on disk.

  The file at ``image_file`` is replaced by the resized version, so the
  original resolution is lost after the first call.

  Parameters:
    image_file: path of the image to resize; also the path written to.
    width: target width passed to ``Image.resize``.
    height: target height passed to ``Image.resize``.

  Returns:
    The same ``image_file`` path that was passed in.
  """
  # Close the source file handle as soon as the pixel data has been resized;
  # the local name also no longer shadows the matplotlib `image` module.
  with Image.open(image_file) as source_image:
    resized_image = source_image.resize((width, height))
  resized_image.save(image_file)
  return image_file

In [None]:
#reading in CT images and converting them to RGB matrices
#########################################################
from PIL import Image

DATASET_ROOT = '/content/dataset/New_Data_CoV2'
IMAGE_WIDTH = 280
IMAGE_HEIGHT = 200


def load_folder_images(folders, labels, width=IMAGE_WIDTH, height=IMAGE_HEIGHT):
  """Load every PNG found in the given patient folders as flattened vectors.

  Each PNG is resized on disk via resize_image, read with image_to_matrix and
  flattened with flatten. Every image receives the label of its folder.

  Parameters:
    folders: sequence of patient folder paths.
    labels: sequence of labels, one per folder, in the same order.
    width, height: size each image is resized to before conversion.

  Returns:
    (matrices, targets): two NumPy arrays with one entry per image.
  """
  matrices = []
  targets = []
  for folder_filepath, category in zip(folders, labels):
    # glob returns files in arbitrary order; sort for a stable row order.
    for png_filepath in sorted(glob.glob(os.path.join(folder_filepath, '*.png'))):
      img_filepath = resize_image(png_filepath, width, height)
      matrices.append(flatten(image_to_matrix(img_filepath)))
      targets.append(category)
    print("\tAdding images folder:", folder_filepath)
  return np.array(matrices), np.array(targets)


#getting patient folders (both covid and healthy)
# Folders are sorted so that the seeded train/test split below is the same on
# every run, independent of the order the filesystem lists them in.
patient_folders = []
target=[]
for filepath in sorted(glob.glob(os.path.join(DATASET_ROOT, 'Covid', '*'))):
  patient_folders.append(filepath)
  target.append("Covid")

healthy_patient_folders = []  # not used in this cell; kept in case other cells reference it
for filepath in sorted(glob.glob(os.path.join(DATASET_ROOT, 'Healthy', '*'))):
  patient_folders.append(filepath)
  target.append("Healthy")

if len(patient_folders) == 0:
  raise FileNotFoundError("No patient folders found under " + DATASET_ROOT)

#convert folder lists to arrays
patient_folders = np.array(patient_folders)

#split the folders in training/testing sets
# The split is done per patient folder (not per image), so all images in one
# folder end up on the same side of the split.
folder_train, folder_test, y_train, y_test = train_test_split(patient_folders, target, test_size = 0.2, random_state = 0)

#Convert training images to matrices and add them to a list of them
print("Beginning conversion of training images to matrices")
train_image_matrix, train_target = load_folder_images(folder_train, y_train)

#Convert testing images to matrices and add them to a list of them
print("Beginning conversion of testing images to matrices")
test_image_matrix, test_target = load_folder_images(folder_test, y_test)

print("Conversion of images to matrices is complete")

#print that conversion process is complete and size of the training and testing image matrices
print("Training images matrix dimensions:", train_image_matrix.shape)
print("Testing images matrix dimensions:", test_image_matrix.shape)

print("Training images target matrix dimensions:", train_target.shape)
print("Testing image target matrix dimensions:", test_target.shape)

### ***ML MODELS***

Single Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
import graphviz
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [None]:
#tuning parameters of decision tree by trying different combos of max_depth and max_features
############################################################################################
# NOTE(review): configurations are compared on the test set, so the reported
# "best" accuracy is an optimistic estimate; consider a validation split.

depths = [3,5,7,9]
feature_amts = [10,50,100,500,1000,2000]

best_accuracy = 0
best_depth = 0
best_feature_amt = 0

for depth in depths:
  for feature_amt in feature_amts:

    #create decision tree
    print("Decision tree where max depth =", depth, "and max features =", feature_amt)
    # random_state fixes the random feature subsampling done by max_features,
    # so the search gives the same result on every run.
    dt = DecisionTreeClassifier(max_depth=depth, max_features=feature_amt, random_state=0)

    #train decision tree
    dt.fit(train_image_matrix, train_target)

    #create predictions using decision tree
    y_pred = dt.predict(test_image_matrix)

    #accuracy (computed once and reused)
    accuracy = accuracy_score(test_target, y_pred)
    print("\tAccuracy =", accuracy)
    if accuracy > best_accuracy:
      best_accuracy = accuracy
      best_depth = depth
      best_feature_amt = feature_amt

print("\nBest Accuracy =", best_accuracy)
print("\tMax Depth =", best_depth)
print("\tMax features =", best_feature_amt)

In [None]:
#create decision tree using best found configuration of parameters (aka the optimal decision tree)
##################################################################################################

#create decision tree
# max_depth / max_features are the hard-coded values chosen by the author;
# random_state makes the fitted tree reproducible.
dt = DecisionTreeClassifier(max_depth=5, max_features=2000, random_state=0)

#train decision tree
# Use the per-image arrays: y_train / y_test hold one label per patient
# folder, not one per image, so they do not line up with the image matrices.
dt.fit(train_image_matrix, train_target)

#create predictions using decision tree
y_pred = dt.predict(test_image_matrix)

### Performance metrics ###
#confusion matrix
print(confusion_matrix(test_target, y_pred))

#accuracy
acc = accuracy_score(test_target, y_pred)
print("Accuracy of optimal decision tree =", acc)

In [None]:
#save optimal decision tree to a .png file
##########################################

export_graphviz(dt, out_file="mytree.dot")
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

#convert .dot to .png and save it
!dot mytree.dot -Tpng -o NewDecisionTree.png

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#determine what max depth for RF trees achieves best accuracy
#############################################################
# NOTE(review): the original referenced `image_matrices` / `image_targets`,
# which are not defined anywhere in this notebook. The training arrays are used
# here so the held-out test set stays untouched -- confirm this is the intent.
# NOTE(review): folds are built per image, so images from one patient folder
# can land in different folds; a group-aware splitter may be more appropriate.

best_acc = 0
best_depth = 0
depths = [1,2,3,4,5,6,7,8,9]
accuracies = []

for depth in depths:

  #create RF
  print("\nTraining random forest with max depth of",depth)
  rf = RandomForestClassifier(max_depth=depth, random_state=0)

  #5-fold cross-validation of the random forest
  scores = cross_val_score(rf, train_image_matrix, train_target, cv=5)

  #accuracy (mean over the folds, computed once and reused)
  mean_accuracy = scores.mean()
  print("\tAccuracy =", mean_accuracy)
  accuracies.append(mean_accuracy)

  if mean_accuracy > best_acc:
    best_acc = mean_accuracy
    best_depth = depth

print("\nBest accuracy achieved is", best_acc, "using a max depth of", best_depth)

In [None]:
#plot accuracies over max depth
###############################

# One point per max depth 1..9, in the order the accuracies were collected.
x_vals = range(1,10)
fig, ax = plt.subplots()
ax.plot(x_vals, accuracies)
ax.set_title("Accuracies of Random Forests with Varying Max Depths")
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy")