<a href="https://colab.research.google.com/github/JEN6YT/APS360-Project/blob/main/Random%20Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from PIL import Image, ImageDraw
import pandas as pd
import os
import torchvision.transforms as transforms
import torch
import random
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive so the dataset paths under /content/drive used below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Get image and label


In [None]:
# read the xlsx file from Google Drive
file_path = '/content/drive/My Drive/U of T/APS360 Deep Learning/NIH-NLM-ThinBloodSmearsPf/img_path.xlsx'

# The dataframe stores one relative image path per row, i.e. 'NIH-NLM-ThinBloodSmearsPf\Polygon Set\142C38P...'
df1 = pd.read_excel(file_path)

# create the complete path for each image by prepending the Drive folder to every cell
# (vectorized string concatenation; DataFrame.applymap is deprecated since pandas 2.1)
prefix = "/content/drive/My Drive/U of T/APS360 Deep Learning/"
df1 = prefix + df1.astype(str)


In [None]:
# Preview the first rows of df1 (the prefixed image paths)
df1.head()

In [None]:
# (rows, columns) of the image-path table
df1.shape

(800, 1)

In [None]:
# read the xlsx file from Google Drive
file_path_infected = '/content/drive/My Drive/U of T/APS360 Deep Learning/NIH-NLM-ThinBloodSmearsPf/infected_RBC.xlsx'

# Per-image annotation table; presumably supplies the 'Infected RBC' count column
# that the labelling loop reads after the merge — verify against the spreadsheet
df2 = pd.read_excel(file_path_infected)

In [None]:
# Preview the first rows of the table loaded from infected_RBC.xlsx
df2.head()

In [None]:
# Combine the path table and the annotation table row by row on their index (inner join).
# NOTE(review): this assumes both spreadsheets list the images in the same row order — confirm.
df = pd.merge(df1, df2, left_index=True, right_index=True)

In [None]:
# Preview the first rows of the merged table
df.head()

In [None]:
# (rows, columns) of the merged table
df.shape

## Label, resize, save in array

In [None]:
# Build the labelled dataset: one {'image': HxWx3 array, 'label': 0 or 1} dict per row of df
data_list = []
target_size = (224, 224)

# iterate over rows and label images
for i, row in df.iterrows():
  # progress indicator: index of the row being processed
  print(i)

  # get the file path and infected RBC count
  file_path = row['File path']
  infected_rbc = row['Infected RBC']

  # label the image based on the infected RBC count: 0 = uninfected, 1 = infected.
  # A missing or negative count is rejected explicitly; otherwise the previous row's
  # label would silently be reused for this image.
  if pd.isna(infected_rbc) or infected_rbc < 0:
    raise ValueError(f"Row {i}: invalid 'Infected RBC' value {infected_rbc!r}")
  label = int(infected_rbc > 0)

  # open the image using PIL; the context manager closes the file handle afterwards.
  # convert('RGB') guarantees 3 channels so every array is target_size + (3,),
  # which the later reshape to (N, 224, 224, 3) relies on.
  with Image.open(file_path) as image:
    img_arr = np.array(image.convert('RGB').resize(target_size))

  # append the image and label to the list
  data_list.append({'image': img_arr, 'label': label})

# convert the list to a numpy object array of dicts and save it
np.save('data.npy', np.array(data_list))

In [None]:
# Inspect the first record: a dict holding the resized image array and its 0/1 label
data_list[0]

# Checking balance

In [None]:
# Tally how many samples carry each label and report the class proportions
labels = [sample['label'] for sample in data_list]
infected = labels.count(1)
uninfected = labels.count(0)
total = infected + uninfected
print(f'infected data {infected/total} and uninfected data {uninfected/total}')

## Resampling

In [None]:
# load the data from the saved numpy array
# (allow_pickle=True is required because the file stores Python dicts; only load
# files that this notebook wrote itself)
data = np.load('data.npy', allow_pickle=True)

# remember the per-image shape so it can be restored after resampling
image_shape = data[0]['image'].shape

# get the features (images, one flattened row each) and labels
image = np.array([d['image'].reshape(-1) for d in data])
label = np.array([d['label'] for d in data])

# apply RandomOverSampler to the flattened images and labels; the fixed seed matches
# the other random steps in this notebook so the resampled set is reproducible
# NOTE(review): oversampling happens before the train/test split further down, so
# duplicated minority images can land in both train and test — consider resampling
# only the training portion.
ros = RandomOverSampler(random_state=66)
image_resampled, label_resampled = ros.fit_resample(image, label)

# reshape the resampled rows back to a 4D array of images
image_resampled = image_resampled.reshape(-1, *image_shape)

# combine the resampled images and labels into a list of dicts
data_resampled = [
  {'image': img, 'label': lbl}
  for img, lbl in zip(image_resampled, label_resampled)
]

# save the resampled data to a new numpy array
np.save('data_resampled.npy', np.array(data_resampled))

In [None]:
# Recount the classes after oversampling and report their proportions
resampled_labels = [sample['label'] for sample in data_resampled]
infected = sum(1 for lbl in resampled_labels if lbl == 1)
uninfected = sum(1 for lbl in resampled_labels if lbl == 0)
total = infected + uninfected
print(f'infected data {infected/total} and uninfected data {uninfected/total}')

In [None]:
# Number of resampled samples counted above (infected + uninfected)
total

# Normalizing

In [None]:
# Define the normalization function
def normalize(image):
  """Min-max scale an image array to the range [0, 1].

  Args:
    image: array-like of pixel values.

  Returns:
    An array of the same shape where the smallest input value maps to 0 and
    the largest to 1. A constant image (max == min) has no range to scale,
    so it maps to all zeros instead of the NaNs a 0/0 division would produce.
  """
  image = np.asarray(image)
  lowest = np.min(image)
  value_range = np.max(image) - lowest
  if value_range == 0:
    return np.zeros(image.shape, dtype=float)
  return (image - lowest) / value_range

# Min-max scale every resampled image; labels are carried over unchanged
normalized_data = [
  {'image': normalize(sample['image']), 'label': sample['label']}
  for sample in data_resampled
]

# Persist the normalized records as a .npy file
np.save('normalized_data.npy', normalized_data)
#print(normalized_data)

# Splitting

In [None]:
# stack the normalized images and their labels into two parallel arrays
X = np.array([sample['image'] for sample in normalized_data])
y = np.array([sample['label'] for sample in normalized_data])

# hold out 20% for testing, then carve 30% of the remainder off as validation
# (roughly 56% train / 24% validation / 20% test overall)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=66
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=66
)

In [None]:
# Sizes of the train / validation / test splits
print(len(X_train),len(X_val),len(X_test))

In [None]:
# Shape of the training images before augmentation
X_train.shape

# Data Augmentation

In [None]:
# create an ImageDataGenerator instance with desired data augmentation parameters
datagen = ImageDataGenerator(
    rotation_range=25,
    horizontal_flip=True
)

# NOTE: datagen.fit() is deliberately not called. It is only required when
# featurewise_center, featurewise_std_normalization or zca_whitening is enabled;
# here it would just make a copy of the whole training set for nothing.

# number of training samples to reach (originals + augmented) before stopping
target_train_size = 1348

# generate augmented data in batches; the seed matches the other random steps in
# this notebook so the augmented set is reproducible
augmented_data_generator = datagen.flow(X_train, y_train, batch_size=32, seed=66)

# collect augmented batches and concatenate once at the end; concatenating inside
# the loop would re-copy the whole (growing) training array on every batch
augmented_images = []
augmented_labels = []
n_samples = len(X_train)
for x_batch, y_batch in augmented_data_generator:
  augmented_images.append(x_batch)
  augmented_labels.append(y_batch)
  n_samples += len(x_batch)

  # the generator is endless: stop once we have reached the desired number of samples
  if n_samples >= target_train_size:
    break

# add the augmented samples to the original training set
X_train = np.concatenate([X_train] + augmented_images, axis=0)
y_train = np.concatenate([y_train] + augmented_labels, axis=0)

In [None]:
# Shuffle images and labels with one shared index order so each pair stays aligned;
# the fixed seed makes the order reproducible
np.random.seed(66)
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train, y_train = X_train[indices], y_train[indices]

In [None]:
# Shape of the training images after augmentation and shuffling
X_train.shape

(1370, 224, 224, 3)

In [None]:
# Shape of the matching training labels
y_train.shape

(1370,)

In [None]:
# collapse every image into a single feature row: (n_samples, n_features)
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)
# column-vector versions of the labels: (n_samples, 1)
ytrain = y_train.reshape((-1, 1))
ytest = y_test.reshape((-1, 1))

In [None]:
# X_train = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/x_train_data.npy', allow_pickle=True)

In [None]:
# y_train = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/y_train_data.npy', allow_pickle=True)

In [None]:
# X_val = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/x_val_data.npy', allow_pickle=True)

In [None]:
# y_val = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/y_val_data.npy', allow_pickle=True)

In [None]:
# X_test = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/x_test_data.npy', allow_pickle=True)

In [None]:
# y_test = np.load('/content/drive/My Drive/U of T/APS360 Deep Learning/y_test_data.npy', allow_pickle=True)

# Baseline Model

Baseline Model: Random Forest

Create a RandomForestClassifier with 10 decision trees (n_estimators=10) and train the model using the training data with rf.fit(X_train, y_train).

After training the model, we use it to predict the target variable for the test data with y_pred = rf.predict(X_test).

Finally, we evaluate the accuracy of the model using accuracy_score and print the result.

In [None]:
# Baseline model dependencies
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a 10-tree random forest (fixed seed for reproducibility) and fit it on the
# flattened training images; fit() returns the fitted estimator itself
rf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test images
y_pred = rf.predict(X_test)

In [None]:
# Share of test images whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Error metrics on the test set. Labels and predictions are both 0/1, so MSE and
# MAE each equal the fraction of misclassified test samples.
metrics = {
    'MSE': mean_squared_error(y_test, y_pred),
    'MAE': mean_absolute_error(y_test, y_pred),
}
mse = metrics['MSE']
mae = metrics['MAE']
# R^2 is computed but not reported below
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

# Bar chart of the two reported metrics
plt.bar(metrics.keys(), metrics.values())
plt.show()


In [None]:
#Example adapted from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

# flatten the input data (a no-op when the arrays are already 2D)
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
# column-vector label copies, kept so the names stay defined; not used for fitting
ytrain = y_train.reshape(-1,1)
ytest = y_test.reshape(-1,1)

# label values: 0 = uninfected, 1 = infected
class_names = [0, 1]

# Fit a strongly regularized (C=0.01) linear SVM on the flattened images.
# The 1-D y_train is passed because scikit-learn expects a 1-D target; a column
# vector triggers a DataConversionWarning.
# NOTE(review): the confusion matrices below describe this SVM, not the random
# forest `rf` trained earlier — confirm which model is meant to be reported.
classifier = svm.SVC(kernel="linear", C=0.01).fit(X_train, y_train)

np.set_printoptions(precision=2)

# Plot the raw-count confusion matrix and the row-normalized one
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]

# With normalize="true" each row sums to 1, i.e. each cell is the fraction of that
# true class predicted as the column class.
# The loop variable is deliberately not called `normalize`: that name is the image
# normalization function defined earlier in the notebook and must not be overwritten.
for title, norm_mode in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        classifier,
        X_test,
        y_test,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=norm_mode,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()