In [None]:
#Drive upload
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

#Uploading all the disease images
#1,73,851 -> Total Data points
# Extract the archive into MyDrive/Content_Unzip on the mounted drive.
# NOTE(review): the archive path is relative, so this presumably relies on the
# working directory being /content (the parent of the mount point) - confirm.
!unzip gdrive/MyDrive/PDPArchive.zip -d /content/gdrive/MyDrive/Content_Unzip

In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, transforms
from torch import utils
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Running on", device)
# Release cached GPU memory left over from earlier runs in this runtime.
torch.cuda.empty_cache()

In [None]:
#Creating an Even DataSet to Work with

# Drive is mounted at /content/gdrive and the archive is unzipped below that
# mount point, so the dataset root needs the /content prefix as well.
path = '/content/gdrive/MyDrive/Content_Unzip'
cls = os.listdir(path)
# Map every class folder name to the number of files it contains.
class_map = {}
for cl in cls:
  class_dir = os.path.join(path, cl)
  # Skip stray files in the root; only sub-folders are classes.
  if not os.path.isdir(class_dir):
    continue
  # Index by the folder name itself; `update(cl = n)` would always write the
  # literal key 'cl' and keep only the last count.
  class_map[cl] = len(os.listdir(class_dir))
print(class_map)


#Transform to Tensor
# Resize to an exact 224x224 square. With an int argument Resize only rescales
# the shorter edge, so non-square images would come out with different sizes.
# SimpleCustomCNN.fc1 expects 54*54*32 features, which is what a 224x224 input
# yields: 224 -> conv5 -> 220 -> pool -> 110 -> conv3 -> 108 -> pool -> 54.
transform =  transforms.Compose(
    [transforms.Resize((224, 224)),
     transforms.ToTensor()])
#Doubt : Why Exactly do they usually resize the Input to 224x224?
#Note : Look up the concept of normalization again to fully understand why this is done
data_ = datasets.ImageFolder(path, transform=transform)

#Split into 3 subsets -> 40 | 30 | 30
# random_split needs integer lengths that add up to len(data_): use floor
# division for the second part and give the remainder to the third.
size1 = int(len(data_) * 0.4)
size2 = (len(data_) - size1) // 2
size3 = len(data_) - (size1 + size2)
data1, data2, data3 = utils.data.random_split(data_, [size1, size2, size3])

In [None]:
def process_and_split__data(data, test_fraction=0.3, batch_size=32):
  """Randomly split a dataset into train/test parts and wrap both in DataLoaders.

  Args:
    data: a sized, indexable dataset accepted by `utils.data.random_split`.
    test_fraction: share of the samples assigned to the test part; the test
      size is rounded down and the remainder goes to training. Defaults to 0.3.
    batch_size: batch size used by both loaders. Defaults to 32.

  Returns:
    A `(train, test)` tuple of DataLoaders.
  """
  testSize = int(len(data) * test_fraction)
  trainSize = len(data) - testSize
  train_set, test_set = utils.data.random_split(data, [trainSize, testSize])

  #Use Dataloader to load the data into an iterable form
  train = utils.data.DataLoader(train_set, batch_size = batch_size, shuffle = True)
  # The split above is already random and evaluation does not depend on batch
  # order, so the test loader is not reshuffled on every pass.
  test = utils.data.DataLoader(test_set, batch_size = batch_size, shuffle = False)

  return train, test

In [None]:
#Comparing the class distribution of two data segments
def compareDataSegments(data1loader, data2loader):
  """Plot the per-class sample counts of two data segments side by side.

  Args:
    data1loader: iterable of `(images, labels)` batches for the first segment.
    data2loader: iterable of `(images, labels)` batches for the second segment.

  Besides showing the bar chart, prints the class-name -> index mapping built
  from the sorted entries of the module-level `path` directory.
  """
  class_count_data1 = DataCount(data1loader)
  class_count_data2 = DataCount(data2loader)

  bar_width = 0.25
  fig = plt.figure()
  ax = fig.add_axes([0,0,1,1])
  ax.set_title('Data Distribution between two batches of the Dataset', loc='left')

  # Set position of bar on X axis; each series gets positions from its own
  # length so a differing number of classes cannot break the plot.
  br1 = np.arange(len(class_count_data1))
  br2 = np.arange(len(class_count_data2)) + bar_width

  ax.bar(br1, class_count_data1, color = 'blue', width = bar_width, edgecolor ='black', label = 'Batch 1')
  ax.bar(br2, class_count_data2, color = 'grey', width = bar_width, edgecolor ='black', label = 'Batch 2')

  ax.set_xlabel('DataSets', fontweight='bold')
  ax.set_ylabel('Class Count', fontweight='bold')
  # Centre each tick between the two bars that belong to the same class.
  n_ticks = max(len(class_count_data1), len(class_count_data2))
  ax.set_xticks([r + bar_width / 2 for r in range(n_ticks)])
  ax.set_xticklabels([str(c) for c in range(n_ticks)])
  # Without a legend the 'Batch 1' / 'Batch 2' labels are never rendered.
  ax.legend()

  plt.show()

  #Class Names of Numeric Labels
  # NOTE(review): presumably mirrors the label order ImageFolder assigns
  # (sorted folder names) - confirm `path` holds only class folders.
  classNames = sorted(os.listdir(path))
  class_to_idx = {name: idx for idx, name in enumerate(classNames)}
  print(class_to_idx)

In [None]:
#calculating the class distribution over a dataset
def DataCount(dataloader, num_classes=None):
  """Count how many samples of each class an iterable of batches contains.

  Args:
    dataloader: iterable yielding `(images, labels)` pairs. `labels` may be a
      batch (tensor / array / list of ints) or a single integer label, so both
      a DataLoader and a plain dataset can be counted.
    num_classes: number of classes to report. When omitted it is taken from
      the `classes` attribute of the underlying dataset (found by following
      `.dataset` links); if none is found the result is sized by the largest
      label seen.

  Returns:
    A list where entry `i` is the number of samples labelled `i`. The list
    grows if a label larger than `num_classes - 1` is encountered.

  Raises:
    ValueError: if a negative label is encountered.
  """
  if num_classes is None:
    # Walk DataLoader -> Subset -> ... -> dataset looking for the class list.
    source = dataloader
    for _ in range(8):
      if source is None or hasattr(source, 'classes'):
        break
      source = getattr(source, 'dataset', None)
    known_classes = getattr(source, 'classes', None)
    if known_classes is not None:
      num_classes = len(known_classes)

  class_count = [0] * (num_classes or 0)
  for _, classes in dataloader:
    # reshape(-1) turns a scalar label into a one-element array, so single
    # samples and batches go through the same loop.
    for label in np.asarray(classes).reshape(-1):
      label = int(label)
      if label < 0:
        raise ValueError("class labels must be non-negative, got {}".format(label))
      if label >= len(class_count):
        class_count.extend([0] * (label + 1 - len(class_count)))
      class_count[label] += 1
  return class_count

In [None]:
#Plotting the calculated class distributions
def plot_class_distributions(class_count_train, class_count_test):
  """Draw grouped bars of per-class sample counts for the train and test sets.

  Args:
    class_count_train: sequence of per-class counts for the training set.
    class_count_test: sequence of per-class counts for the test set.

  Besides showing the bar chart, prints the class-name -> index mapping built
  from the sorted entries of the module-level `path` directory.
  """
  bar_width = 0.25
  fig = plt.figure()
  ax = fig.add_axes([0,0,1,1])
  ax.set_title('Data Distribution', loc='left')

  # Set position of bar on X axis; each series gets positions from its own
  # length so a differing number of classes cannot break the plot.
  br1 = np.arange(len(class_count_train))
  br2 = np.arange(len(class_count_test)) + bar_width

  ax.bar(br1, class_count_train, color = 'grey', width = bar_width, edgecolor ='black', label = 'Train')
  ax.bar(br2, class_count_test, color = 'pink', width = bar_width, edgecolor ='black', label = 'Test')

  ax.set_xlabel('DataSets', fontweight='bold')
  ax.set_ylabel('Class Count', fontweight='bold')
  # Centre each tick between the two bars that belong to the same class.
  n_ticks = max(len(class_count_train), len(class_count_test))
  ax.set_xticks([r + bar_width / 2 for r in range(n_ticks)])
  ax.set_xticklabels([str(c) for c in range(n_ticks)])
  # Without a legend the 'Train' / 'Test' labels are never rendered.
  ax.legend()

  plt.show()

  #Class Names of Numeric Labels
  # NOTE(review): presumably mirrors the label order ImageFolder assigns
  # (sorted folder names) - confirm `path` holds only class folders.
  classNames = sorted(os.listdir(path))
  class_to_idx = {name: idx for idx, name in enumerate(classNames)}
  print(class_to_idx)

In [None]:
#Class Ratios : Are they as even as expected?
def check_class_ratio(class_count_train, class_count_val, class_count_test, classes):
  """Print and return, for every class, its percentage share per split.

  Args:
    class_count_train: per-class sample counts of the training set.
    class_count_val: per-class sample counts of the validation set (pass
      zeros when there is no validation split).
    class_count_test: per-class sample counts of the test set.
    classes: not used by the computation; kept so the signature is unchanged.

  Returns:
    A list with one `[train_pct, val_pct, test_pct]` row per class, each value
    rounded to two decimals. A class with no samples at all gets
    `[0.0, 0.0, 0.0]` instead of dividing by zero.
  """
  class_ratios = []

  for n_train, n_val, n_test in zip(class_count_train, class_count_val, class_count_test):
    aggregate = n_train + n_val + n_test
    if aggregate == 0:
      class_ratios.append([0.0, 0.0, 0.0])
      continue
    class_ratios.append(
        [round((count / aggregate) * 100, 2) for count in (n_train, n_val, n_test)])

  print("Rough Percentage of Class Division amongst the three train, test and validate sets")
  for row in class_ratios:
    print(row)
    print()

  return class_ratios

In [None]:
#Creating a simple CNN architecture
class SimpleCustomCNN(nn.Module):
  """Two convolution blocks followed by one fully connected classifier.

  The classifier's input size (54*54*32) is fixed, so the network expects
  3-channel 224x224 images:
  224 -> conv 5x5 -> 220 -> max-pool -> 110 -> conv 3x3 -> 108 -> avg-pool -> 54.

  Args:
    num_classes: number of output logits. Defaults to 39.
  """

  def __init__(self, num_classes=39):
    super().__init__()

    #Defining a sequential model layers
    # Attribute names (c1, c2, fc1) are part of the state_dict keys; keep them
    # stable so previously saved weights still load.
    self.c1 = nn.Sequential(
        nn.Conv2d(in_channels = 3, out_channels = 16, kernel_size=5, padding=0, stride=1),
        nn.LeakyReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    self.c2 = nn.Sequential(
        nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size=3, padding=0, stride=1),
        nn.LeakyReLU(),
        nn.AvgPool2d(kernel_size=2, stride=2)
    )
    self.fc1 = nn.Linear(in_features = 54*54*32, out_features = num_classes)

  def forward(self, x):
    """Return raw class logits of shape (batch, num_classes) for image batch `x`."""
    output = self.c1(x)
    output = self.c2(output)
    # Flatten everything except the batch dimension before the linear layer.
    output = output.reshape(output.size(0), -1)
    output = self.fc1(output)
    return output

In [None]:
#Calling all the functions to visualize the Data
def datasets_visualization(train, test):
  """Print basic batch information and visualise the train/test class balance.

  Args:
    train: DataLoader yielding `(images, labels)` batches of the training set.
    test: DataLoader yielding `(images, labels)` batches of the test set.
  """
  # The built-in next() works on every PyTorch version; the iterator's own
  # `.next()` method is not available on current DataLoader iterators.
  images, classes = next(iter(train))

  print(type(images))
  print(images.shape)
  print(classes.shape)

  class_count_train = DataCount(train)
  class_count_test = DataCount(test)

  #Check if the Ratio of the Train:Val:Test has been maintained through the classes:
  # There is no validation split in this workflow, so zero counts are passed
  # for it to match check_class_ratio's (train, val, test, classes) signature.
  class_count_val = [0] * len(class_count_train)
  check_class_ratio(class_count_train, class_count_val, class_count_test, classes)

  #Plot to Visualize the way Data is Distributed between the sets
  plot_class_distributions(class_count_train, class_count_test)


In [None]:
def train__model(e, train, model, optimizer):
  """Train `model` for `e` epochs and record the loss/accuracy of every epoch.

  Uses the module-level `lossCriteria` as loss function and moves every batch
  to the module-level `device`.

  Args:
    e: number of epochs.
    train: DataLoader yielding `(images, labels)` batches.
    model: the network to optimise (already on `device`).
    optimizer: optimizer bound to `model`'s parameters.

  Returns:
    `(train_losses, train_accuracy)`: two lists with one entry per epoch, the
    mean per-sample loss and the accuracy in percent. Both are NaN for an
    epoch in which the loader produced no samples.
  """
  train_losses = []
  train_accuracy = []

  for epoch in range(e):
    model.train()

    run_loss = 0.0
    correct = 0
    total = 0
    for img, class_ in train:
      img, class_ = img.to(device), class_.to(device)
      optimizer.zero_grad()
      output = model(img)
      loss = lossCriteria(output, class_)
      loss.backward()
      optimizer.step()

      batch_samples = class_.size(0)
      # The loss is a per-batch mean (assumes mean reduction, as with the
      # nn.CrossEntropyLoss() this notebook defines), so weight it by the
      # batch size before dividing by the sample count below.
      run_loss += loss.item() * batch_samples

      _, prediction = output.max(1)
      total += batch_samples
      correct += prediction.eq(class_).sum().item()

    if total:
      train_loss = run_loss / total
      accuracy = 100. * correct / total
    else:
      train_loss = float('nan')
      accuracy = float('nan')

    train_losses.append(train_loss)
    train_accuracy.append(accuracy)
    print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f}'.format(epoch, train_loss, accuracy))

  return train_losses, train_accuracy


In [None]:
def test_model(e, test, model):
  """Evaluate `model` on `test` and record loss/accuracy for each of `e` passes.

  Uses the module-level `lossCriteria` as loss function and moves every batch
  to the module-level `device`. No gradients are computed.

  Args:
    e: number of evaluation passes over `test`.
    test: DataLoader yielding `(images, labels)` batches.
    model: the network to evaluate (already on `device`).

  Returns:
    `(final_losses, final_accuracy)`: two lists with one entry per pass, the
    mean per-sample loss and the accuracy in percent. Both are NaN for a pass
    in which the loader produced no samples.
  """
  final_losses = []
  final_accuracy = []

  for epoch in range(e):
    model.eval()

    run_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
      for img, class_ in test:
        img, class_ = img.to(device), class_.to(device)
        output = model(img)
        loss = lossCriteria(output, class_)

        batch_samples = class_.size(0)
        # The loss is a per-batch mean (assumes mean reduction, as with the
        # nn.CrossEntropyLoss() this notebook defines), so weight it by the
        # batch size before dividing by the sample count below.
        run_loss += loss.item() * batch_samples

        _, prediction = output.max(1)
        total += batch_samples
        correct += prediction.eq(class_).sum().item()

    if total:
      test_loss = run_loss / total
      accuracy = 100. * correct / total
    else:
      test_loss = float('nan')
      accuracy = float('nan')

    final_losses.append(test_loss)
    final_accuracy.append(accuracy)
    print('Epoch: {} \tTest Loss: {:.4f} \tTest Accuracy: {:.4f}'.format(epoch, test_loss, accuracy))

  return final_losses, final_accuracy
  

In [None]:
#Visualize the Results
def plot_acc_curve(train_accuracy, test_accuracy):
  """Plot the per-epoch training accuracy together with the test accuracy.

  Args:
    train_accuracy: sequence of training accuracies, one per epoch.
    test_accuracy: sequence of test accuracies. A single value is drawn as a
      horizontal reference line; several values are drawn as a curve. May be
      empty or None, in which case only the training curve is shown.
  """
  plt.plot(train_accuracy, color='green', label='Train')
  if test_accuracy is not None and len(test_accuracy) > 0:
    if len(test_accuracy) == 1:
      # A one-point line plot is invisible, so show it as a reference level.
      plt.axhline(test_accuracy[0], color='red', linestyle='--', label='Test')
    else:
      plt.plot(test_accuracy, color='red', label='Test')
  plt.xlabel('epoch')
  plt.ylabel('Accuracy')
  plt.title('Training and Test Accuracy')
  plt.legend()

  plt.show()

def plot_loss_curve(train_losses, test_losses):
  """Plot the per-epoch training loss together with the test loss.

  Args:
    train_losses: sequence of training losses, one per epoch.
    test_losses: sequence of test losses. A single value is drawn as a
      horizontal reference line; several values are drawn as a curve. May be
      empty or None, in which case only the training curve is shown.
  """
  plt.plot(train_losses, color='green', label='Train')
  if test_losses is not None and len(test_losses) > 0:
    if len(test_losses) == 1:
      # A one-point line plot is invisible, so show it as a reference level.
      plt.axhline(test_losses[0], color='red', linestyle='--', label='Test')
    else:
      plt.plot(test_losses, color='red', label='Test')
  plt.xlabel('epoch')
  plt.ylabel('Loss')
  plt.title('Training and Test Loss')
  plt.legend()

  plt.show()

In [None]:
#Call the Train and Test Functions
def train_test_models(model_, train, test, optimizer=None, epochs=50):
  """Train a model, evaluate it once on the test loader and plot the curves.

  Args:
    model_: the network to train (already on the target device).
    train: DataLoader with the training batches.
    test: DataLoader with the test batches.
    optimizer: optimizer bound to `model_`'s parameters. When omitted, an Adam
      optimizer with lr=0.001 is created for the model.
    epochs: number of training epochs. Defaults to 50.
  """
  if optimizer is None:
    optimizer = torch.optim.Adam(model_.parameters(), lr=0.001)

  # train__model requires the optimizer as its fourth argument.
  train_losses, train_accuracy = train__model(epochs, train, model_, optimizer)
  test_losses, test_accuracy = test_model(1, test, model_)

  plot_loss_curve(train_losses, test_losses)
  plot_acc_curve(train_accuracy, test_accuracy)


In [None]:
#Splitting into training and testing datasets
train1, test1 = process_and_split__data(data1)
train2, test2 = process_and_split__data(data2)
train3, test3 = process_and_split__data(data3)

#Visualizing the Individual Datasets :
print("Dataset Segment 1 : ")
datasets_visualization(train1, test1)
print("Dataset Segment 2 : ")
datasets_visualization(train2, test2)
print("Dataset Segment 3 : ")
datasets_visualization(train3, test3)

#Compare these Data Segments
# compareDataSegments counts labels from (images, labels) batches, so the raw
# segments are wrapped in loaders first; iterating a Subset directly yields
# single samples with scalar labels instead of batches.
segment1_loader = utils.data.DataLoader(data1, batch_size = 32)
segment2_loader = utils.data.DataLoader(data2, batch_size = 32)
segment3_loader = utils.data.DataLoader(data3, batch_size = 32)

print("Dataset Segment 1 v.s 2: ")
compareDataSegments(segment1_loader, segment2_loader)
print("Dataset Segment 2 v.s 3: ")
compareDataSegments(segment2_loader, segment3_loader)
print("Dataset Segment 1 v.s 3: ")
compareDataSegments(segment1_loader, segment3_loader)

The datasets are clearly uneven. They require some segmentation technique (for example, a stratified split) to even out the class distributions.

If all three random data partitions give more or less similar accuracy and loss curves, we can take the average of the three as the final accuracy. If further investigation is required, we can re-run the whole runtime so that the next three random data segments are tested. Those results can further strengthen the conclusion.

In [None]:
#Cross Entropy Function
lossCriteria = nn.CrossEntropyLoss()

# One freshly initialised model and optimizer per data segment, each trained
# and tested on its own segment so the three runs can be compared.

#Initialize model and optimizer 1
model1 = SimpleCustomCNN().to(device)
optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.001)
print(model1)
#Calling the training function for model 1
train_test_models(model1, train1, test1, optimizer1)


#Initialize model and optimizer 2
model2 = SimpleCustomCNN().to(device)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.001)
print(model2)
#Calling the training function for model 2
train_test_models(model2, train2, test2, optimizer2)


#Initialize model and optimizer 3
model3 = SimpleCustomCNN().to(device)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=0.001)
print(model3)
#Calling the training function for model 3
train_test_models(model3, train3, test3, optimizer3)

In [None]:
# Disabled draft: everything below is wrapped in a string literal, so none of
# it is executed. It is not runnable as written:
#   - `get_num_correct` is declared without parameters or a body but is later
#     called with two arguments,
#   - `network` is not defined anywhere in the visible notebook,
#   - `train1` is passed to a DataLoader and asked for `.targets`, although
#     `train1` is itself the DataLoader returned by process_and_split__data.
'''
#Visualize through a confusion matrix

#Getting the list of prediction made by model
def get_all_preds(model, loader):
    all_preds = torch.tensor([])
    for batch in loader:
        images, labels = batch

        preds = model(images)
        all_preds = torch.cat(
            (all_preds, preds)
            ,dim=0
        )
    return all_preds

#Counting the total number of correct predictions made
def get_num_correct():

#Calling above two functions
with torch.no_grad():
    prediction_loader = torch.utils.data.DataLoader(train1, batch_size=10000)
    train1_preds = get_all_preds(network, prediction_loader)
preds_correct = get_num_correct(train1_preds, train1.targets)

print('Total Correct:', preds_correct)
print('Accuracy:', preds_correct / len(train1))'''

In [None]:
#saving model state:
#torch.save(model.state_dict(), '/content/gdrive/MyDrive/CustomCNN_93.4911_apple')

In [None]:
#Load Previously saved model
# Disabled draft: the code below sits inside a string literal and never runs.
# NOTE(review): before re-enabling it, the `load_state_dict` line must become a
# call - `modelOld.load_state_dict(torch.load(...))`. As written it assigns
# the loaded object over the method, so no weights would be loaded.
'''
modelOld =  SimpleCustomCNN().to(device)
modelOld.load_state_dict = torch.load('/content/gdrive/MyDrive/CustomCNN_93.4911_apple')
print(modelOld)
modelOld.to(device)
'''