In [None]:
# Mount Google Drive; the CSV labels, the cached image array and the model
# checkpoint used by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import cv2
import numpy as np
import pandas as pd
import os
import joblib
from tqdm import tqdm
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score 
import joblib
import pickle
from torch.utils.data import DataLoader


In [None]:
class simple_cnn(nn.Module):
  """Small two-stage CNN mapping a 1-channel image to 5 class logits.

  Each stage is Conv2d(kernel 4, stride 2) -> BatchNorm2d -> ReLU -> MaxPool2d(2).
  The linear head expects 2 * 7 * 7 flattened features, which is what a
  128x128 input produces (128 -> 63 -> 31 -> 14 -> 7 per spatial side).
  """

  def __init__(self):
    super().__init__()

    # Stage 1: 1 -> 3 channels.
    first_stage = [
        nn.Conv2d(1, 3, 4, 2),
        nn.BatchNorm2d(3),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(2),
    ]
    # Stage 2: 3 -> 2 channels.
    second_stage = [
        nn.Conv2d(3, 2, 4, 2),
        nn.BatchNorm2d(2),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(2),
    ]
    # Attribute names and layer order are kept so saved state_dict keys
    # ("cnn_layers.<i>.*", "linear_layers.0.*") stay loadable.
    self.cnn_layers = nn.Sequential(*first_stage, *second_stage)
    self.linear_layers = nn.Sequential(nn.Linear(2 * 7 * 7, 5))

  def forward(self, x):
    """Return raw (un-softmaxed) logits of shape (batch, 5)."""
    feature_maps = self.cnn_layers(x)
    flattened = feature_maps.view(feature_maps.size(0), -1)
    return self.linear_layers(flattened)

class Dataset(torch.utils.data.Dataset):
  """Index-based view over shared feature and label containers.

  X and y are stored by reference (not copied); `indices` selects which
  entries of them belong to this split.
  """

  def __init__(self, X, y, indices):
    self.X, self.y, self.indices = X, y, indices

  def __len__(self):
    """Number of samples in this split (not in the underlying X)."""
    return len(self.indices)

  def __getitem__(self, sample_idx):
    """Return {"data": tensor, "label": value} for the sample_idx-th entry of the split."""
    # Translate the split-local position into an index into X / y.
    source_idx = self.indices[sample_idx]
    features = torch.from_numpy(self.X[source_idx])
    return {"data": features, "label": self.y[source_idx]}

In [None]:
# Load the per-image labels; the 'level' column is the classification target.
df = pd.read_csv("/content/drive/MyDrive/trainLabels_cropped.csv")
image_names = df['image']
y = df['level']
print(Counter(y))

# Load the cached image array and flatten every image to a 128*128 vector,
# because SMOTE works on 2-D (n_samples, n_features) input.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you
# produced yourself.
X = joblib.load("/content/drive/MyDrive/X")
X = X.reshape( (X.shape[0], 128*128) )

# NOTE(review): oversampling is done BEFORE the train/validation/held-out
# split, so synthetic samples interpolated from held-out images can end up in
# the training set and the held-out metrics are optimistic. Splitting first and
# resampling only the training portion would avoid this; it is left as-is here
# because later cells index into X_over / y_over.
# random_state makes the synthetic samples reproducible across runs.
oversample = SMOTE(random_state=42)
X_over, y_over = oversample.fit_resample(X, y)

# Per-image min-max scaling to [0, 1].
# The previous loop multiplied the scaled values back by (max - min) and added
# min again, which restored the original intensities (and produced NaN for
# constant images via 0/0). Work in float32 so integer inputs are not
# truncated, and do it in place to avoid another full-size temporary.
X_over = np.asarray(X_over, dtype=np.float32)
row_min = X_over.min(axis=1, keepdims=True)
row_range = X_over.max(axis=1, keepdims=True) - row_min
row_range[row_range == 0] = 1  # constant image: becomes all zeros instead of NaN
X_over -= row_min
X_over /= row_range

# Restore the (N, channels, height, width) layout expected by Conv2d.
X_over = X_over.reshape((X_over.shape[0], 1, 128, 128) )
print("oversampled: ",Counter(y_over))
# Positions into X_over / y_over; the splits below are expressed as index arrays.
indices = np.arange(X_over.shape[0])

Counter({0: 25802, 2: 5288, 1: 2438, 3: 872, 4: 708})
oversampled:  Counter({0: 25802, 1: 25802, 2: 25802, 4: 25802, 3: 25802})


In [None]:
# float32 is what the model consumes (batches are passed through .float() in
# the training loop); casting to float64 here only doubled the memory of an
# already very large array. copy=False avoids a copy when it is float32 already.
X_over = X_over.astype(np.float32, copy=False)
print(X_over.shape)
model_cnn = simple_cnn()

# 70% train+validation / 30% held-out, then 80% / 20% train / validation.
# Both splits are stratified on the label and seeded for reproducibility.
train_val_indices, heldout_indices = train_test_split(indices, test_size=0.3, random_state=42, stratify=y_over)
train_indices, valid_indices = train_test_split(train_val_indices, test_size = 0.2, random_state=42, stratify=y_over[train_val_indices])

train_dataset = Dataset(X_over, y_over, train_indices)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Despite the "test_" names this is the VALIDATION split (valid_indices); the
# names are kept because the training cell refers to test_dataloader.
# Evaluation does not need shuffling.
test_dataset = Dataset(X_over, y_over, valid_indices)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)


(129010, 1, 128, 128)


In [None]:
epochs = 100
# Fall back to CPU so the cell still runs on a runtime without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
loss_function = nn.CrossEntropyLoss()
# Move the model to the device before building the optimizer over its parameters.
model_cnn = model_cnn.to(device)
optimizer = torch.optim.Adam(model_cnn.parameters(), lr = 0.001)

# Mean loss per epoch, appended once per epoch.
train_loss_history, valid_loss_history = [], []

for epoch in range(epochs):
  # ---- training pass ----
  model_cnn.train()
  train_loss = []
  for batch in train_dataloader:
    X_batch = batch["data"].to(device)
    y_batch = batch["label"].to(device)
    optimizer.zero_grad()

    outputs = model_cnn(X_batch.float())
    loss = loss_function(outputs, y_batch)
    train_loss.append(loss.item())
    loss.backward()
    optimizer.step()

  # ---- validation pass (test_dataloader wraps the validation split) ----
  model_cnn.eval()
  valid_loss = []
  y_true_valid, y_pred_valid = [], []
  valid_batch_probs = []
  with torch.no_grad():
    for batch in test_dataloader:
      X_batch = batch["data"].to(device)
      y_batch = batch["label"].to(device)

      outputs = model_cnn(X_batch.float())
      loss = loss_function(outputs, y_batch)
      valid_loss.append(loss.item())

      _, predicted = torch.max(outputs, 1)
      y_pred_valid += predicted.cpu().numpy().tolist()
      # Keep every batch's class probabilities. The previous code concatenated
      # against an always-empty tensor and then soft-maxed that empty tensor,
      # so y_probs_valid always ended up with shape (0, 5).
      valid_batch_probs.append(F.softmax(outputs, dim=1).cpu())
      y_true_valid += y_batch.cpu().numpy().tolist()
  # (n_validation_samples, 5) array of class probabilities, row-aligned with
  # y_true_valid / y_pred_valid.
  if valid_batch_probs:
    y_probs_valid = torch.cat(valid_batch_probs, 0).numpy()
  else:
    y_probs_valid = np.empty((0, 5))
  y_true_valid = np.array(y_true_valid)

  train_loss_history.append(np.mean(train_loss))
  valid_loss_history.append(np.mean(valid_loss))

  print(f"Epoch {epoch} train loss: {train_loss_history[-1]}")
  print(f"Epoch {epoch} test loss: {valid_loss_history[-1]}")

  # Overwrite the checkpoint after every epoch so an interrupted run can resume
  # from the most recent weights and optimizer state.
  state = {
      'model_description': str(model_cnn),
      'model_state': model_cnn.state_dict(),
      'optimizer': optimizer.state_dict()
  }
  torch.save(state, "/content/drive/MyDrive/model_cnn.ckpt")

Epoch 0 train loss: 1.3665933989126984
Epoch 0 test loss: 1.3172494177262268
Epoch 1 train loss: 1.3117160248059525
Epoch 1 test loss: 1.3422920274229015
Epoch 2 train loss: 1.2966532557278845
Epoch 2 test loss: 1.3373362571527596
Epoch 3 train loss: 1.2868694776555307
Epoch 3 test loss: 1.287922960709346
Epoch 4 train loss: 1.2818825599680546
Epoch 4 test loss: 1.2767905258037175
Epoch 5 train loss: 1.2737662341343396
Epoch 5 test loss: 1.2936803869139601
Epoch 6 train loss: 1.268997433335513
Epoch 6 test loss: 1.2730757103370693
Epoch 7 train loss: 1.2642396148904644
Epoch 7 test loss: 1.2740894176934718
Epoch 8 train loss: 1.2621016021425266
Epoch 8 test loss: 1.2722870690662533
Epoch 9 train loss: 1.259076784520787
Epoch 9 test loss: 1.2602269245120745
Epoch 10 train loss: 1.2583755494431124
Epoch 10 test loss: 1.257417907141965
Epoch 11 train loss: 1.255412296133827
Epoch 11 test loss: 1.2806781058597903
Epoch 12 train loss: 1.2535667089997824
Epoch 12 test loss: 1.295744072000887

In [None]:
# Evaluate the trained model on the 30% held-out split.
heldout_dataset = Dataset(X_over, y_over, heldout_indices)
# Evaluation does not need shuffling; a fixed order keeps runs comparable.
heldout_dataloader = DataLoader(heldout_dataset, batch_size=64, shuffle=False)

model_cnn.eval()
# Per-batch losses on the held-out split (variable name kept from the original cell).
valid_loss = []
y_true_heldout, y_pred_heldout = [], []
heldout_batch_probs = []
with torch.no_grad():
  for batch in heldout_dataloader:
    X_batch = batch["data"].to(device)
    y_batch = batch["label"].to(device)

    outputs = model_cnn(X_batch.float())
    loss = loss_function(outputs, y_batch)
    valid_loss.append(loss.item())

    _, predicted = torch.max(outputs, 1)
    y_pred_heldout += predicted.cpu().numpy().tolist()
    # Accumulate every batch's probabilities. The previous code concatenated
    # against an always-empty tensor and then soft-maxed that empty tensor,
    # leaving y_probs_heldout with shape (0, 5).
    heldout_batch_probs.append(F.softmax(outputs, dim=1).cpu())
    y_true_heldout += y_batch.cpu().numpy().tolist()
# (n_heldout_samples, 5) array of class probabilities, row-aligned with
# y_true_heldout / y_pred_heldout.
if heldout_batch_probs:
  y_probs_heldout = torch.cat(heldout_batch_probs, 0).numpy()
else:
  y_probs_heldout = np.empty((0, 5))
y_true_heldout = np.array(y_true_heldout)
print(classification_report(y_true_heldout, y_pred_heldout))


              precision    recall  f1-score   support

           0       0.66      0.95      0.77      7741
           1       0.41      0.23      0.30      7740
           2       0.29      0.13      0.18      7741
           3       0.36      0.22      0.28      7741
           4       0.38      0.74      0.50      7740

    accuracy                           0.45     38703
   macro avg       0.42      0.45      0.41     38703
weighted avg       0.42      0.45      0.41     38703

