In [1]:
!pip install -q --upgrade fastcore 
!pip install -q --upgrade fastai

[K     |████████████████████████████████| 40kB 2.8MB/s 
[K     |████████████████████████████████| 358kB 5.4MB/s 
[?25h

In [1]:
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading https://files.pythonhosted.org/packages/4e/83/f9c5f44060f996279e474185ebcbd8dbd91179593bffb9abe3afa55d085b/efficientnet_pytorch-0.7.0.tar.gz
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.0-cp36-none-any.whl size=16031 sha256=376ddba89b5b962dc29eeb42ff2be0b17314bd45bd1ef29cd5437ab4ab5c8a5e
  Stored in directory: /root/.cache/pip/wheels/e9/c6/e1/7a808b26406239712cfce4b5ceeb67d9513ae32aa4b31445c6
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0


In [2]:
%matplotlib inline

import numpy as np
import pandas as pd

from efficientnet_pytorch import EfficientNet
import torch
import torchvision
from torchvision import datasets, models, transforms, utils
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torch.utils.data import Dataset
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

import os
import time
import copy
import random

from sklearn.metrics import f1_score as f1
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from google.colab import drive
import glob

#from fastai.vision.all import *

In [3]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
!mkdir images
!unzip /content/drive/My\ Drive/Zindi\ CGIAR\ Wheat\ Growth\ Stage\ Challenge/Images.zip -d images

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: images/Images/JVkjiXLf.jpeg  
  inflating: images/Images/RU4EWpeB.jpeg  
  inflating: images/Images/c7VUilqY.jpeg  
  inflating: images/Images/kzaetKOn.jpeg  
  inflating: images/Images/XlspTcDO.jpeg  
  inflating: images/Images/Yren9biV.jpeg  
  inflating: images/Images/hde3LUfQ.jpeg  
  inflating: images/Images/wZ2qQ7DL.jpeg  
  inflating: images/Images/k3LmhDP7.jpeg  
  inflating: images/Images/lmABIeNG.jpeg  
  inflating: images/Images/G3YP7MrO.jpeg  
  inflating: images/Images/FjkQKHGd.jpeg  
  inflating: images/Images/kE7aW5te.jpeg  
  inflating: images/Images/wETtqc9x.jpeg  
  inflating: images/Images/tqvMeu2j.jpeg  
  inflating: images/Images/qi1EG3Bb.jpeg  
  inflating: images/Images/0sCjXV5h.jpeg  
  inflating: images/Images/1nHSBWt2.jpeg  
  inflating: images/Images/42I5ngPN.jpeg  
  inflating: images/Images/2Z50vzcp.jpeg  
  inflating: images/Images/cfIPC5FT.jpeg  
  inflating: images/Images/8sidO

In [5]:
def set_random_state(seed_value):
    random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
    np.random.seed(seed_value)
    
set_random_state(7)

In [6]:
sub = pd.read_csv('/content/drive/My Drive/Zindi CGIAR Wheat Growth Stage Challenge/SampleSubmission.csv')
train = pd.read_csv('/content/drive/My Drive/Zindi CGIAR Wheat Growth Stage Challenge/Train.csv')

In [7]:
train.groupby(by='label_quality').count()

Unnamed: 0_level_0,UID,growth_stage
label_quality,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7839,7839
2,2856,2856


In [8]:
names = list(map(lambda s: s[-13:-5], glob.glob('/content/images/Images/*.jpeg')))

In [9]:
class Wheat(Dataset):
      def __init__(self, images, gts, split_type, transform):

          self.images = images
          self.gts = gts

          self.split_type = split_type
          self.transform = transform

      def __len__(self):

          return len(self.images)

      def __getitem__(self, idx):

          img = self.images[idx]
          if self.split_type == 'test':
              y = 0
          else:
              y = self.gts[idx]

          img = Image.fromarray(img)
          img = self.transform(img)
          return img, y

In [10]:
def read_dataset(imgs_path, lbl_path):
    lbl_df = pd.read_csv(lbl_path)
    gt, lbl_quality, test_id_arr, labels_arr = [], [], [], []
    train_imgs_fname_set = lbl_df['UID'].tolist()
    
    imgs_fname_arr = os.listdir(imgs_path)
    imgs_arr = np.zeros((lbl_df.shape[0], 224, 224, 3), dtype = np.uint8)
    test_imgs_arr = np.zeros((len(imgs_fname_arr) - lbl_df.shape[0], 224, 224, 3), dtype = np.uint8)
    i, j = 0, 0
    for fname in imgs_fname_arr:
        img = np.array(Image.open(os.path.join(imgs_path, fname)).convert('RGB').resize((224,224), Image.ANTIALIAS)).astype(np.uint8)
        img_id = fname.split('.')[0] 
        if img_id in train_imgs_fname_set: 
            imgs_arr[i] = img
            labels_arr.append(img_id)
            gt.append(float(lbl_df[lbl_df['UID']==img_id]['growth_stage']))
            lbl_quality.append(int(lbl_df[lbl_df['UID']==img_id]['label_quality']))
            i += 1
        else:
            test_imgs_arr[j] = img
            j += 1
            test_id_arr.append(img_id)
            
    return imgs_arr, labels_arr, test_imgs_arr, np.array(gt), np.array(lbl_quality), test_id_arr

In [11]:
imgs_arr, labels_arr, test_imgs_arr, gt, lbl_quality, test_id_arr = read_dataset('/content/images/Images/', '/content/drive/My Drive/Zindi CGIAR Wheat Growth Stage Challenge/Train.csv')

In [12]:
imgs_arr_train = imgs_arr[lbl_quality == 2]
labels_arr_train = np.array(labels_arr)[lbl_quality == 2]
gt_train = gt[lbl_quality == 2]
lbl_quality_train = lbl_quality[lbl_quality == 2]


#train_idxs = np.where(lbl_quality == 1)
#test_idxs = np.where(lbl_quality == 2)

In [49]:
def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    best_f1 = 0
    best_mse = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            f1_score = 0.0
            mse_score = 0.0

            y_trues, y_preds = [], []
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.type(torch.int64).to(device)  #if classification add .type(torch.int64) before .to(device)
                labels -= 1

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    #print(outputs, labels)
                    #loss = criterion(outputs, labels.unsqueeze(1))
                    loss = criterion(outputs, labels)
                    
                    
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                y_preds.extend(torch.argmax(outputs.cpu(), axis=1))#if classification y_preds.extend(torch.argmax(outputs.cpu(), axis=1)), if regression  y_preds.extend(torch.round(outputs.cpu()))
                y_trues.extend(labels.cpu())
                mse_score += mse(y_true=labels.cpu(), y_pred=torch.argmax(outputs.cpu(), axis=1))  * inputs.size(0)

            if phase == 'train':
                scheduler.step()

            
            f1_score = f1(y_pred=y_preds, y_true=y_trues,average='micro')
            epoch_loss = running_loss / dataset_sizes[phase]
            mse_score = mse_score/dataset_sizes[phase]
            print('{} RMSE Loss: {:.4f}'.format(
                phase, np.sqrt(mse_score)))
            
            print('{} F1 Score: {:.4f}'.format(
                phase, f1_score))
            
            print('{} Loss: {:.4f}'.format(
                phase, np.sqrt(epoch_loss)))

            # deep copy the model
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

            if phase == 'val' and mse_score < best_mse:
                best_mse = mse_score
            if phase == 'val' and f1_score > best_f1:
                best_f1 = f1_score 

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Loss: {:4f}'.format(best_loss))
    print('Best val F1 Loss: {:4f}'.format(best_f1))
    print('Best val RMSE Loss: {:4f}'.format(np.sqrt(best_mse)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [124]:
imgs_arr_train = np.concatenate([imgs_arr_train, new_train_img], axis=0)
gt_train = np.concatenate([gt_train, new_train_gt], axis=0)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=7)

for train_idxs, test_idxs in sss.split(imgs_arr_train, gt_train):
    break


train_imgs_arr = imgs_arr_train[train_idxs]
train_gt = gt_train[train_idxs]
val_imgs_arr = imgs_arr_train[test_idxs]
val_gt = gt_train[test_idxs]

In [126]:
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomVerticalFlip(p=0.5),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

image_datasets = {'train': Wheat(train_imgs_arr, train_gt, 'train', data_transforms['train']),
                  'val': Wheat(val_imgs_arr, val_gt, 'val', data_transforms['val'])}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=8,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [127]:
model_ft = EfficientNet.from_pretrained('efficientnet-b1')

count = 0
for param in model_ft.parameters():
    param.requires_grad = False
    count += 1
    if count == 230:
      break


num_ftrs = model_ft._fc.in_features

model_ft._fc = nn.Linear(num_ftrs, 7) #if regression nn.Linear(num_ftrs, 1), if classification nn.Linear(num_ftrs, 7)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.AdamW(model_ft.parameters(), lr=0.01)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, device,
                       num_epochs=25)

Loaded pretrained weights for efficientnet-b1
Epoch 0/24
----------
train RMSE Loss: 1.1923
train F1 Score: 0.7013
train Loss: 0.9193
val RMSE Loss: 0.6733
val F1 Score: 0.8190
val Loss: 0.7263

Epoch 1/24
----------
train RMSE Loss: 0.9333
train F1 Score: 0.7693
train Loss: 0.7794
val RMSE Loss: 0.6733
val F1 Score: 0.8057
val Loss: 0.7626

Epoch 2/24
----------
train RMSE Loss: 0.8884
train F1 Score: 0.7886
train Loss: 0.7552
val RMSE Loss: 0.6873
val F1 Score: 0.7581
val Loss: 0.7906

Epoch 3/24
----------
train RMSE Loss: 0.8596
train F1 Score: 0.7892
train Loss: 0.7457
val RMSE Loss: 0.9866
val F1 Score: 0.8019
val Loss: 0.7332

Epoch 4/24
----------
train RMSE Loss: 0.8350
train F1 Score: 0.8108
train Loss: 0.7139
val RMSE Loss: 0.7635
val F1 Score: 0.8286
val Loss: 0.7207

Epoch 5/24
----------
train RMSE Loss: 0.8381
train F1 Score: 0.8087
train Loss: 0.7157
val RMSE Loss: 0.4976
val F1 Score: 0.8533
val Loss: 0.6387

Epoch 6/24
----------
train RMSE Loss: 0.8344
train F1 Score

In [129]:
def test(model, test_loader, device):
    model.eval()
    res_arr = []
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)   
            res_arr.append(outputs.detach().cpu().numpy())
    res_arr = np.concatenate(res_arr, axis = 0)
    return res_arr

In [100]:
imgs_arr_check = imgs_arr[lbl_quality == 1]
labels_arr_check = np.array(labels_arr)[lbl_quality == 1]
gt_check = gt[lbl_quality == 1]
lbl_quality_check = lbl_quality[lbl_quality == 1]

In [101]:
image_datasets['check'] = Wheat(imgs_arr_check, None, 'test', data_transforms['val'])
check_loader = torch.utils.data.DataLoader(image_datasets['check'], batch_size=4, shuffle=False, num_workers=16)
check_pred = test(model_ft, check_loader, device)

In [102]:
softmax = nn.Softmax()
preds = softmax(torch.tensor(check_pred)).numpy()
#check_pred

  


In [118]:
new_train_lbl, new_train_gt, new_train_img = [], [], []
for i in range(len(preds)):
  if np.max(preds[i]) > 0.9:
    new_train_img.append(imgs_arr_check[i])
    new_train_lbl.append(labels_arr_check[i])
    new_train_gt.append(np.argmax(preds[i])+1)

new_df = pd.DataFrame()
new_df['UID'] = new_train_lbl 
new_df['growth_stage'] = new_train_gt

In [119]:
new_df.groupby(by='growth_stage').count()

Unnamed: 0_level_0,UID
growth_stage,Unnamed: 1_level_1
2,268
3,161
4,671
5,19
7,1270


In [120]:
new_df.to_csv('new_train_first_iter', index=False)

In [130]:
image_datasets['test'] = Wheat(test_imgs_arr, None, 'test', data_transforms['val'])
test_loader = torch.utils.data.DataLoader(image_datasets['test'], batch_size=4,shuffle=False, num_workers=16)
test_pred = test(model_ft, test_loader, device)

In [131]:
set(np.argmax(test_pred, axis=1))

{1, 2, 3, 4, 6}

In [133]:
sub['UID'] = test_id_arr
sub['growth_stage'] = np.argmax(test_pred, axis=1)+1# if classification np.argmax(test_pred, axis=1)+1, if regression test_pred.flatten().tolist()
sub.to_csv('class_high_quality_data+new_train_first_iter_efficientnetb1_sub.csv', index = False)

In [35]:
count

301