In [1]:
import glob

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset
from tqdm import tqdm

%matplotlib inline



In [2]:
# Read the raw training annotations. As the preview below shows, every row
# carries an image_id, the image width/height, one bbox string and a source tag.
# Comparing with the cleaned table further down, bbox reads as
# [xmin, ymin, box_width, box_height].
train = pd.read_csv("../data/train.csv")
print(train.shape)  # (number of rows, number of columns)
train.head()

(147793, 5)


Unnamed: 0,image_id,width,height,bbox,source
0,b6ab77fd7,1024,1024,"[834.0, 222.0, 56.0, 36.0]",usask_1
1,b6ab77fd7,1024,1024,"[226.0, 548.0, 130.0, 58.0]",usask_1
2,b6ab77fd7,1024,1024,"[377.0, 504.0, 74.0, 160.0]",usask_1
3,b6ab77fd7,1024,1024,"[834.0, 95.0, 109.0, 107.0]",usask_1
4,b6ab77fd7,1024,1024,"[26.0, 144.0, 124.0, 117.0]",usask_1


In [21]:
# Read the cleaned annotations. In this table image_id already includes the
# ".jpg" extension and each box is stored as xmin / ymin / xmax / ymax columns.
wheat = pd.read_csv("wheat.csv")
print(wheat.shape)  # (number of rows, number of columns)
wheat.head()

(147793, 5)


Unnamed: 0,image_id,xmin,ymin,xmax,ymax
0,b6ab77fd7.jpg,834.0,222.0,890.0,258.0
1,b6ab77fd7.jpg,226.0,548.0,356.0,606.0
2,b6ab77fd7.jpg,377.0,504.0,451.0,664.0
3,b6ab77fd7.jpg,834.0,95.0,943.0,202.0
4,b6ab77fd7.jpg,26.0,144.0,150.0,261.0


### Train / Valid split

- Images from the **usask_1** and **rres_1** sources are held out as the validation set.
- Together they account for roughly 20% of the images.

In [17]:
# Hold out every image that comes from one of the two validation sources.
from_valid_source = train.source.isin(["usask_1", "rres_1"])
valid_id = train.loc[from_valid_source, "image_id"].unique()
len(valid_id)

632

In [19]:
# File names of the validation images. The cleaned table stores image_id with a
# ".jpg" suffix, so the raw ids need the same suffix before they can be matched.
# The endswith guard keeps this cell idempotent: running it a second time must
# not produce "xxx.jpg.jpg", which would silently empty the validation split.
valid_id = [
    img_id if img_id.endswith(".jpg") else img_id + ".jpg"
    for img_id in valid_id
]

In [25]:
# Partition the cleaned annotations: rows whose image is in valid_id go to the
# validation frame, everything else to the training frame.
in_valid = wheat.image_id.isin(valid_id)
valid_wheat = wheat.loc[in_valid].reset_index(drop=True)
train_wheat = wheat.loc[~in_valid].reset_index(drop=True)
print("train:", train_wheat.shape, "valid:", valid_wheat.shape)

train: (121750, 5) valid: (26043, 5)


In [26]:
# Preview the first rows of the training split.
train_wheat.head()

Unnamed: 0,image_id,xmin,ymin,xmax,ymax
0,44c60402e.jpg,830.0,21.0,896.0,78.0
1,44c60402e.jpg,752.0,0.0,804.0,49.0
2,44c60402e.jpg,321.0,0.0,467.0,32.0
3,44c60402e.jpg,363.0,32.0,471.0,97.0
4,44c60402e.jpg,555.0,63.0,612.0,109.0


In [27]:
# Distinct image file names present in the training split.
train_wheat.image_id.unique()

array(['44c60402e.jpg', '6e3da4ae3.jpg', 'd8616bdd1.jpg', ...,
       'a5c8d5f5c.jpg', 'e6b5e296d.jpg', '5e0747034.jpg'], dtype=object)

In [None]:
class WheatDataSet(Dataset):
    """Map-style dataset that serves wheat images as normalized RGB arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with an ``image_id`` column holding image file names
        (for example ``"44c60402e.jpg"``). Each distinct name is one sample.
    folder : str
        Directory that contains the image files.
    """

    def __init__(self, df, folder):
        # An image_id may appear on many rows of df; keep one entry per image.
        self.image_ids = df.image_id.unique()
        self.folder = folder

    def __len__(self):
        """Return the number of distinct images."""
        return len(self.image_ids)

    def __getitem__(self, index):
        """Load the image at ``index`` as a float32 RGB array divided by 255.

        Raises
        ------
        FileNotFoundError
            If the file cannot be read. ``cv2.imread`` signals failure by
            returning ``None`` instead of raising, so it is checked explicitly.
        """
        image_id = self.image_ids[index]
        path = f"{self.folder}/{image_id}"
        image = cv2.imread(path, cv2.IMREAD_COLOR)  # OpenCV decodes in B, G, R order
        if image is None:
            raise FileNotFoundError(f"could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0  # normalize to 0~1
        # NOTE(review): only the image is returned; bounding-box targets are not
        # attached here because the class keeps just the image ids.
        return image
        
        

In [42]:
# Decode one training image with PIL and display its pixel values as an array.
sample_path = "../data/train/44c60402e.jpg"
np.array(Image.open(sample_path))

array([[[ 11,  28,   9],
        [ 14,  31,  12],
        [ 18,  35,  16],
        ...,
        [ 95, 117,  55],
        [122, 142,  81],
        [126, 146,  85]],

       [[  9,  26,   7],
        [ 11,  28,   9],
        [ 14,  31,  12],
        ...,
        [137, 160,  92],
        [113, 136,  68],
        [ 90, 113,  45]],

       [[  7,  24,   5],
        [  8,  25,   6],
        [ 10,  27,   8],
        ...,
        [157, 185, 108],
        [112, 139,  62],
        [ 77, 104,  27]],

       ...,

       [[ 27,  72,   5],
        [ 56, 101,  34],
        [108, 153,  84],
        ...,
        [ 10,  33,   5],
        [  9,  32,   4],
        [  8,  33,   3]],

       [[  0,  39,   0],
        [ 20,  67,   0],
        [ 81, 126,  57],
        ...,
        [ 12,  35,   6],
        [ 10,  35,   3],
        [ 10,  35,   3]],

       [[  8,  58,   0],
        [ 19,  66,   0],
        [ 78, 125,  55],
        ...,
        [ 11,  35,   3],
        [ 10,  35,   3],
        [ 10,  36,   1]]