In [1]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
from helpers import DataExplorator

data_explorator = DataExplorator()

# dataset

the dataset provides training and validation images separately

In [3]:
data_train = data_explorator.get_train_images()
data_train

Unnamed: 0,class,image
0,black_measles,grape-dataset/train/black_measles/d283be4c-a24...
1,black_measles,grape-dataset/train/black_measles/a3b0be25-d00...
2,black_measles,grape-dataset/train/black_measles/83312d48-c63...
3,black_measles,grape-dataset/train/black_measles/37642d48-d6b...
4,black_measles,grape-dataset/train/black_measles/337aedbd-4df...
5,black_measles,grape-dataset/train/black_measles/72eb1763-849...
6,black_measles,grape-dataset/train/black_measles/0fcfc51c-169...
7,black_measles,grape-dataset/train/black_measles/1f772cab-21e...
8,black_measles,grape-dataset/train/black_measles/ace2440f-b13...
9,black_measles,grape-dataset/train/black_measles/9d13bf06-829...


In [4]:
data_valid = data_explorator.get_valid_images()
data_valid

Unnamed: 0,class,image
0,black_measles,grape-dataset/valid/black_measles/01c74b8d-111...
1,black_measles,grape-dataset/valid/black_measles/0ad02171-f9d...
2,black_measles,grape-dataset/valid/black_measles/ef4421c9-4eb...
3,black_measles,grape-dataset/valid/black_measles/8ee91739-195...
4,black_measles,grape-dataset/valid/black_measles/b6f9d1c9-f43...
5,black_measles,grape-dataset/valid/black_measles/f751baa1-2fd...
6,black_measles,grape-dataset/valid/black_measles/83691e0b-04e...
7,black_measles,grape-dataset/valid/black_measles/3c4c6822-92f...
8,black_measles,grape-dataset/valid/black_measles/d98dd678-86d...
9,black_measles,grape-dataset/valid/black_measles/f16c2e0b-32f...


In [5]:
def histogram(img):
    #initializing
    WB = np.zeros(256)
    WG = np.zeros(256)
    WR = np.zeros(256)
    
    l, c, z = img.shape
    
    B = img[:,:,0]
    G = img[:,:,1]
    R = img[:,:,2]
    
    #count pixels by channel
    for i in range(l):
        for j in range(c):
            WB[B[i,j]] = WB[B[i,j]]+1
            WG[G[i,j]] = WG[G[i,j]]+1
            WR[R[i,j]] = WR[R[i,j]]+1
    
    #normalizing
    for i in range(256):
        WB[i] = WB[i]/(l*c)
        WG[i] = WG[i]/(l*c)
        WR[i] = WR[i]/(l*c)

    return [WB,WG,WR]

In [6]:
def extract_features(img):
    features = []
    hist = histogram(img)

    soma_B = 0
    soma_G = 0
    soma_R = 0

    for i in range(256):
        if((i+1)%64 == 0):
            features.append(soma_B)
            features.append(soma_G)
            features.append(soma_R)
            soma_B = 0
            soma_G = 0
            soma_R = 0
        else:
            soma_B+=hist[0][i]
            soma_G+=hist[1][i]
            soma_R+=hist[2][i]
    
    return features

In [7]:
classes_codes = {"healthy": 0, "black_measles": 1, "leaf_blight": 2, "black_rot": 3}

In [8]:
def get_features(img_path, img_class):
    image = cv2.imread(img_path)
    features = extract_features(image)
    
    features_dict = dict(zip(["feature_"+str(i+1) for i in range(len(features))], features))
    features_dict["class"] = classes_codes[img_class]
    return features_dict

In [9]:
data_train_size = data_train.shape[0]

In [13]:
%time data_train_features = [get_features(img, img_class) for img, img_class in zip(data_train['image'], data_train['class'])]

CPU times: user 16min 29s, sys: 670 ms, total: 16min 29s
Wall time: 16min 46s


In [14]:
data_train_df = pd.DataFrame(data_train_features)
data_train_df

Unnamed: 0,class,feature_1,feature_10,feature_11,feature_12,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,1,0.257675,0.090347,0.088745,0.121277,0.127579,0.192993,0.170944,0.254974,0.201874,0.469009,0.514191,0.469711
1,1,0.360321,0.124649,0.131256,0.146835,0.103897,0.265533,0.132339,0.326385,0.195496,0.370255,0.424332,0.377029
2,1,0.247589,0.107758,0.133209,0.233337,0.091492,0.169022,0.125031,0.209625,0.174805,0.505905,0.547760,0.405075
3,1,0.344940,0.059830,0.076736,0.123825,0.087921,0.300095,0.234116,0.441254,0.244919,0.347946,0.379623,0.314316
4,1,0.369858,0.086670,0.104034,0.155609,0.145950,0.210892,0.143402,0.288635,0.221146,0.387314,0.447495,0.396805
5,1,0.424637,0.054626,0.066269,0.080307,0.138046,0.255249,0.105331,0.287827,0.215332,0.406403,0.494934,0.435577
6,1,0.324661,0.068115,0.086807,0.152634,0.189499,0.286057,0.106613,0.197067,0.111664,0.488297,0.511978,0.433807
7,1,0.345322,0.058762,0.067917,0.079636,0.109421,0.278992,0.102890,0.294662,0.127411,0.482742,0.514755,0.502777
8,1,0.312988,0.014587,0.014008,0.015121,0.049683,0.296661,0.167923,0.403870,0.194305,0.493408,0.519363,0.481689
9,1,0.371597,0.031906,0.045502,0.077301,0.187897,0.298691,0.188400,0.290863,0.186783,0.395340,0.461990,0.423340


In [21]:
data_dir = "data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [22]:
data_train_df.to_csv("{}/train.csv".format(data_dir), index=False)

In [17]:
%time data_valid_features = [get_features(img, img_class) for img, img_class in zip(data_valid['image'], data_valid['class'])]

CPU times: user 4min 7s, sys: 161 ms, total: 4min 7s
Wall time: 4min 8s


In [18]:
data_valid_df = pd.DataFrame(data_valid_features)
data_valid_df

Unnamed: 0,class,feature_1,feature_10,feature_11,feature_12,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,1,0.341400,0.113556,0.117584,0.135681,0.155563,0.233658,0.110641,0.243515,0.174789,0.422653,0.467911,0.441391
1,1,0.278732,0.074310,0.072266,0.099884,0.109833,0.225998,0.110703,0.244080,0.134048,0.525024,0.559814,0.527084
2,1,0.251389,0.005539,0.034775,0.018646,0.112518,0.214706,0.222687,0.217407,0.201508,0.509705,0.625031,0.556610
3,1,0.398956,0.017914,0.029099,0.059113,0.181381,0.284012,0.138046,0.282242,0.185898,0.435150,0.493988,0.457962
4,1,0.281158,0.025421,0.038208,0.079086,0.123215,0.225052,0.164139,0.245834,0.176300,0.518951,0.579697,0.505722
5,1,0.377304,0.038361,0.047836,0.059113,0.137482,0.294907,0.172455,0.339752,0.212082,0.400711,0.461197,0.421875
6,1,0.380524,0.058411,0.095520,0.208664,0.111786,0.310776,0.146896,0.312744,0.187393,0.402267,0.465210,0.275284
7,1,0.328705,0.054886,0.056137,0.078186,0.062225,0.152969,0.142029,0.308182,0.255142,0.464615,0.562134,0.499207
8,1,0.359192,0.114014,0.118896,0.141907,0.117691,0.277405,0.153030,0.316360,0.207626,0.361572,0.432251,0.357697
9,1,0.355331,0.142670,0.146942,0.163803,0.128601,0.282791,0.076675,0.270035,0.135681,0.414154,0.438965,0.403976


In [23]:
data_valid_df.to_csv("{}/valid.csv".format(data_dir), index=False)