# Looking at our data

## 1. Load data

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import PIL
import PIL.Image

from tensorflow.keras import models, layers, preprocessing
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions



### 1.1 Load classification file

In [4]:
!pwd

/Users/Gui/Desktop/Data Science/Green Eye Project


In [5]:
!ls -la

total 109608
drwxr-xr-x  15 guillaumevanderwinden  staff       480 Nov 29 20:24 [1m[36m.[m[m
drwxr-xr-x   8 guillaumevanderwinden  staff       256 Nov 24 11:53 [1m[36m..[m[m
-rw-r--r--@  1 guillaumevanderwinden  staff      6148 Nov 28 21:31 .DS_Store
drwxr-xr-x   5 guillaumevanderwinden  staff       160 Nov 29 01:32 [1m[36m.ipynb_checkpoints[m[m
-rw-r--r--@  1 guillaumevanderwinden  staff  54805207 Nov 28 21:29 Firstmodel(16,frozenlayers,preprocessed).zip
-rw-r--r--   1 guillaumevanderwinden  staff      2912 Nov 24 14:33 Gui_Preprocessing.ipynb
-rw-r--r--   1 guillaumevanderwinden  staff     67226 Nov 24 14:26 Untitled.ipynb
drwxr-xr-x@  7 guillaumevanderwinden  staff       224 Nov 12 22:18 [1m[36mUnvalid__planet-understanding-the-amazon-from-space[m[m
drwxr-xr-x@  9 guillaumevanderwinden  staff       288 Nov 28 21:33 [1m[36mgreen-eye dataset[m[m
drwxr-xr-x@  6 guillaumevanderwinden  staff       192 Nov 24 16:34 [1m[36msolution_06-Deep-Learning_03-Convol

In [4]:
train_classes = pd.read_csv("raw_data/EncodedData.csv")

### 1.2. Edit classification data

In [5]:
labels = []
for i in train_classes['tags']:
    for x in i.split(' '):
        if x not in labels:
            labels.append(x)

labels

['haze',
 'primary',
 'agriculture',
 'clear',
 'water',
 'habitation',
 'road',
 'cultivation',
 'slash_burn',
 'cloudy',
 'partly_cloudy',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'blooming',
 'selective_logging',
 'blow_down']

In [6]:
train_classes['tags'] = train_classes['tags'].apply(lambda x: x.split(' '))

In [7]:
train_classes['image_name'] = train_classes['image_name'].apply(lambda x: (x + '.jpg') if '.jpg' not in x else x)
train_classes.head()

Unnamed: 0,image_name,tags,taglist,clear,cloudy,haze,partly cloudy,agriculture,artisinal mine,bare ground,...,blow down,cultivation,habitation,primary,road,selective logging,conventional mine,slashu burn,water,result_id
0,train_0.jpg,"[haze, primary]","['haze', 'primary']",0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'00100000000100000'
1,train_1.jpg,"[agriculture, clear, primary, water]","['clear', 'agriculture', 'primary', 'water']",1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,'10001000000100001'
2,train_2.jpg,"[clear, primary]","['clear', 'primary']",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'10000000000100000'
3,train_3.jpg,"[clear, primary]","['clear', 'primary']",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'10000000000100000'
4,train_4.jpg,"[agriculture, clear, habitation, primary, road]","['clear', 'agriculture', 'habitation', 'primar...",1,0,0,0,1,0,0,...,0,0,1,1,1,0,0,0,0,'10001000001110000'


### 1.3. Make train and test data

In [8]:
train=train_classes.sample(frac=0.7,random_state=42)
test=train_classes.drop(train.index)

In [9]:
test.shape

(12144, 21)

In [10]:
datagen=preprocessing.image.ImageDataGenerator(rescale=1./255, validation_split=0.25)
train_gen=datagen.flow_from_dataframe(dataframe=train, 
                                directory="raw_data/train-jpg", x_col="image_name", y_col="tags", subset="training", seed=42,
                                shuffle=True, class_mode="categorical", target_size=(224,224), batch_size=32, classes=labels)
valid_gen=datagen.flow_from_dataframe(dataframe=train, 
                                directory="raw_data/train-jpg", x_col="image_name", y_col="tags", subset="validation", seed=42,
                                shuffle=True, class_mode="categorical", target_size=(224,224), batch_size=32, classes=labels)


Found 21252 validated image filenames belonging to 17 classes.
Found 7083 validated image filenames belonging to 17 classes.


In [123]:
datagen_test=preprocessing.image.ImageDataGenerator(rescale=1./255)
test_gen=datagen_test.flow_from_dataframe(dataframe=test, 
                                directory="raw_data/train-jpg", x_col="image_name", y_col="tags",
                                class_mode="categorical", target_size=(224,224), batch_size=32, classes=labels)

Found 12144 validated image filenames belonging to 17 classes.


In [112]:
print(train_gen.n)
print(train_gen.batch_size)

28335
32


In [130]:
next(iter(train_gen))[0].shape

(32, 224, 224, 3)

### 1.4. Make model

In [9]:
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=(224,224,3),
    pooling='avg')

x = base_model.output

# let's add a fully-connected layer
x = layers.Dense(2048, activation='relu')(x)

# and a logistic layer -- let's say we have 200 classes
predictions = layers.Dense(17, activation='softmax')(x)

# this is the model we will train
model = models.Model(inputs=base_model.input, outputs=predictions)

for layer in base_model.layers:
    layer.trainable = False

In [10]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [11]:
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1_conv[0][0]                 
___________________________________________________________________________________________

conv5_block1_3_bn (BatchNormali (None, 7, 7, 2048)   8192        conv5_block1_3_conv[0][0]        
__________________________________________________________________________________________________
conv5_block1_add (Add)          (None, 7, 7, 2048)   0           conv5_block1_0_bn[0][0]          
                                                                 conv5_block1_3_bn[0][0]          
__________________________________________________________________________________________________
conv5_block1_out (Activation)   (None, 7, 7, 2048)   0           conv5_block1_add[0][0]           
__________________________________________________________________________________________________
conv5_block2_1_conv (Conv2D)    (None, 7, 7, 512)    1049088     conv5_block1_out[0][0]           
__________________________________________________________________________________________________
conv5_block2_1_bn (BatchNormali (None, 7, 7, 512)    2048        conv5_block2_1_conv[0][0]        
__________

In [None]:
STEP_SIZE_TRAIN=train_gen.n//train_gen.batch_size
STEP_SIZE_VALID=valid_gen.n//valid_gen.batch_size
STEP_SIZE_TEST=test_gen.n//test_gen.batch_size
history = model.fit_generator(generator=train_gen,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_gen,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=10
)

Epoch 1/10
Epoch 2/10

## 3. Explore data

In [None]:
for i in labels:
    train_classes[i] = train_classes['tags'].apply(lambda x:1 if i in x else 0)
train_classes.head()

In [9]:
def conditional_frequency(a, b):
    v = (a * b).sum()/b.sum()
    return v
train_classes.drop(weather, axis=1).corr(method=conditional_frequency)

Unnamed: 0,primary,agriculture,water,habitation,road,cultivation,slash_burn,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down
primary,1.0,0.972148,0.944677,0.947814,0.957502,0.995086,1.0,0.94,0.792343,0.955752,1.0,1.0,1.0
agriculture,0.972148,1.0,0.365943,0.747814,0.747615,0.7543,0.569378,0.24,0.261021,0.112094,0.096386,0.191176,0.22449
water,0.944677,0.365943,1.0,0.25,0.263288,0.19388,0.114833,0.26,0.238979,0.882006,0.048193,0.144118,0.030612
habitation,0.947814,0.747814,0.25,1.0,0.345186,0.199911,0.196172,0.36,0.189095,0.085546,0.012048,0.038235,0.030612
road,0.957502,0.747615,0.263288,0.345186,1.0,0.289033,0.172249,0.59,0.37471,0.324484,0.03012,0.444118,0.020408
cultivation,0.995086,0.7543,0.19388,0.199911,0.289033,1.0,0.602871,0.04,0.103248,0.053097,0.105422,0.170588,0.081633
slash_burn,1.0,0.569378,0.114833,0.196172,0.172249,0.602871,1.0,0.0,0.011601,0.0,0.006024,0.005882,0.020408
conventional_mine,0.94,0.24,0.26,0.36,0.59,0.04,0.0,1.0,0.011601,0.011799,0.0,0.0,0.0
bare_ground,0.792343,0.261021,0.238979,0.189095,0.37471,0.103248,0.011601,0.011601,1.0,0.117994,0.009036,0.038235,0.040816
artisinal_mine,0.955752,0.112094,0.882006,0.085546,0.324484,0.053097,0.0,0.011799,0.117994,1.0,0.0,0.017647,0.0


In [22]:
train_classes[train_classes['primary']==1].mean(axis=0)

primary              0.94
agriculture          0.24
water                0.26
habitation           0.36
road                 0.59
cultivation          0.04
slash_burn           0.00
conventional_mine    1.00
bare_ground          0.10
artisinal_mine       0.04
blooming             0.00
selective_logging    0.00
blow_down            0.00
clear                0.70
haze                 0.02
cloudy               0.00
partly_cloudy        0.28
dtype: float64

In [23]:
train_classes[train_classes['artisinal_mine']==1].mean(axis=0)

primary              0.955752
agriculture          0.112094
water                0.882006
habitation           0.085546
road                 0.324484
cultivation          0.053097
slash_burn           0.000000
conventional_mine    0.011799
bare_ground          0.117994
artisinal_mine       1.000000
blooming             0.000000
selective_logging    0.017699
blow_down            0.000000
clear                0.905605
haze                 0.014749
cloudy               0.000000
partly_cloudy        0.079646
dtype: float64

In [24]:
train_classes[train_classes['blooming']==1].mean(axis=0)

primary              1.000000
agriculture          0.096386
water                0.048193
habitation           0.012048
road                 0.030120
cultivation          0.105422
slash_burn           0.006024
conventional_mine    0.000000
bare_ground          0.009036
artisinal_mine       0.000000
blooming             1.000000
selective_logging    0.021084
blow_down            0.003012
clear                0.936747
haze                 0.012048
cloudy               0.000000
partly_cloudy        0.051205
dtype: float64

In [45]:
train_classes[train_classes['primary'] + train_classes['cloudy']==0].mean(axis=0)

primary              0.000000
agriculture          0.391106
water                0.467503
habitation           0.217788
road                 0.391106
cultivation          0.025086
slash_burn           0.000000
conventional_mine    0.006842
bare_ground          0.204105
artisinal_mine       0.017104
blooming             0.000000
selective_logging    0.000000
blow_down            0.000000
clear                0.870011
haze                 0.030787
cloudy               0.000000
partly_cloudy        0.098062
dtype: float64