<a href="https://colab.research.google.com/github/KolatimiDave/Amazon-Forest-Project/blob/master/Amazon_Planet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Relevant Packages
import os
import cv2
import time
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import roc_curve, auc, roc_auc_score,fbeta_score
from tensorflow.keras.applications import InceptionV3, VGG16, ResNet50, ResNet152
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Flatten,Conv2D, GlobalAveragePooling2D, BatchNormalization


In [None]:
# Dataset locations and metadata tables.
# Download the dataset from 'https://www.kaggle.com/nikitarom/planets-dataset'
# before running this cell.

# Image directories: training set, main test set and the additional test set.
train_path = '/content/planet/train-jpg/'
TRAIN_DIR = '/content/planet/train-jpg'
TEST_DIR = '/content/planet/test-jpg'
TEST_DIR_1 = '/content/test-jpg-additional'

# CSV tables: training labels and the sample submission file.
train_csv = pd.read_csv('/content/planet/train_classes.csv')
sample = pd.read_csv('/content/planet/sample_submission.csv')

# List the GPUs TensorFlow can see (an empty list means none were found).
print('GPU AVAILABILITY', tf.config.list_physical_devices('GPU'))


In [None]:
# Collect every distinct tag, keeping the order in which tags first appear.
# dict.fromkeys de-duplicates while preserving insertion order.
label_list = list(dict.fromkeys(
    tag
    for tag_string in train_csv.tags.values
    for tag in tag_string.split(' ')
))

In [None]:
# Add one 0/1 indicator column per label.
# Split each tag string once, then test membership for every label.
split_tags = train_csv['tags'].apply(lambda tag_string: tag_string.split(' '))
for label in label_list:
    train_csv[label] = split_tags.apply(lambda tags: 1 if label in tags else 0)


In [None]:
# Build the test ids (file name up to the first '.') and the image paths.
# First test directory
test_files = list(tqdm(os.listdir(TEST_DIR)))
image_name = [file_name.split('.')[0] for file_name in test_files]
image_path = [TEST_DIR + '/' + file_name for file_name in test_files]

# Second test directory
extra_test_files = list(tqdm(os.listdir(TEST_DIR_1)))
image_name_1 = [file_name.split('.')[0] for file_name in extra_test_files]
image_path_1 = [TEST_DIR_1 + '/' + file_name for file_name in extra_test_files]


In [None]:
# Append the second directory's entries to the first directory's lists (in place).
image_name += image_name_1  # names of all test images
image_path += image_path_1  # paths of all test images


In [None]:
# Create the dataframes consumed by Keras' flow_from_dataframe.
# Train
# Use the `columns=` keyword: the positional-axis form drop('tags', 1) was
# deprecated and then removed in pandas 2.0.
train_data = train_csv.drop(columns='tags')
train_names = train_data.image_name.values
# Full path of each training image: <train_path><image_name>.jpg
train_data['filepath'] = train_path + train_names + '.jpg'
train = train_data.copy()

# Test
test_names = pd.Series(image_name).values
test_dict = {'filepath': image_path, 'image_name': test_names}
test = pd.DataFrame(data=test_dict)



In [None]:
# Sanity check: (rows, columns) of the train and test frames.
print('train and test shapes', train.shape, test.shape)


In [None]:
# Positional split: the first 36500 rows are used for training, the rest for validation.
df_train = train.iloc[:36500]
df_val = train.iloc[36500:]
print('train and validation shapes', df_train.shape, df_val.shape)


In [None]:
# Seed Python, NumPy and TensorFlow random generators for reproducibility.
SEED = 20
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)


In [None]:
# Data generators
batch_size = 128
image_size = (224, 224)
learning_rate = 0.0001

# Number of batches needed to cover each split once. np.ceil returns a float,
# while Keras documents steps_per_epoch / validation_steps as integers, so cast.
train_steps = int(np.ceil(len(df_train) / batch_size))
val_steps = int(np.ceil(len(df_val) / batch_size))

# Label columns: every column except the bookkeeping ones.
classes = [i for i in train.columns.to_list() if i not in ['tags', 'image_name', 'filepath']]

# Training images are augmented (flips, shear, zoom, shifts, channel shift)
# after the model-specific preprocessing function is applied.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,horizontal_flip=True,
                             vertical_flip=True,shear_range=10,zoom_range=0.2,width_shift_range=0.1,
                             height_shift_range=0.1,channel_shift_range=10.)

# Validation and test images only get the preprocessing function, no augmentation.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

train_ds = train_datagen.flow_from_dataframe(df_train,x_col='filepath', y_col=classes,
                                        target_size=image_size,classes=classes,
                                        batch_size=batch_size,
                                        class_mode='raw', shuffle=True, seed=SEED)

# shuffle=False keeps predictions aligned with the row order of df_val.
val_ds = val_datagen.flow_from_dataframe(df_val,x_col='filepath', y_col=classes,
                                        target_size=image_size,classes=classes,
                                        batch_size=batch_size,
                                        class_mode='raw', shuffle=False, seed=SEED)

# class_mode=None: the test generator yields images only; shuffle=False keeps
# predictions aligned with the row order of `test`.
test_ds = val_datagen.flow_from_dataframe(test, x_col='filepath',target_size=image_size, class_mode=None,
                                          shuffle=False,batch_size=batch_size)

In [None]:
# Validation targets: df_val without its image_name and filepath columns.
y_valid_df = df_val.drop(['image_name', 'filepath'], axis=1)


In [None]:
# Evaluate Model
def ModelEvaluator(model_eval_preds, h5_name, threshold=0.5):
    """Print and return the sample-averaged F-beta (beta=2) validation score.

    Args:
        model_eval_preds: Predicted scores for the validation rows; values
            strictly greater than ``threshold`` count as positive.
        h5_name: Model name, used only in the printed message.
        threshold: Cut-off applied to the predicted scores.

    Returns:
        The value returned by ``fbeta_score`` against the module-level
        ``y_valid_df``.
    """
    binary_preds = np.array(model_eval_preds) > threshold
    score = fbeta_score(y_valid_df, binary_preds, beta=2, average='samples')
    message = '{} evaluation on validation_data with a threshold of {} = {}'
    print(message.format(h5_name, threshold, score))
    return score

In [None]:
# Model Submission Files Generation

def PredictionsClipper(preds, threshold=0.05):
    """Turn predicted scores into per-sample lists of label names.

    Args:
        preds: 2-D array-like of scores with one column per column of the
            module-level ``y_valid_df``, in the same order.
        threshold: A label is kept when its score is strictly greater than
            this value.

    Returns:
        A list with one entry per row of ``preds``; each entry is the list
        of label names whose score exceeds ``threshold``, in column order.
    """
    label_names = y_valid_df.columns.to_list()
    # Building the frame checks that preds has one column per label name.
    scores = pd.DataFrame(preds, columns=label_names)
    # One vectorised comparison replaces a per-cell .iloc lookup, which is
    # very slow when there are many rows.
    above_threshold = scores.values > threshold

    return [
        [name for name, keep in zip(label_names, row) if keep]
        for row in above_threshold
    ]


In [None]:
def PredictionsFormater(pred_features):
    """Convert lists of predicted tags to the submission string format.

    Args:
        pred_features: Iterable of tag-name lists, one list per sample.

    Returns:
        A list of strings, one per sample, with that sample's tags joined
        by single spaces. A sample with no tags yields an empty string.
    """
    # str.join produces the same result as appending "tag " repeatedly and
    # trimming the trailing space.
    return [' '.join(tags) for tags in pred_features]

In [None]:

def PredictionsToCsv(new_cols, sub_name):
    """Write a submission CSV and return it as a DataFrame.

    Args:
        new_cols: Sequence of tag strings, one per test image.
        sub_name: Path of the CSV file to write.

    Returns:
        The DataFrame that was written, with columns ``image_name`` (taken
        from the module-level ``test`` frame) and ``tags``.
    """
    sub = pd.DataFrame({'image_name': test.image_name, 'tags': pd.Series(new_cols)})
    sub.to_csv(sub_name, index=False)  # index=False: do not write the row index

    return sub



In [None]:
class Network:
    """Transfer-learning pipeline: build, train, evaluate, export predictions.

    The methods are meant to be called in order: ``BuildModel`` ->
    ``TrainModel`` -> ``EvalModel`` -> ``ClipPredictions`` ->
    ``FormatPredictions`` -> ``ToCsv``. Each step stores its result on the
    instance for the next one.

    The class relies on module-level names defined elsewhere in the file:
    ``learning_rate``, ``train_ds``, ``val_ds``, ``test_ds``, ``train_steps``,
    ``val_steps`` and the helper functions ``ModelEvaluator``,
    ``PredictionsClipper``, ``PredictionsFormater`` and ``PredictionsToCsv``.
    """

    def __init__(self):
        """Initialise every piece of pipeline state to ``None``."""
        self.model = None             # Keras model built by BuildModel
        self.history = None           # return value of model.fit
        self.h5_name = None           # name prefix of the checkpoint file
        self.preds = None             # predictions on the test generator
        self.model_eval_preds = None  # predictions on the validation generator
        self.threshold = None         # cut-off set by EvalModel
        self.score = None             # validation score from EvalModel
        self.new_features = None      # per-sample tag lists from ClipPredictions
        self.new_cols = None          # per-sample tag strings from FormatPredictions

    def BuildModel(self, pretrained):
        """Build a frozen pretrained backbone with a new classification head.

        Args:
            pretrained: Callable that accepts ``include_top``, ``weights``
                and ``input_shape`` and returns a Keras model (for example
                one of the ``keras.applications`` constructors).

        Returns:
            The assembled model, also stored in ``self.model``.
        """
        # include_top=False drops the backbone's original classification layers.
        base_model = pretrained(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

        # Freeze the backbone so only the new head is trained.
        for layer in base_model.layers:
            layer.trainable = False

        x = base_model.output
        x = GlobalAveragePooling2D()(x)
        x = Dropout(0.1)(x)
        x = BatchNormalization()(x)
        x = Dense(3064, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        # 17 independent sigmoid outputs for multi-label prediction.
        # NOTE(review): 17 presumably equals the number of label columns — confirm.
        output = Dense(17, activation='sigmoid')(x)

        self.model = Model(base_model.input, output)

        return self.model

    def TrainModel(self, h5_name, nb_epochs=30, patience=5):
        """Compile and fit the model, then predict on test and validation data.

        Args:
            h5_name: Prefix of the checkpoint file
                (``<h5_name>_Amazon_Forest.h5`` in the current directory).
            nb_epochs: Maximum number of training epochs.
            patience: Epochs without ``val_loss`` improvement before early
                stopping.

        Returns:
            Tuple ``(test predictions, fitted model, fit history)``.
        """
        self.h5_name = h5_name
        # `learning_rate` is the supported keyword; the older `lr` alias is
        # deprecated and rejected by current Keras releases.
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['AUC'])

        earlystop = EarlyStopping(monitor='val_loss', patience=patience, verbose=1, restore_best_weights=True)
        chkpt_path = os.path.join("./", self.h5_name + '_Amazon_Forest.h5')
        checkpoint = ModelCheckpoint(chkpt_path, monitor='val_loss', mode='auto', verbose=1, save_best_only=True)

        # int(...) guards against float step counts (e.g. produced by np.ceil).
        self.history = self.model.fit(train_ds, epochs=nb_epochs, steps_per_epoch=int(train_steps),
                                      callbacks=[earlystop, checkpoint], verbose=1, shuffle=False,
                                      validation_data=val_ds, validation_steps=int(val_steps))

        start = time.time()  # measure inference time on the test dataset
        self.preds = self.model.predict(test_ds)
        inference_time = time.time() - start
        print('{} inference time = {:.2f} minutes'.format(h5_name, inference_time/60))

        # Validation predictions are consumed later by EvalModel.
        self.model_eval_preds = self.model.predict(val_ds)
        print('first', type(self.model_eval_preds))

        return self.preds, self.model, self.history

    def EvalModel(self, threshold=0.5):
        """Score the validation predictions and remember the threshold used.

        Args:
            threshold: Cut-off passed to ``ModelEvaluator``; it is also
                stored and reused by ``ClipPredictions``.

        Returns:
            The score returned by ``ModelEvaluator``.
        """
        self.threshold = threshold
        self.score = ModelEvaluator(self.model_eval_preds, self.h5_name, self.threshold)

        return self.score

    def ClipPredictions(self):
        """Convert test predictions to tag lists using the stored threshold.

        ``EvalModel`` must have been called first; otherwise
        ``self.threshold`` is still ``None`` and is forwarded as-is.

        Returns:
            The per-sample tag lists, also stored in ``self.new_features``.
        """
        self.new_features = PredictionsClipper(self.preds, self.threshold)

        return self.new_features

    def FormatPredictions(self):
        """Join each sample's tag list into a submission string.

        Returns:
            The per-sample tag strings, also stored in ``self.new_cols``.
        """
        self.new_cols = PredictionsFormater(self.new_features)

        return self.new_cols

    def ToCsv(self, sub_name):
        """Write the formatted predictions to ``sub_name``.

        Returns:
            The submission DataFrame returned by ``PredictionsToCsv``.
        """
        sub = PredictionsToCsv(self.new_cols, sub_name)

        return sub


In [None]:
def load_trained_model(h5_name):
    """Load ``<h5_name>_Amazon_Forest.h5`` from the current directory.

    The file is only loaded when at least one entry of the current
    directory starts with ``h5_name``.

    Args:
        h5_name: Name prefix of the saved model.

    Returns:
        The loaded model, or ``None`` (after printing a message) when no
        matching entry exists or loading raises ``OSError``.
    """
    try:
        prefix_found = any(entry.startswith(h5_name) for entry in os.listdir('./'))
        if not prefix_found:
            print('.h5 Path incorrect ')
            return None
        return load_model(h5_name+'_Amazon_Forest.h5')
    except OSError:
        print('unable to load .h5 file from specified path')
    return None

    

In [None]:
def MakeCSVs(pretrained, h5_name, sub_name, threshold=0.25):
    """Run the whole pipeline for one backbone and write its submission CSV.

    Builds the model, trains it, scores it on the validation data, converts
    the test predictions to tags and writes them to ``sub_name``.

    Args:
        pretrained: Backbone constructor forwarded to ``Network.BuildModel``.
        h5_name: Checkpoint name prefix forwarded to ``Network.TrainModel``.
        sub_name: Path of the submission CSV to write.
        threshold: Cut-off used for both validation scoring and for turning
            test predictions into tags. Previously this argument was ignored
            and 0.25 was always used; the default is 0.25 so calls that rely
            on the default behave exactly as before.

    Returns:
        Tuple ``(model, validation score, test predictions)``.
    """
    network = Network()
    model = network.BuildModel(pretrained)
    preds, _, _ = network.TrainModel(h5_name=h5_name, nb_epochs=30)
    # EvalModel stores the threshold that ClipPredictions reuses, so it has to run first.
    score = network.EvalModel(threshold=threshold)
    network.ClipPredictions()
    network.FormatPredictions()
    network.ToCsv(sub_name)

    return model, score, preds


In [None]:
# ResNet152 transfer learning: train, evaluate and write res152.csv
res152_trained, fbeta_res152, res152_preds = MakeCSVs(
    pretrained=ResNet152,
    h5_name='res152_model',
    sub_name='res152.csv',
)


In [None]:
# VGG16 transfer learning: train, evaluate and write vgg16.csv
vgg16_trained, fbeta_vgg16, vgg16_preds = MakeCSVs(
    pretrained=VGG16,
    h5_name='vgg16_model',
    sub_name='vgg16.csv',
)


In [None]:
# ResNet50 transfer learning: train, evaluate and write res50.csv
res50_trained, fbeta_res50, res50_preds = MakeCSVs(
    pretrained=ResNet50,
    h5_name='res50_model',
    sub_name='res50.csv',
)


In [None]:
# Weighted blend of the three models' test predictions (weights 0.4 / 0.2 / 0.4).
blended_pred = (0.4 * res152_preds) + (0.2 * vgg16_preds) + (0.4 * res50_preds)

In [None]:
# Convert the blended predictions to tags and write the blended submission.
# `V_preds` was never defined; the blend computed above is `blended_pred`.
blended_features = PredictionsClipper(blended_pred, threshold=0.22)
blended_columns = PredictionsFormater(blended_features)
blended_sub = PredictionsToCsv(blended_columns, sub_name='Blended.csv')

In [None]:
## ResNet152 LB score of 0.90984
## VGG16 LB score of 0.90280
## ResNet50 LB score of 0.91107

## Blended LB score of 0.91630