# Kaggle Planet Competition: How to land in top 4%

## Extra Material
fastai api implementation: https://medium.com/ai-saturdays/kaggle-planet-competition-how-to-land-in-top-4-a679ff0013ba, https://github.com/irshadqemu/Kaggle-Competitions/blob/master/Planet_amazon_resnet34.ipynb

p7zip-full install step: http://ask.xmodulo.com/install-7zip-linux.html

data exploration & analysis(full detail): https://www.kaggle.com/anokas/data-exploration-analysis

other pytorch model: https://www.kaggle.com/mratsim/starting-kit-for-pytorch-deep-learning, https://github.com/mratsim/Amazon-Forest-Computer-Vision

# Data Exploration & Analysis

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pal = sns.color_palette()

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [None]:
# List every entry directly under ./data with its size in MB. For a
# sub-directory, the sizes of its immediate children are summed and the
# number of children is shown as well.
print('# File sizes')
for entry in os.listdir('./data'):
    entry_path = './data/' + entry
    if os.path.isdir(entry_path):
        child_sizes_mb = [
            os.path.getsize(entry_path + '/' + child) / 1000000
            for child in os.listdir(entry_path)
        ]
        total_mb = round(sum(child_sizes_mb), 2)
        print(entry.ljust(30) + str(total_mb) + 'MB' + ' ({} files)'.format(len(child_sizes_mb)))
    else:
        size_mb = round(os.path.getsize(entry_path) / 1000000, 2)
        print(entry.ljust(30) + str(size_mb) + 'MB')

In [None]:
# Load the training labels table and preview its first rows.
# NOTE(review): the training cells further down read `train_v2.csv`, not
# `train.csv` — confirm which file is actually present under ./data.
df_train = pd.read_csv('./data/train.csv')
df_train.head()

In [None]:
from collections import Counter, defaultdict

# Each `tags` cell is a space-separated string; split it into a list of tags.
labels = df_train['tags'].apply(lambda x: x.split(' '))

# Number of occurrences of every tag across all training rows
# (keys keep first-seen order, which later cells rely on for axis order).
counts = Counter(tag for row_tags in labels for tag in row_tags)

data = [go.Bar(x=list(counts.keys()), y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)
# Plot the whole figure: passing only `data` silently dropped the layout
# (size and title) that was built above.
py.iplot(fig, filename='train-label-dist')

### what is co-occurrence matrix?
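A co-occurrence matrix or co-occurrence distribution is a matrix that is defined over an image to be the distribution of co-occurring pixel values (grayscale values, or colors) at a given offset. Here the same idea is applied to labels instead of pixels: entry (i, j) of the matrix below is the fraction of training images tagged with label i that are also tagged with label j.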
link: https://en.wikipedia.org/wiki/Co-occurrence_matrix

In [None]:
# Co-occurrence matrix of the training labels:
#   com[i, j] = (# rows tagged with both i and j) / (# rows tagged with i)
tag_names = list(counts.keys())
tag_pos = {tag: k for k, tag in enumerate(tag_names)}

# Membership table: has_tag[r, k] is 1 when row r carries tag k.
# Built in one pass over the rows instead of re-scanning every row for every
# pair of tags.
has_tag = np.zeros((len(labels), len(tag_names)))
for r, row_tags in enumerate(labels.values):
    for tag in row_tags:
        has_tag[r, tag_pos[tag]] = 1

# pair_counts[i, j] = rows containing both tags; its diagonal holds the number
# of rows containing each single tag, which is the per-row normaliser.
pair_counts = has_tag.T @ has_tag
com = pair_counts / np.diag(pair_counts)[:, np.newaxis]

data = [go.Heatmap(z=com, x=tag_names, y=tag_names)]
layout = go.Layout(height=800, width=800, title='Co-occurence matrix of training labels')
fig = dict(data=data, layout=layout)
# Plot the whole figure so that the layout (size and title) is applied;
# passing only `data` discarded it.
py.iplot(fig, filename='train-com')

In [None]:
import cv2

# Show the first nine training rows as a 3x3 grid of images, each titled with
# its file name and its tags. Axes grid lines are switched off first.
plt.rc('axes', grid=False)
_, ax = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(20, 20))
for cell, (img_name, img_tags) in zip(ax.ravel(), df_train[:9].values):
    bgr = cv2.imread('./data/train-jpg/{}.jpg'.format(img_name))
    # OpenCV loads BGR; matplotlib expects RGB.
    cell.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    cell.set_title('{} - {}'.format(img_name, img_tags))

plt.show()

# Model Training

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os, sys
sys.path.append('./')
from fastai.conv_learner import *
from fastai.plots import *

In [None]:
# `path` is spliced directly into file names further down
# (e.g. f'{path}train_v2.csv'), so it must end with a separator;
# './data' produced './datatrain_v2.csv'.
path = './data/'  # folder containing the data
sz = 64  # image size
bs = 64  # batch size

## Evaluation Criteria and Initial Model 
If you read the evaluation criteria of the competition, you will see that it is based on the F2 score, so we define the metric for our model accordingly. You can find further information about the F2 score [here](https://clusteval.sdu.dk/1/clustering_quality_measures/5). 

For our initial model, we will use a pre-trained implementation of the deep residual network ResNet34, which was [made public by Microsoft](https://medium.com/r/?url=https%3A%2F%2Farxiv.org%2Fpdf%2F1512.03385.pdf) 

In [None]:
from sklearn.metrics import fbeta_score
import warnings

In [None]:
def _f2_by_threshold(preds, targs, start, end, step):
    """Score `preds` against `targs` at every threshold in the search grid.

    Returns a pair `(ths, scores)` where `ths` is
    `np.arange(start, end, step)` and `scores[k]` is the sample-averaged
    F-beta score (beta=2) of `preds > ths[k]`.
    """
    ths = np.arange(start, end, step)
    # fbeta_score can emit warnings during the sweep; they are silenced here,
    # as the original `f2` did.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # `beta` is passed by keyword: it is keyword-only in current
        # scikit-learn releases, and the keyword form works on old ones too.
        scores = [fbeta_score(targs, (preds > th), beta=2, average='samples')
                  for th in ths]
    return ths, scores


def f2(preds, targs, start=0.17, end=0.24, step=0.01):
    """Return the best F2 score over thresholds in `[start, end)`.

    `preds` is binarised with `preds > th` for each threshold `th` in
    `np.arange(start, end, step)`; the highest resulting score is returned.
    """
    _, scores = _f2_by_threshold(preds, targs, start, end, step)
    return max(scores)


def opt_th(preds, targs, start=0.17, end=0.24, step=0.01):
    """Return the threshold in `[start, end)` that maximises the F2 score.

    Uses the same threshold grid and scoring as `f2`; on ties the lowest
    threshold wins (first maximum).
    """
    ths, scores = _f2_by_threshold(preds, targs, start, end, step)
    return ths[np.argmax(scores)]

In [None]:
# Metric list handed to every learner: the threshold-searched F2 score
# defined above.
metrics=[f2]
# Architecture passed to the transforms and learners built below.
f_model = resnet34

In [None]:
# Prepare the validation split.
# os.path.join works whether or not `path` ends with a separator; plain string
# concatenation produced './datatrain_v2.csv'.
label_csv = os.path.join(path, 'train_v2.csv')
# Count the data rows (every line except the header), closing the file
# afterwards.
with open(label_csv) as label_file:
    n = sum(1 for _line in label_file) - 1  # total number of images
# Row indexes held out for validation (the original comment states this is
# 20% of the training set — fastai's default, not visible here).
val_idxs = get_cv_idxs(n)

In [None]:
def get_data(sz, bs):
    """Return the fastai data object for image size `sz` and batch size `bs`.

    Images are read from the `train-jpg` / `test-jpg` folders under `path`,
    labels from `train_v2.csv`, with top-down augmentations and the
    module-level `val_idxs` as the validation split.
    """
    tfms = tfms_from_model(f_model, sz, aug_tfms=transforms_top_down, max_zoom=1.05)
    # Join the CSV path properly: f'{path}train_v2.csv' breaks when `path`
    # has no trailing separator.
    return ImageClassifierData.from_csv(
        path, 'train-jpg', os.path.join(path, 'train_v2.csv'), bs, tfms,
        suffix='.jpg', val_idxs=val_idxs, test_name='test-jpg')


# Arguments in signature order (size first, then batch size); the original
# call passed them swapped, which only worked because both are 64.
data = get_data(sz, bs)

In [None]:
# Throw-away learner used only for the learning-rate search in the next cell.
mdl = ConvLearner.pretrained(f_model, data, metrics=metrics) 

### Finding the Learning Rate
The learning rate (LR) is one of the most important hyperparameters of your model. It determines how fast or slow your model will learn. If the LR is too high, the model will try to learn too fast and the loss function will not converge. If the LR is too low, your model will take too long to converge.

Finding a good learning rate with the fastai library is very easy: just use the following two commands. They will plot a graph of LR against the loss function; a good value for LR is where the slope of the loss function is steepest. As we can see, the slope is steepest between *0.1 and 1*, so you can use any value in this range. It is a good idea to experiment with a few values in this range to find the optimal one. After experimenting with three values, 0.2 seemed to work best for me.

In [None]:
# Run the learning-rate finder, then plot loss against learning rate.
lrf = mdl.lr_find()
mdl.sched.plot()

### Training the model 

The full size of the image chips in the competition is 256x256. We start training our model at 64x64 and will gradually increase the image size as training progresses. This is a very good technique to avoid overfitting.

In [None]:
# First training stage: a fresh pretrained learner on 64x64 images.
lr=0.2  # learning rate picked from the LR-finder plot
data = get_data(64, 64) #data generator for batch size=64, image size=64x64
learn = ConvLearner.pretrained(f_model, data, metrics=metrics)
# 3 cycles; with cycle_len=1 and cycle_mult=2 the cycles last 1, 2 and 4 epochs.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

When training the model, fastai implements a technique called **stochastic gradient descent with restarts (SGDR)**, which trains the model in cycles, where each cycle consists of one or more epochs. Each cycle starts with the original LR value and exponentially decreases the LR ([exponential learning rate schedule](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1)) as training progresses. The second parameter of fit denotes the total number of cycles. The number of epochs in each cycle is controlled by the two parameters *cycle_len* and *cycle_mult* as follows.

`number of epochs in first cycle = cycle_len 
 number of epochs in second cycle = number of epochs in previous(first) cycle x cycle_mult
 number of epochs in third cycle =  number of epochs in previous(second) cycle x cycle_mult`
 
 
Here is the graph that shows how the LR changes during each cycle of the above training.


In [None]:
# Plot the learning rate used over the course of the training run above.
learn.sched.plot_lr()

By default the fastai will freeze the weights of all of the layers except a few last layers and the ones that it adds to fine-tune the model for your dataset. So in above epochs, all of learning is done by those unfrozen last layers.
Next, we will unfreeze the weights of all of the layers to get more accuracy out of our model.

In [None]:
# Three learning rates, smallest first; per the text below, fastai applies
# them to three consecutive groups of layers.
lrs = [lr/9, lr/3, lr]
# Make every layer trainable, then train again with the per-group rates.
learn.unfreeze()
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

As you may have noticed, I have used an array for the LR instead of a single value. If you give fastai an array of 3 elements, it will divide the layers into 3 equal sets and use the corresponding value from the array for each set. Since we are using a pre-trained model, and the initial layers of a CNN usually learn simple features (like finding an edge, a corner, etc.), we don't want our initial layers to change too much; therefore we use the lowest LR for them. Higher layers in a CNN learn to find complex features (like geometrical patterns, faces, etc.), so a higher rate for them is a good idea, allowing them to adapt more rapidly to our data set.

In [None]:
# Plot the training loss recorded by the scheduler.
learn.sched.plot_loss()

In [None]:
# Train for image size 128x128 (batch size 64).
learn.set_data(get_data(128, 64))

# Stage 1: only the unfrozen head trains, with the single learning rate.
learn.freeze()
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

# Stage 2: all layers train. Use the differential rates `lrs` here, as in the
# 64x64 stage and the ensemble code — a single lr=0.2 on every layer would
# also apply the highest rate to the pretrained early layers.
learn.unfreeze()
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Train for image size 256x256 (batch size 64).
learn.set_data(get_data(256, 64))

# Stage 1: only the unfrozen head trains, with the single learning rate.
learn.freeze()
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

# Stage 2: all layers train. Use the differential rates `lrs` here, as in the
# 64x64 stage and the ensemble code — a single lr=0.2 on every layer would
# also apply the highest rate to the pretrained early layers.
learn.unfreeze()
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Plot the training loss recorded by the scheduler.
learn.sched.plot_loss()

fastai has another very good feature called Test Time Augmentation (TTA). The idea is simple: apply simple augmentations to each test image to generate five copies of it, and then make a prediction for each copy. You can average these predictions to get a significant (1-2%) decrease in error.
 
So we have trained our first model, let's see how well it performs on validation set using TTA.

In [None]:
# Compare the validation F2 score of plain predictions with TTA predictions.
probs = learn.predict() #returns prediction without TTA
f2_without_TTA =f2(probs, data.val_y)
# assumes TTA returns one prediction array per augmented copy, stacked on
# axis 0, plus the targets — TODO confirm against the fastai version in use
probs,y = learn.TTA()
probs = np.mean(probs, axis=0)  # average over axis 0
f2_with_TTA = f2(probs, y)
print(f"F2 Score without TTA:{f2_without_TTA},   and with TTA:{f2_with_TTA}")

In [None]:
# NOTE(review): `dd` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate stop for "Run All" before the
# save/load cells — confirm, or delete the cell.
dd

In [None]:
# Save the trained weights under the name 'resnet34.weights.1'.
learn.save('resnet34.weights.1')

In [None]:
# Reload the weights saved in the previous cell.
learn.load('resnet34.weights.1')

## Preparing our first submission

In our submission file, we need to place predicted labels against each image. Each image can belong to more than one class.

`file_10770,agriculture clear cultivation primary road
test_26732,agriculture clear cultivation haze primary`

If you look at an example of predictions from our validation set, you will see that our original labels are in the form of 1's and 0's, but our predictions are floating point numbers. So we need to pick a threshold above which a prediction is included in the submission file (0.66 for the example below). The `opt_th` function tries multiple thresholds in a given range and returns the one which maximizes the F2 score.

In [None]:
# First validation sample: TTA-averaged predictions, then its targets.
print(probs[0])
print(y[0])

In [None]:
# Threshold (from opt_th's default search range) that maximises F2 on the
# validation predictions.
threshold = opt_th(probs, y)
print(threshold)

In [None]:
%time test_preds, _ = learn.TTA(is_test=True)
preds = np.mean(test_preds, axis=0)
classes = np.array(data.classes)
res = np.array([" ".join(classes[(np.where(pp>threshold))]) for pp in preds])
filenames = np.array([os.path.basename(fn).split('.')[0] for fn in data.test_ds.fnames])
frame=pd.DataFrame(res, index=filenames, columns=['tags'])
frame.to_csv(f'{path}planet_amazon_restnet34_submission1.csv', index_label='image_name')

## Ensembling
Instead of training one model, we will train multiple models and then average their predictions. This technique is commonly employed to get more accuracy on a data set.

In [None]:
def _fit_frozen_then_unfrozen(learn):
    """Train `learn` with `lr` as it stands, then unfreeze and train with `lrs`."""
    learn.fit(lr, 2, cycle_len=1, cycle_mult=2)
    learn.unfreeze()
    learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)


def get_ensumble(nmodels):
    """Train `nmodels` learners with progressive resizing and return them.

    Model `i` uses the augmentation variant selected by `get_transform(i, ...)`
    and is trained at 64x64, 128x128 and 256x256 in turn. After training, its
    weights are saved as `ensem_model_{i}.weights` and its validation indexes
    as `models/ensem_model_{i}_validx.npz` under `path`.

    Returns the list of trained learners, in training order.
    """
    models = []

    for i in range(nmodels):
        print(f'-----Training model: {i+1}--------')
        # use 10% of train data as val data; the seed is the same on every
        # iteration, so the arguments (and presumably the split) are identical
        # for all models
        val_idx = get_cv_idxs(n, val_pct=0.1, seed=12345)

        data = get_data_ens(64, i, val_idx)
        learn = ConvLearner.pretrained(f_model, data, metrics=metrics)
        print('training for 64x64')
        _fit_frozen_then_unfrozen(learn)

        # Larger sizes reuse the same learner: swap the data, re-freeze, repeat.
        for img_sz in (128, 256):
            print(f'training for {img_sz}x{img_sz}')
            learn.set_data(get_data_ens(img_sz, i, val_idx))
            learn.freeze()
            _fit_frozen_then_unfrozen(learn)

        learn.save(f'ensem_model_{i}.weights')
        # Build the path with os.path.join: f'{path}models/...' is wrong when
        # `path` has no trailing separator. Create the folder if it is missing.
        models_dir = os.path.join(path, 'models')
        os.makedirs(models_dir, exist_ok=True)
        np.savez_compressed(os.path.join(models_dir, f'ensem_model_{i}_validx'), val_idx=val_idx)
        models.append(learn)
        print(f'-----Training of model {i+1} complete----')
    return models
        
    
def get_data_ens(img_sz, model_index, val_idx):
    """Return the data object for one ensemble member at one image size.

    Uses the transforms from `get_transform(model_index, img_sz)`, the
    module-level batch size `bs`, and `val_idx` as the validation split.
    """
    # Join the CSV path properly: f'{path}train_v2.csv' breaks when `path`
    # has no trailing separator.
    return ImageClassifierData.from_csv(
        path, 'train-jpg', os.path.join(path, 'train_v2.csv'), bs,
        get_transform(model_index, img_sz),
        suffix='.jpg', val_idxs=val_idx, test_name='test-jpg')

    
    
def get_transform(index, img_sz):
    """Return the transforms for ensemble member `index` at size `img_sz`.

    Five augmentation variants are available; `index` wraps around, so member
    5 gets the same variant as member 0, and so on. The chosen variant index
    and the image size are printed.
    """
    # Keyword arguments of each variant. Only the selected one is built,
    # instead of constructing all five pipelines and discarding four.
    variants = (
        dict(aug_tfms=transforms_basic, max_zoom=1.05),
        dict(aug_tfms=transforms_side_on, max_zoom=1.05),
        dict(aug_tfms=transforms_top_down, max_zoom=1.05),
        dict(aug_tfms=transforms_top_down, max_zoom=1.1),
        dict(aug_tfms=transforms_top_down, max_zoom=1.05, crop_type=CropType.RANDOM),
    )
    index = index % len(variants)
    print(f'get_transform--{index}: {img_sz}')
    # Use the module-level `f_model` (resnet34) so the transforms always match
    # the architecture the learners are built with.
    return tfms_from_model(f_model, img_sz, **variants[index])

    

In [None]:
# Train five ensemble members (one per augmentation variant) and time the run.
%time ens = get_ensumble(5)

### Preparing Submission

In [None]:
# Find the optimised threshold: score each ensemble member on its validation
# set with TTA, record its best threshold, and average those thresholds.
ens_val_probs = []
th_list = []
for mdl in ens:
    tta_val_probs, y = mdl.TTA()
    val_probs = np.mean(tta_val_probs, axis=0)
    acc = f2(val_probs, y)  # best F2 score over the default threshold range
    print (f'f2 Score: {acc}')
    th = opt_th(val_probs, y)
    th_list += [th]
    ens_val_probs += [val_probs]
print(th_list)
op_th = np.mean(th_list)
print(op_th)

In [None]:
# Prepare test predictions: average each member's TTA output over axis 0,
# then average the per-member results over the members.
per_model_probs = []
for mdl in ens:
    tta_test_probs, _ = mdl.TTA(is_test=True)
    per_model_probs.append(np.mean(tta_test_probs, axis=0))
ens_test_probs = np.mean(np.array(per_model_probs), axis=0)
ens_test_probs.shape

In [None]:
# Prepare the submission file from the ensemble-averaged test predictions.
classes = np.array(ens[0].data.classes)
# For every test image, keep the class names whose averaged score exceeds the
# averaged threshold and join them with spaces.
res = np.array([" ".join(classes[pp > op_th]) for pp in ens_test_probs])
# Image name = file name up to its first dot.
filenames = np.array([os.path.basename(fn).split('.')[0] for fn in ens[0].data.test_ds.fnames])
frame = pd.DataFrame(res, index=filenames, columns=['tags'])
# Join the output path properly: f'{path}planet_...' writes next to (not
# inside) the data folder when `path` has no trailing separator.
frame.to_csv(os.path.join(path, 'planet_amazon_restnet34_submission_ens.csv'), index_label='image_name')

## A Machine Learning model for calculating threshold
When preparing the submission, we used a single threshold of ~0.2 to select classes for all test images, but ideally each test image should have its own threshold, depending on the prediction values from the model. I experimented with training an ML model to find a better threshold but didn’t succeed. Here is the code.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
def get_optimized_threshold_list(x, y):
    """Return, for every sample, the threshold that best reproduces its labels.

    `x` and `y` are iterated in lockstep: each item of `x` holds one sample's
    predicted scores and the matching item of `y` its true labels. For each
    sample, thresholds from `np.arange(0.1, 0.4, 0.01)` are tried and the one
    giving the highest `accuracy_score` of `preds > th` is kept (the lowest
    threshold wins ties).

    NOTE(review): the criterion here is accuracy, not the F2 score used
    elsewhere in this notebook — confirm that is intended.
    """
    # The threshold grid does not depend on the sample; build it once.
    steps = np.arange(0.1, 0.4, 0.01)
    th_list = []
    for truth, preds in zip(y, x):
        acc = np.array([accuracy_score(truth, (preds > th)) for th in steps])
        th_list.append(steps[np.argmax(acc)])
    return th_list
                   
                   
def ml_model_threshold():
    """Fit and return a regressor that predicts a per-image threshold.

    Reads the arrays `probs` and `y` from `models/preds_probs.npz` under
    `path`, computes the best threshold for every sample with
    `get_optimized_threshold_list`, and fits a degree-2 polynomial Ridge
    pipeline mapping `probs` to those thresholds. The mean squared error on
    the training data is printed.
    """
    # Join the path properly (f'{path}models/...' breaks when `path` has no
    # trailing separator) and close the archive once both arrays are read.
    with np.load(os.path.join(path, 'models', 'preds_probs.npz')) as arr:
        x = arr['probs']
        y = arr['y']
    th_list = get_optimized_threshold_list(x, y)
    model = make_pipeline(PolynomialFeatures(2), Ridge())
    model.fit(x, th_list)
    print (mean_squared_error(th_list, model.predict(x)))
    return model

### Submitting Machine Learning predictions

In [None]:
# Fit the per-image threshold regressor.
model = ml_model_threshold()

In [None]:
# Build a submission in which every test image gets its own threshold,
# predicted by `model` from that image's scores in `preds`.
classes = np.array(data.classes)
op_th_ml = model.predict(preds)
# Pair each row of scores with its predicted threshold and keep the class
# names whose score exceeds it.
res = np.array([" ".join(classes[row > row_th]) for row, row_th in zip(preds, op_th_ml)])
# Image name = file name up to its first dot.
filenames = np.array([os.path.basename(fn).split('.')[0] for fn in data.test_ds.fnames])
frame = pd.DataFrame(res, index=filenames, columns=['tags'])
# Join the output path properly: f'{path}planet_...' writes next to (not
# inside) the data folder when `path` has no trailing separator.
frame.to_csv(os.path.join(path, 'planet_amazon_restnet34_submission2.csv'), index_label='image_name')