In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preparation

In [None]:
# NOTE(review): tensorflow-determinism is never imported in the visible cells; presumably it
# accompanies the TF_DETERMINISTIC_OPS setting made in the imports cell -- confirm it is still needed.
!pip install tensorflow-determinism
# py7zr is used in the next cell to unpack the competition's .7z archive.
!pip install py7zr

### Unpack pictures

In [None]:
# Extract the Kaggle test archive into /kaggle/working/ using py7zr's command-line interface.
!python -m py7zr x ../input/cifar-10/test.7z /kaggle/working/
# Pure-Python alternative, kept for reference:
# import py7zr
# with py7zr.SevenZipFile("../input/cifar-10/test.7z", 'r') as archive:
#     archive.extractall(path="/kaggle/working/")


### Import libraries

In [None]:
from __future__ import print_function
import os
import pickle
import random

import cv2 as cv
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
os.environ['TF_DETERMINISTIC_OPS'] = '1' # credits Wojciech Bogucki

import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split 
import pandas as pd
import itertools

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


### Set seed and import data

In [None]:
def set_seed(seed=123):
    """Seed every random source the notebook uses with the same value.

    Args:
        seed: integer passed to Python's ``random``, NumPy and TensorFlow,
            and stored (as a string) in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
  
set_seed()

num_classes = 10

# Load dataset (different function than in google colab because of image rescaling)
data_dir = '/kaggle/working/dataset'
# exist_ok=True keeps the cell re-runnable without a bare `except` that would also
# hide real failures (e.g. permission errors) until tfds.load fails later.
os.makedirs(data_dir, exist_ok=True)

# Download/prepare CIFAR-10 under data_dir and get the train and test splits plus metadata.
(train_set, test_set), dataset_info = tfds.load(
    name="cifar10",
    split=["train", "test"],
    with_info=True,
    data_dir=data_dir,
)

def prepare_data(dataset, input_shape, create_val=True):
    """Resize, scale and one-hot encode a TFDS split, optionally carving out a validation set.

    Args:
        dataset: dataset accepted by ``tfds.as_numpy`` whose examples are dicts
            with an ``'image'`` array and a ``'label'`` entry.
        input_shape: target shape in the (height, width, channels) order used for
            the model input; only the first two entries are used here.
        create_val: when True, hold out 10% of the samples as a validation set
            (fixed ``random_state=123``).

    Returns:
        ``(x_train, x_val, y_train, y_val)`` when ``create_val`` is True,
        otherwise ``(x, y)``. Images are float32 arrays divided by 255; labels are
        one-hot encoded with the module-level ``num_classes``.
    """
    # cv.resize takes dsize as (width, height), i.e. the reverse of input_shape's
    # leading (height, width); passing input_shape[:2] directly would transpose
    # any non-square target size.
    target_size = (input_shape[1], input_shape[0])

    x = []
    y = []

    for example in tfds.as_numpy(dataset):
        new_img = cv.resize(example['image'], target_size, interpolation=cv.INTER_AREA)
        x.append(new_img)
        y.append(example['label'])

    x = np.asarray(x)
    y = np.asarray(y)

    # Normalize the data. First convert the data type to float for the computation
    x = x.astype('float32')
    x /= 255

    # Convert class vectors to binary class matrices. This is called one hot encoding
    y = keras.utils.to_categorical(y, num_classes)

    if create_val:
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=123)
        print(x_train.shape[0], 'train samples')
        print(x_val.shape[0], 'validation samples')
        # Quick sanity check: class counts over the first 10 samples of each part.
        print(y_val[:10].sum(axis=0) + y_train[:10].sum(axis=0))
        print('Shape of x: ', x.shape)
        return x_train, x_val, y_train, y_val
    else:
        print(x.shape[0], 'test samples')
        print(y[:10].sum(axis=0))
        return x, y


### Set some callbacks and plotting function

In [None]:
# define callbacks
def lr_schedule(epoch):
    """Return the learning rate for ``epoch`` using a two-step decay.

    The base rate is 1e-3; it is multiplied by 0.1 once ``epoch`` exceeds 20
    and by 0.01 once ``epoch`` exceeds 30.
    """
    base_lr = 1e-3
    if epoch > 30:
        return base_lr * 0.01
    if epoch > 20:
        return base_lr * 0.1
    return base_lr

# Learning-rate scheduler driven by lr_schedule.
# NOTE(review): lr_callback is not included in the callbacks list passed to fit()
# in the training cell -- confirm whether that is intentional.
lr_callback = LearningRateScheduler(lr_schedule)

# Keep only the weights of the epoch with the best validation accuracy on disk.
checkpoint = ModelCheckpoint(
    filepath='/kaggle/working/dataset/model2.h5',
    save_best_only=True,
    monitor='val_accuracy',
)

# Stop after 5 epochs without validation-accuracy improvement and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    restore_best_weights=True,
    patience=5,
)

# plot models
def plotmodelhistory(history):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: object whose ``history`` attribute is a mapping containing the
            keys ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    # One panel per metric: (history key, panel title, y-axis label).
    panels = (
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )
    for ax, (metric, title, ylabel) in zip(axs, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        ax.legend(['train', 'validate'], loc='upper left')
    plt.show()

### Function to load test images

In [None]:
# load test kaggle dataset
test_images_path = "/kaggle/working/test"

def load_test_images(input_shape, start, end):
    """Load one slice of the Kaggle test images, resized and scaled for the model.

    Args:
        input_shape: target shape in the (height, width, channels) order used for
            the model input; only the first two entries are used here.
        start: index of the first file (inclusive) in each directory's sorted file list.
        end: index one past the last file in each directory's sorted file list.

    Returns:
        ``(x, labels)`` where ``x`` is a float32 array of the images divided by 255
        and ``labels`` holds each file name up to its first ``'.'``.

    Raises:
        ValueError: if a file cannot be decoded as an image.
    """
    # cv.resize takes dsize as (width, height), the reverse of input_shape's leading pair.
    target_size = (input_shape[1], input_shape[0])

    test_images = []
    labels = []
    for dir_name, subdirs, filenames in os.walk(test_images_path):
        # os.walk yields entries in arbitrary order; sorting makes [start:end]
        # pick the same files on every call and in every session, so chunks
        # processed in separate runs neither overlap nor leave gaps.
        subdirs.sort()
        for filename in sorted(filenames)[start:end]:
            image_path = os.path.join(dir_name, filename)
            img = cv.imread(image_path)
            if img is None:
                # cv.imread signals failure by returning None instead of raising.
                raise ValueError("Could not read image: " + image_path)
            img = cv.resize(img, target_size, interpolation=cv.INTER_AREA)
            # Add the image to the dataset
            test_images.append(img)
            labels.append(filename.split('.')[0])

    x = np.asarray(test_images)
    x = x.astype('float32')
    x /= 255

    return x, labels


# Models

### Resize images

In [None]:
# Resize images
set_seed(123)
input_shape = (80, 80, 3)
# prepare_data returns (x_train, x_val, y_train, y_val) -- the same order as
# train_test_split -- so the targets must be unpacked in that order; otherwise
# the validation images end up bound to y_train and the training labels to x_val.
x_train, x_val, y_train, y_val = prepare_data(train_set, input_shape, create_val=True)
x_test, y_test = prepare_data(test_set, input_shape, create_val=False)

### Load model architecture

In [None]:
# create model
set_seed(123)
# change model in this line to one of tried
# efficientnet.EfficientNetB{i} (for i in range(8))
# inception_resnet_v2.InceptionResNetV2
# vgg19.VGG19
model = tf.keras.applications.efficientnet.EfficientNetB7(
    include_top=False,
    weights='imagenet',
    input_shape=(80, 80, 3),
)


# parameter not used but may be worth consideration in later experiments
# model.trainable = False

# Classification head stacked on the output of the backbone's last layer:
# global average pooling -> flatten -> 512-unit ReLU -> dropout -> 10-way softmax.
backbone_out = model.layers[-1].output
pooled = GlobalAveragePooling2D()(backbone_out)
flat = Flatten()(pooled)
hidden = Dense(512, activation='relu')(flat)
hidden = Dropout(0.5)(hidden)
predictions = Dense(10, activation='softmax')(hidden)

# Full trainable network: backbone input through to the softmax predictions.
myModel = Model(inputs=model.input, outputs=predictions)


### Compile model and set data augmentation

In [None]:
# Optimizer is built separately so its settings are easy to spot and tweak.
rmsprop = tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)
myModel.compile(
    optimizer=rmsprop,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Augmentations applied to the training images on the fly.
augmentation_options = dict(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    # shear_range=0.1,
    # zoom_range=0.1,
    # channel_shift_range=0.1,
    horizontal_flip=True,
)
datagen = ImageDataGenerator(**augmentation_options)

# NOTE(review): fit() is presumably only required for featurewise/ZCA options, none of
# which are enabled above -- confirm before removing this call.
datagen.fit(x_train)


### Train model (most time consuming part, should be done with GPU)

In [None]:
batch_size = 64
epochs = 40

# Iterator yielding augmented (image, label) batches from the training arrays.
augmented_train = datagen.flow(x_train, y_train, batch_size=batch_size)

# Train with early stopping and best-model checkpointing; validation uses the
# held-out, non-augmented arrays.
history = myModel.fit(
    augmented_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping, checkpoint],
)

plotmodelhistory(history)

### Compute score on test dataset

In [None]:
# Evaluate on the TFDS test split; prints whatever evaluate() returns for the
# compiled loss and metrics.
scores = myModel.evaluate(x=x_test, y=y_test)
print(scores)


# Final prediction
The prediction has to be run in two parts because of RAM limits. Between the two runs, switch which `for` loop line is active and change the output file name to the other of {submission2, submission1}.

In [None]:
# Restore the checkpointed weights before predicting on the Kaggle test images.
myModel.load_weights('/kaggle/working/dataset/model2.h5')
input_shape = (80, 80, 3)
# Position in this list corresponds to the class index predicted by the model.
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]


all_preds = []
all_ids = []
n = 20
# Number of test images handled per chunk (300000 images split into n chunks).
one_time = 300000 // n
for chunk_idx in range(n // 2, n):
# for chunk_idx in range(n // 2):
    print(chunk_idx)
    x_test_kaggle, ids = load_test_images(
        input_shape, one_time * chunk_idx, one_time * (chunk_idx + 1))
    class_idx = np.argmax(myModel.predict(x_test_kaggle), axis=1)
    all_preds.extend(class_names[k] for k in class_idx)
    all_ids.extend(ids)
# Drop the references to the last chunk so its memory can be reclaimed.
x_test_kaggle = None
ids = None

submissions = pd.DataFrame({"id": all_ids, "label": all_preds})

submissions.to_csv("/kaggle/working/dataset/submission2.csv", index=False)

### Read predictions together

In [None]:
# The prediction cell writes its two halves to submission1.csv and submission2.csv;
# read exactly those names (the previous "submissionN_.csv" names are never
# produced by this notebook, so a clean run failed with a missing file).
part_files = [
    "/kaggle/working/dataset/submission1.csv",
    "/kaggle/working/dataset/submission2.csv",
]
# ignore_index gives the merged frame one continuous index instead of two
# overlapping 0..N ranges; the CSV itself is written without the index.
sub = pd.concat([pd.read_csv(path) for path in part_files], ignore_index=True)
print(sub.shape)
sub.to_csv("/kaggle/working/dataset/submission.csv", index=False)
print(sub)

### Install kaggle, create kaggle json and upload results

In [None]:
# Install the Kaggle command-line client used by the submit command at the end of the notebook.
!pip install kaggle

In [None]:
# Create the credentials directory; exist_ok=True lets the cell be re-run
# without failing because the directory already exists.
kaggle_dir = '/root/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json, 'w') as f:
    # have to write your own credentials (can be found in account menu)
    # NOTE(review): never commit real credentials in the notebook; prefer a secret store.
    f.write('{"username":"your_username","key":"write_your_own_key"}')
# Restrict the API token to the current user (owner read/write only).
os.chmod(kaggle_json, 0o600)

In [None]:
# Upload the merged submission file to the cifar-10 competition with the given message.
!kaggle competitions submit -c cifar-10 -f /kaggle/working/dataset/submission.csv -m "Some message"