In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import time
import glob
import shutil
import json
import random
import datetime
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
import albumentations as A
# Seed every RNG the notebook draws from: NumPy, and Python's `random`,
# which the augmentation cell uses via random.choice.
np.random.seed(42)
random.seed(42)

In [None]:
# max image size for EfficientNet-B4; used as both the width and the height
# (in pixels) that images are resized to
MAX_IMAGE_SIZE = 380

# When True, only about 10 % of the dataset (one fold out of ten) is kept,
# for hyperparameter tuning
HPARAM_SET = False
H_FOLDS = 10 # Number of folds for the hparam subset split
FOLDS = 3 # number of folds for training set (not actually 33% of dataset: only ids with at least 3 images are split)
NUM_PICTURES = 5000 # NOTE(review): not referenced in the visible cells - confirm it is still needed

# Output the dataset as a kaggle dataset (higher memory limit)
KAGGLE_DATASET_NAME = 'happywhale-luke' # Name of the resulting dataset

USE_TRANSFORMATIONS = False # NOTE(review): not referenced in the visible cells - confirm it is still needed

TEST_MODE = False # Used for debugging

# Create a Kaggle Dataset

In [None]:
# Authenticate the Kaggle user so a dataset can be created directly from this notebook.
# The "kaggle" notebook secret is read back later as JSON with "username" and "key"
# entries, i.e. it is expected to hold the contents of a kaggle.json file.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
apitoken = user_secrets.get_secret("kaggle")

# Write the token to ~/.kaggle/kaggle.json, where the Kaggle CLI looks for it.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")
with open(kaggle_json_path, "w") as token_file:
    token_file.write(apitoken if apitoken.endswith("\n") else apitoken + "\n")

# Keep the credentials private to the current user; the Kaggle CLI warns when
# the key file is readable by others.
os.chmod(kaggle_json_path, 0o600)

In [None]:
# !rm -r /kaggle/working/tmp/{KAGGLE_DATASET_NAME}

# copy everything over to output folder
!cp ../input/happy-whale-and-dolphin/sample_submission.csv sample_submission.csv
!cp ../input/happy-whale-and-dolphin/train.csv train.csv

# found this code on Kaggle to create dataset in a notebook
BASE_PATH = f"/kaggle/working/tmp/{KAGGLE_DATASET_NAME}"
!mkdir -p BASE_PATH

with open('/root/.kaggle/kaggle.json') as f:
    kaggle_creds = json.load(f)

os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

!kaggle datasets init -p /kaggle/working/tmp/{KAGGLE_DATASET_NAME}

with open(f'/kaggle/working/tmp/{KAGGLE_DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)
dataset_meta['id'] = f'luketambakis/{KAGGLE_DATASET_NAME}'
dataset_meta['title'] = KAGGLE_DATASET_NAME
with open(f'/kaggle/working/tmp/{KAGGLE_DATASET_NAME}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)
print(dataset_meta)

!cp /kaggle/working/tmp/{KAGGLE_DATASET_NAME}/dataset-metadata.json /kaggle/working/tmp/{KAGGLE_DATASET_NAME}/meta.json
!ls /kaggle/working/tmp/{KAGGLE_DATASET_NAME}

!kaggle datasets create -u -p /kaggle/working/tmp/{KAGGLE_DATASET_NAME} 

# make directories for images
!mkdir {BASE_PATH}/test_images
!mkdir {BASE_PATH}/train_images

INPUT_PATH = "../input/happy-whale-and-dolphin/train_images"
DIR_NAME = "train_images"

!cp train.csv f"{BASE_PATH}/train.csv"

# Pre-Process Train CSV

In [None]:
train = pd.read_csv("./train.csv")

# Fix typos and merge duplicate labels in the "species" column.
# A single replace() is used instead of chained indexing
# (train["species"][mask] = ...), which may write to a temporary copy.
species_fixes = {
    "bottlenose_dolpin": "bottlenose_dolphin",
    "kiler_whale": "killer_whale",
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
}
train["species"] = train["species"].replace(species_fixes)

# Add attribute for presence of a dorsal fin
finless_species = {"beluga", "gray_whale", "southern_right_whale"}
train["finned"] = np.where(train["species"].isin(finless_species), "finless", "finned")

# Add attribute for suborder: baleen or toothed (anything not listed is "toothed")
baleen_species = {
    "humpback_whale", "minke_whale", "fin_whale", "blue_whale",
    "gray_whale", "southern_right_whale", "sei_whale", "brydes_whale",
}
train["suborder"] = np.where(train["species"].isin(baleen_species), "baleen", "toothed")

# Add attribute for family: monodontidae, balaenidae, balaenopteridae or
# delphinidae (the default for every species not listed below).
# NOTE(review): gray_whale is grouped with balaenopteridae, as in the original
# labels; it is usually placed in its own family - confirm this is intended.
species_to_family = {
    "beluga": "monodontidae",
    "southern_right_whale": "balaenidae",
    "humpback_whale": "balaenopteridae",
    "minke_whale": "balaenopteridae",
    "fin_whale": "balaenopteridae",
    "blue_whale": "balaenopteridae",
    "gray_whale": "balaenopteridae",
    "sei_whale": "balaenopteridae",
    "brydes_whale": "balaenopteridae",
}
train["family"] = train["species"].map(species_to_family).fillna("delphinidae")

# remove bad photo - has a clipboard in it, no whale; then renumber rows from 0
train = train[train["image"] != "cd5fe465c60cb9.jpg"].reset_index(drop=True)

# Split Data

In [None]:
if HPARAM_SET:
    # Split into H_FOLDS stratified folds and keep only fold 0 as a small
    # dataset for hyperparameter tuning.
    skf = StratifiedKFold(n_splits=H_FOLDS, random_state=42, shuffle=True)
    folds = np.zeros(len(train), dtype=np.uint8)
    for fold, (_, val_idx) in enumerate(skf.split(X=train, y=train.individual_id)):
        folds[val_idx] = fold
    train["folds"] = folds
    # .copy() so later column assignments on train2 do not target a slice of train
    train2 = train[train["folds"] == 0].copy()

else:
    # number of images per id, aligned row-by-row with train
    images_per_id = train.groupby("individual_id")["individual_id"].transform("size")

    # keep ids with at least FOLDS examples, so every fold can receive one of them
    df_temp = train[images_per_id >= FOLDS].copy()

    # split those rows into FOLDS stratified folds (labels 0 .. FOLDS-1)
    skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)
    folds = np.zeros(len(df_temp), dtype=np.uint8)
    for fold, (_, val_idx) in enumerate(skf.split(X=df_temp, y=df_temp.individual_id)):
        folds[val_idx] = fold
    df_temp["folds"] = folds

    # Combine with the full set: every other row stays in fold 0, so any
    # non-zero fold can be used for validation.
    # Only the "folds" column is written; DataFrame.update would rewrite every
    # shared column and can turn the integer fold labels into floats.
    train["folds"] = 0
    train.loc[df_temp.index, "folds"] = folds

In [None]:
# Augmentation pipeline: horizontal flip, hue/saturation/value shift and
# brightness/contrast jitter (each applied with probability 0.5), followed by
# a resize to MAX_IMAGE_SIZE x MAX_IMAGE_SIZE with bicubic interpolation.
augmentation_steps = [
    A.HorizontalFlip(p=0.5),
    A.HueSaturationValue(p=0.5),
    A.RandomBrightnessContrast(
        brightness_limit=0.10,
        contrast_limit=(-0.2, 0.2),
        p=0.5,
    ),
    A.Resize(height=MAX_IMAGE_SIZE, width=MAX_IMAGE_SIZE, interpolation=cv2.INTER_CUBIC),
]
transformations = A.Compose(augmentation_steps)

In [None]:
# Augment images - need 2 examples for each id in the hparam subset, so ids
# with fewer images get augmented copies. Then write the train CSV with image
# names pointing at the .bmp files stored in the dataset.
if HPARAM_SET:
    output_dir = os.path.join(BASE_PATH, DIR_NAME)
    os.makedirs(output_dir, exist_ok=True)

    # get counts of each ID in subset
    counts = train2.groupby(['individual_id']).size().reset_index(name='counts')
    new_rows = []  # one single-row DataFrame per augmented image

    id_count_pairs = zip(counts['individual_id'], counts['counts'])
    for individual_id, n_images in tqdm(id_count_pairs, total=counts.shape[0]):

        same_id = train2[train2['individual_id'] == individual_id]

        for i in range(2 - n_images):
            # pick random image from group of same id
            idx = random.choice(range(0, n_images))
            source_name = same_id['image'].values[idx]
            source_path = f"{INPUT_PATH}/{source_name}"
            image = cv2.imread(source_path)
            if image is None:
                # cv2.imread does not raise on a missing/unreadable file
                raise FileNotFoundError(f"Could not read image: {source_path}")

            # OpenCV loads BGR; the augmentations are applied on RGB
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            augmented = transformations(image=image)["image"]

            # come up with new image name
            new_name = f"{individual_id}_{i}.jpg"

            # Write to dataset in bmp format. cv2.imwrite expects BGR, so convert
            # back first - otherwise red and blue are swapped in the saved file.
            output_path = os.path.join(output_dir, new_name.split('.')[0] + ".bmp")
            augmented_bgr = cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)
            if not cv2.imwrite(output_path, augmented_bgr):
                raise IOError(f"Could not write image: {output_path}")

            # new entry: metadata of the first row of this id, with the new name
            new_rows.append(same_id.iloc[[0]].assign(image=new_name))

    # append all new entries at once (DataFrame.append was removed in pandas 2.0)
    if new_rows:
        train2 = pd.concat([train2, *new_rows], ignore_index=True)

    train2["image"] = train2["image"].str[:-3] + "bmp"
    train2.to_csv(os.path.join(BASE_PATH, "train.csv"), index=False)

else:
    train["image"] = train["image"].str[:-3] + "bmp"
    train.to_csv(os.path.join(BASE_PATH, "train.csv"), index=False)


# Alter Images and Output to Dataset

In [None]:
def copy_dir():
    """Copy every image listed in ``train`` into the dataset folder as .bmp.

    Each source .jpg is read from INPUT_PATH. If either side is larger than
    MAX_IMAGE_SIZE it is resized to MAX_IMAGE_SIZE x MAX_IMAGE_SIZE (aspect
    ratio is not preserved). The result is written to BASE_PATH/DIR_NAME with
    a .bmp extension.

    Raises:
        FileNotFoundError: if a source image cannot be read.
        IOError: if an output image cannot be written.
    """
    # INPUT_PATH already points at the train_images folder; joining DIR_NAME
    # onto it again would look for train_images/train_images.
    source_dir = INPUT_PATH
    output_dir = os.path.join(BASE_PATH, DIR_NAME)
    os.makedirs(output_dir, exist_ok=True)

    for image_name in tqdm(train['image'], total=len(train)):
        # names in ``train`` may already carry the .bmp extension; the source is .jpg
        stem = os.path.splitext(image_name)[0]

        image_path = os.path.join(source_dir, stem + ".jpg")
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread does not raise on a missing/unreadable file
            raise FileNotFoundError(f"Could not read image: {image_path}")

        if image.shape[0] > MAX_IMAGE_SIZE or image.shape[1] > MAX_IMAGE_SIZE:
            image = cv2.resize(image, (MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), interpolation=cv2.INTER_CUBIC)

        new_path = os.path.join(output_dir, stem + ".bmp") # output as bmp
        if not cv2.imwrite(new_path, image):
            raise IOError(f"Could not write image: {new_path}")

copy_dir()

In [None]:
# Add photos to dataset and change version
!ls /kaggle/working/tmp/{KAGGLE_DATASET_NAME}
version_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
!kaggle datasets version -m {version_name} -p /kaggle/working/tmp/{KAGGLE_DATASET_NAME} -r zip -q