<a href="https://colab.research.google.com/github/JingshuoLi/cv_finetuned_existingmodel/blob/main/Bird_Classification_Resnet50_Transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while a download is streamed to disk.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; each archive is
# extracted under KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the URL is a signed link (X-Goog-Date=20240827T003747Z, X-Goog-Expires=259200),
# so it presumably stops working once that window has passed -- regenerate it if the download fails.
DATA_SOURCE_MAPPING = '100-bird-species:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F534640%2F5468571%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240827%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240827T003747Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D278df32c8cfb9ee2efb2134a0b5e8e59cff6fd04e4511017bfe45dc18e7360732461053812b316f32e339c1d483755abe4162975921a76e34943ffe6d71dccd94182db04bd7a52a194c478480d173445bb882aa9946e4d846b0a586f65f9ea0507e12aeafcb1337f09ec05f4ce6ef50ddd92869bf2e241309a97a70f5e8c85a73f95eaccf58bc62a47e41eaf73c23b2e23e702980fd4a86160900051819c7f36feabd07f52daf99f16e4832d2600e24963b402230f4166d3f7b7622d76b2d56656c2c5227a9852ab7c472115991297b1ca9b6a91d9d3ed3a217325e9ee2a8a7adec554f0f28b695c4c4dda6d389a4c784d5c414bb36748ef9738db0e62905c49'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A source that cannot be fetched is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split on the first colon only so a
    # literal ':' inside the URL part can never break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header is optional; without it only the running byte
            # count is shown instead of a progress bar.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                chunk = fileres.read(CHUNK_SIZE)
                if not chunk:
                    break
                dl += len(chunk)
                tfile.write(chunk)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing "tarfile" here would replace the
                # imported module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading 100-bird-species, 2099628449 bytes compressed
Downloaded and uncompressed: 100-bird-species
Data source import complete.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [None]:
#Read the csv file; the cells below use its "filepaths", "labels" and "data set" columns
data = pd.read_csv('/kaggle/input/100-bird-species/birds.csv')
print("The first five rows of the csv file are as shown")
print(data.head(5))
print("\n")

#Number of rows in the csv file
size = data.shape[0]
print(f"The size of the dataset is {size}")

#Check the number of classes in the dataset
NUM_CLASS = data['labels'].nunique()
print(f"The total number of classes in this dataset is {NUM_CLASS}")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/100-bird-species/birds.csv'

In [None]:
#Seed TensorFlow's global random generator
tf.random.set_seed(42)

#Hyperparameters shared by the input pipeline and the Resnet50 transfer-learning model
IMG_SIZE = IMG_H = IMG_W = 224   #Height and width every image is resized to
IMG_CHANNELS = 3
BATCH_SIZE = 32
BUFFER_SIZE = 1024               #Size of the shuffle buffer used for the training data
NUM_EPOCHS = 10
NUM_EXAMPLES = size              #Row count of the csv file

In [None]:
#Group the rows by the value of the "data set" column and pick out the three splits
split_data = dict(iter(data.groupby("data set")))
data_train, data_valid, data_test = (
    split_data[split_name] for split_name in ("train", "valid", "test")
)

In [None]:
#Create one label encoder that is shared by all three splits
label_encoder = LabelEncoder()

#Learn the class-name -> integer mapping from the training split only and reuse that same
#mapping for validation and test. Fitting the encoder again on each split would give every
#split its own mapping, so an integer could stand for different classes in different splits.
#transform() raises ValueError if a split contains a label that never occurs in training.
encoded_train = label_encoder.fit_transform(data_train["labels"])
encoded_valid = label_encoder.transform(data_valid["labels"])
encoded_test = label_encoder.transform(data_test["labels"])

#Convert these integer encoded labels into tf datasets
label_train = tf.data.Dataset.from_tensor_slices(encoded_train)
label_valid = tf.data.Dataset.from_tensor_slices(encoded_valid)
label_test = tf.data.Dataset.from_tensor_slices(encoded_test)

In [None]:
#Collect the image file paths of the train, validation and test splits as plain lists
filepaths_train, filepaths_val, filepaths_test = (
    split_frame["filepaths"].tolist()
    for split_frame in (data_train, data_valid, data_test)
)

#Wrap every list of paths in a tf dataset with one path string per element
dataset_train, dataset_val, dataset_test = (
    tf.data.Dataset.from_tensor_slices(path_list)
    for path_list in (filepaths_train, filepaths_val, filepaths_test)
)

In [None]:
def img_tensor_creation(filepath, data_root = "/kaggle/input/100-bird-species/"):
    """Load one image file and convert it into an input tensor for the Keras Resnet50 model.

    Args:
        filepath: string tensor holding the image path relative to data_root.
        data_root: prefix that is joined verbatim in front of filepath, so it has to end
            with a path separator. Defaults to the location of the Kaggle dataset.

    Returns:
        A float32 tensor of shape (IMG_SIZE, IMG_SIZE, 3), preprocessed with
        tf.keras.applications.resnet50.preprocess_input.
    """
    full_path = tf.strings.join([data_root, filepath])
    raw_bytes = tf.io.read_file(full_path) #Read the file into raw bytes

    #expand_animations=False makes decode_image always return a 3-D (height, width, channels)
    #tensor, so the image rank is known to the resize step below
    image_tensor = tf.image.decode_image(raw_bytes, channels = 3, expand_animations = False)

    #Keep the pixel values in the [0, 255] range: that is the input preprocess_input expects
    image_tensor = tf.cast(image_tensor, tf.float32)

    #Resize (with zero padding to keep the aspect ratio) to the input size used for Resnet50
    image_tensor = tf.image.resize_with_pad(image_tensor, IMG_SIZE, IMG_SIZE)

    #Apply the preprocessing that matches the Keras Resnet50 ImageNet weights. A [0, 1] scaling
    #followed by per-channel mean/std normalization does not match those weights.
    image_tensor = tf.keras.applications.resnet50.preprocess_input(image_tensor)

    return image_tensor

In [None]:
#Attach the image loading/preprocessing function to the train, validation and test datasets
dataset_train, dataset_val, dataset_test = (
    path_dataset.map(img_tensor_creation, num_parallel_calls = tf.data.experimental.AUTOTUNE)
    for path_dataset in (dataset_train, dataset_val, dataset_test)
)

print("Successful")

Successful


In [None]:
#Zip the dataset and the label together, so every element is an (image, label) pair.
#The datasets are passed as a single tuple: this form is accepted by tf.data.Dataset.zip in
#every TF 2.x release, whereas separate positional arguments only work on newer releases.
ds_train = tf.data.Dataset.zip((dataset_train, label_train))
ds_val = tf.data.Dataset.zip((dataset_val, label_valid))
ds_test = tf.data.Dataset.zip((dataset_test, label_test))

#Now the data is ready for the training step

In [None]:
#Training pipeline: shuffle with a buffer of BUFFER_SIZE elements, batch, then prefetch
train = (
    ds_train
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

#Validation pipeline: batch and prefetch only, no shuffling
val = ds_val.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
log_dir = "/kaggle/Output"

#Create a TensorBoard Callback function
#The callback does not accept the batch_size and write_grads arguments (passing them raises
#"TypeError: ... unexpected keyword argument"), so only the supported options are set
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir,
                                                      write_images = True,
                                                      write_graph = True
)

TypeError: TensorBoard.__init__() got an unexpected keyword argument 'batch_size'

In [None]:
#Defining the transfer learning model structure with part of the Resnet50 model unfreeze
def transfer_learning(train, val, learning_rate = 0.001):
    """Build, compile and train a Resnet50-based classifier with a partly unfrozen backbone.

    The first 80% of the Resnet50 layers stay frozen and the last 20% are fine-tuned together
    with a small classification head. Reads the module-level constants IMG_SIZE, IMG_CHANNELS,
    NUM_CLASS and NUM_EPOCHS.

    Args:
        train: batched tf.data dataset of (image, label) pairs used for training. The labels
            have to be integer class ids, because a sparse categorical loss is used.
        val: batched tf.data dataset of (image, label) pairs used for validation.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The trained tf.keras.Sequential model.

    Notes:
        Validation runs after every epoch and uses one batch of the validation data.
        Only the early stopping callback is applied; no TensorBoard callback is used, to keep
        the model simple.
    """
    #Firstly, import the model with its ImageNet weights and without the original classifier
    resnet = tf.keras.applications.ResNet50(input_shape = (IMG_SIZE, IMG_SIZE, IMG_CHANNELS),
                                            include_top = False,
                                            weights = "imagenet")

    #Set the trainable percentage of the entire Resnet50 structure
    trainable_percentage = 0.2

    #Index of the first layer that is fine-tuned (only the last 20% of the layers are trainable)
    num_layers = len(resnet.layers) #Count the number of layers
    first_trainable = int(num_layers - trainable_percentage*num_layers)

    #The backbone itself has to stay trainable: a model whose own trainable flag is False
    #exposes no trainable weights, even when some of its layers are switched back on.
    #So the whole backbone is enabled first and the leading layers are frozen afterwards.
    resnet.trainable = True
    for frozen_layer in resnet.layers[:first_trainable]:
        frozen_layer.trainable = False

    #Stack a classifier onto the Resnet50 model
    model_transfer = tf.keras.Sequential([
        resnet,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Flatten(), #No-op after the global pooling; kept so the layer layout is unchanged
        tf.keras.layers.Dense(64),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(NUM_CLASS,
                             activation = "softmax"),
    ])

    #Compile the model with Adam optimizer. The sparse loss is the variant that takes
    #integer class ids as labels; 'categorical_crossentropy' expects one-hot vectors.
    model_transfer.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
                          loss = 'sparse_categorical_crossentropy',
                          metrics = ["accuracy"])

    #Define an early callback, based on validation accuracy
    early_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_accuracy",
                                                      min_delta = 0.0001,
                                                      patience = 5
    )

    #steps_per_epoch is not passed: an epoch is one full pass over the finite training dataset
    model_transfer.fit(train,
                      epochs = NUM_EPOCHS,
                      validation_data = val,
                      validation_freq = 1,
                      validation_steps = 1,
                      callbacks = [early_callback])

    return model_transfer

In [None]:
#Build, compile and fit the transfer-learning model on the batched train/validation data,
#using the default learning rate of transfer_learning
model = transfer_learning(train, val)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Exception: URL fetch failure on https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5: None -- [Errno -3] Temporary failure in name resolution