<a href="https://colab.research.google.com/github/MohamedAhmed35/computer-vision/blob/main/object-detection/R-CNN/cat_dog_detection/01_data_preperation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; every path used by this
# notebook is located under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


# Preparing the data for the R-CNN project

## Import Resources

In [None]:
# Third-party and standard-library modules used by the cells below.
import cv2
import pandas as pd
import os, sys
import tensorflow as tf

# Extend the module search path with the project folder on Drive so the local
# `helper_functions` module can be imported (presumably helper_functions.py
# lives in that folder -- confirm). This must run after Drive is mounted.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/R-CNN')
import helper_functions as helper

## Load all the data

In [None]:
dataset_path = "/content/drive/MyDrive/Colab Notebooks/R-CNN/data/raw/VOC2007"

# Image-set files to read, in the exact order in which later cells index
# `all_image_labels`: [cat train, cat val, dog train, dog val].
IMAGE_SET_FILES = ["cat_train.txt", "cat_val.txt", "dog_train.txt", "dog_val.txt"]


def read_positive_image_ids(txt_file_path):
    """Return the image ids flagged with 1 in an image-set text file.

    Each non-blank line is expected to hold two whitespace-separated fields:
    an image id and an integer flag. A flag of 1 means the image contains the
    object the file is named after; every other flag value is ignored.

    Args:
        txt_file_path: Path of the image-set file to read.

    Returns:
        List of image ids (strings) whose flag equals 1, in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            its flag is not an integer.
    """
    positive_ids = []
    with open(txt_file_path, mode='r') as f:
        for line in f:
            fields = line.split()   # splits on any whitespace run
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            img_label, check_value = fields
            if int(check_value) == 1:
                positive_ids.append(img_label)
    return positive_ids


# One list of image ids per entry of IMAGE_SET_FILES, filled in below.
all_image_labels = []

for item in os.listdir(dataset_path):
    if 'Anno' in item:
        annot_dir = os.path.join(dataset_path, item)
    elif 'JPEG' in item:
        src_images_dir = os.path.join(dataset_path, item)
    elif "ImageSet" in item:
        imageSet_path = os.path.join(dataset_path, item)

        for root, _, files in os.walk(imageSet_path):
            # Only the 'Main' folder inside 'ImageSet' holds the per-class files
            if os.path.basename(root) != "Main":
                continue

            # Always keep one entry per file, even when a list is empty, so the
            # positional indices used by later cells can never shift.
            all_image_labels = [
                read_positive_image_ids(os.path.join(root, txt_file))
                for txt_file in IMAGE_SET_FILES
            ]

if len(all_image_labels) != len(IMAGE_SET_FILES):
    # Fail here with a clear message instead of a NameError/IndexError later.
    raise FileNotFoundError(
        f"No 'Main' image-set folder with {IMAGE_SET_FILES} was found under {dataset_path}"
    )

print(f"Length of cat Train set: {len(all_image_labels[0])}\nLength of cat Test set: {len(all_image_labels[1])}")
print(f"\nLength of dog Train set: {len(all_image_labels[2])}\nLength of dog Test set: {len(all_image_labels[3])}")

Length of cat Train set: 163
Length of cat Test set: 174

Length of dog Test set: 203
Length of dog Test set: 218


In [None]:
# Lay out the output tree under <data>/processed: one folder per split, each
# with a CSV_files sub-folder path reserved for the feature tables.
data_dir = "/content/drive/MyDrive/Colab Notebooks/R-CNN/data"
processed_dir = os.path.join(data_dir, "processed")

train_dir = os.path.join(processed_dir, "train_dir")
test_dir = os.path.join(processed_dir, "test_dir")

# Only the paths are defined here; these two folders are not created by this cell.
train_csv_path = os.path.join(train_dir, "CSV_files")
test_csv_path = os.path.join(test_dir, "CSV_files")

# Create whichever of the output folders is still missing.
for required_dir in (processed_dir, train_dir, test_dir):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

## Process all the images

In [None]:
# all_image_labels is ordered [cat train, cat test, dog train, dog test]
cat_labels = all_image_labels[:2]
dog_labels = all_image_labels[2:]

# Merge the cat and dog image ids of each split into a single list.
# NOTE(review): an image that contains both a cat and a dog appears twice in
# these lists -- confirm that processing it twice is intended.
train_labels = cat_labels[0] + dog_labels[0]
test_labels = cat_labels[1] + dog_labels[1]

# Enable OpenCV's optimized code paths and set its thread count
cv2.setUseOptimized(True)
cv2.setNumThreads(8)

# Process each split and save the returned DataFrame to CSV next to the split
# folders. The function is reached through the imported `helper` module: a bare
# name is undefined on a fresh kernel (assumes helper_functions defines it --
# confirm).
for split_dir, split_labels, csv_filename in (
    (train_dir, train_labels, "train_data.csv"),
    (test_dir, test_labels, "test_data.csv"),
):
    df = helper.process_images_and_annotations(src_images_dir, split_dir, annot_dir, split_labels)
    csv_name = os.path.join(processed_dir, csv_filename)
    df.to_csv(csv_name, index=False)

## VGG16

### Base Model

In [None]:
# Load the VGG16 network with its ImageNet-pretrained weights
# (the weights are downloaded on first use).
vgg16_builder = tf.keras.applications.VGG16
base_model = vgg16_builder(weights="imagenet")

# Freeze every layer so the pretrained weights are never updated
for layer in base_model.layers:
    layer.trainable = False

base_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 0us/step


### Classification model

In [None]:
# Drop the final layer of VGG16 by tapping the output of the layer before it
penultimate_layer = base_model.layers[-2]
x = penultimate_layer.output

# Feature extractor: same input as VGG16, output is the penultimate layer's
# activations (a 4096-long feature vector per image)
classification_model = tf.keras.models.Model(
    inputs=base_model.input,
    outputs=x,
)

classification_model.summary()

In [None]:
# Save the feature-extractor model to Drive for reuse.
classification_model_path = "/content/drive/MyDrive/Colab Notebooks/R-CNN/models/classification.keras"

# Create the target folder first so the save does not fail on a fresh Drive
# layout where "models" has not been created yet.
os.makedirs(os.path.dirname(classification_model_path), exist_ok=True)
classification_model.save(classification_model_path)

In [None]:
# Load the per-split region tables written by the image-processing cell
train_csv_file = os.path.join(processed_dir, "train_data.csv")
test_csv_file = os.path.join(processed_dir, "test_data.csv")

df_train = pd.read_csv(train_csv_file)
df_test = pd.read_csv(test_csv_file)

In [None]:
# Peek at five random rows of the training table
df_train.sample(n=5)

Unnamed: 0,image_name,x1,y1,x2,y2,IOU,class,object
3389,3389.png,463,199,500,244,0.0,2,background
14161,14161.png,270,128,284,162,0.0,2,background
10972,10972.png,78,165,98,193,0.02,2,background
7916,7916.png,156,88,375,124,0.05,2,background
11694,11694.png,236,188,249,207,0.0,2,background


In [None]:
# Peek at five random rows of the test table
df_test.sample(n=5)

Unnamed: 0,image_name,x1,y1,x2,y2,IOU,class,object
10272,10272.png,148,246,171,267,0.0,2,background
16086,16086.png,11,0,59,168,0.03,2,background
8107,8107.png,0,0,322,375,0.64,0,cat
15621,15621.png,236,211,265,228,0.0,2,background
2357,2357.png,277,60,353,221,0.19,2,background


#### Cat SVM classification

##### Extract feature vectors

In [None]:
# Extract features and labels for the cat train and test sets.
# The function is reached through the imported `helper` module: a bare name is
# undefined on a fresh kernel (assumes helper_functions defines it -- confirm).
train_features, train_labels = helper.extract_features_from_folder(train_dir, df_train, "cat", classification_model)
test_features, test_labels = helper.extract_features_from_folder(test_dir, df_test, "cat", classification_model)

# Build one table per split: the feature columns plus a "label" column
df_cat_train = pd.DataFrame(train_features)
df_cat_train["label"] = train_labels
df_cat_test = pd.DataFrame(test_features)
df_cat_test["label"] = test_labels

# The CSV output folders are not created anywhere else in the notebook, so make
# sure they exist before writing (to_csv does not create missing folders).
os.makedirs(train_csv_path, exist_ok=True)
os.makedirs(test_csv_path, exist_ok=True)

df_cat_train.to_csv(os.path.join(train_csv_path, "cat_train.csv"), index = False)
df_cat_test.to_csv(os.path.join(test_csv_path, "cat_test.csv"), index = False)

#### Dog SVM classification

##### Extract feature vectors

In [None]:
# Extract features and labels for the dog train and test sets.
# The function is reached through the imported `helper` module: a bare name is
# undefined on a fresh kernel (assumes helper_functions defines it -- confirm).
train_features, train_labels = helper.extract_features_from_folder(train_dir, df_train, "dog", classification_model)
test_features, test_labels = helper.extract_features_from_folder(test_dir, df_test, "dog", classification_model)

# Build one table per split: the feature columns plus a "label" column
df_dog_train = pd.DataFrame(train_features)
df_dog_train["label"] = train_labels
df_dog_test = pd.DataFrame(test_features)
df_dog_test["label"] = test_labels

# The CSV output folders are not created anywhere else in the notebook, so make
# sure they exist before writing (to_csv does not create missing folders).
os.makedirs(train_csv_path, exist_ok=True)
os.makedirs(test_csv_path, exist_ok=True)

df_dog_train.to_csv(os.path.join(train_csv_path, "dog_train.csv"), index = False)
df_dog_test.to_csv(os.path.join(test_csv_path, "dog_test.csv"), index = False)