In [0]:
!pip install kaggle



In [0]:
# Fetch kaggle.json (the Kaggle API credentials) from the user's Google Drive
# and store it at /content/.kaggle/kaggle.json, readable by the owner only.
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

auth.authenticate_user()

drive_service = build('drive', 'v3')
results = drive_service.files().list(q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with an actionable message instead of an IndexError further down.
    raise FileNotFoundError("No file named 'kaggle.json' was found in Google Drive")

filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The first match is used when Drive holds several files with this name.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
# The context manager flushes and closes the file even if a chunk download fails.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
# The mode must be octal: decimal 600 is 0o1130, which is not "rw for owner only".
os.chmod(filename, 0o600)

Download 100%.


In [0]:
!cat .kaggle/kaggle.json

{"username":"<redacted>","key":"<redacted>"}

In [0]:
pwd

'/content'

In [0]:
ls --all

[0m[01;34m.[0m/  [01;34m..[0m/  sampleSubmission.csv  [01;34mtest1[0m/  [01;34mtrain[0m/


In [0]:
!mkdir ~/.kaggle

In [0]:
!cp .kaggle/kaggle.json ~/.kaggle/kaggle.json

In [0]:
!mkdir dataset_cat_dog

In [0]:
cd dataset_cat_dog

/content/dataset_cat_dog


In [0]:
# Download the files of the "dogs-vs-cats" Kaggle competition into the current
# working directory (requires the kaggle.json credentials to be in place).
!kaggle competitions download -c dogs-vs-cats

Downloading sampleSubmission.csv to /content/dataset_cat_dog
  0% 0.00/86.8k [00:00<?, ?B/s]
100% 86.8k/86.8k [00:00<00:00, 33.3MB/s]
Downloading test1.zip to /content/dataset_cat_dog
 97% 262M/271M [00:02<00:00, 142MB/s]
100% 271M/271M [00:02<00:00, 119MB/s]
Downloading train.zip to /content/dataset_cat_dog
100% 542M/543M [00:08<00:00, 62.6MB/s]
100% 543M/543M [00:08<00:00, 68.6MB/s]


In [0]:
ls

sampleSubmission.csv  test1.zip  train.zip


In [0]:
!unzip train.zip 

In [0]:
!unzip test1.zip  

In [1]:
rm test1.zip  train.zip

rm: cannot remove 'test1.zip': No such file or directory
rm: cannot remove 'train.zip': No such file or directory


In [0]:
ls train/ -U | head -4

dog.12165.jpg
dog.4091.jpg
cat.6046.jpg
cat.3400.jpg


## Separating the Data

The data we get from the Kaggle dataset is mixed: all the images of dogs and cats are in the same folder. Let’s separate them into two folders, one per class.



In [0]:
import os
import shutil


def seperateData(data_dir):
    """Move every ``<label>.<...>.jpg`` file in ``data_dir`` into ``data_dir/<label>/``.

    The label is the part of the filename before the first dot, so
    ``cat.6046.jpg`` ends up at ``<data_dir>/cat/cat.6046.jpg``.  Label folders
    are created on demand.  Sub-directories, files whose last dot-separated
    token is not ``jpg``, and files with no label before the first dot are left
    untouched.  Calling the function again on an already separated directory is
    a no-op.

    Args:
        data_dir: Directory holding the mixed image files.
    """
    # Only ``os``/``shutil`` from this cell are used, so the function does not
    # depend on imports made in later cells.
    for filename in os.listdir(data_dir):
        image_path = os.path.join(data_dir, filename)
        if not os.path.isfile(image_path):
            continue

        tokens = filename.split('.')
        # Need a non-empty label plus the extension; a file literally named
        # "jpg" or ".jpg" would otherwise collide with its own label folder.
        if len(tokens) < 2 or tokens[-1] != 'jpg' or not tokens[0]:
            continue

        label_dir = os.path.join(data_dir, tokens[0])
        os.makedirs(label_dir, exist_ok=True)
        # A move replaces the former copy-then-delete: one step, and no second
        # copy of the image is written when source and target share a filesystem.
        shutil.move(image_path, os.path.join(label_dir, filename))

In [0]:
# Sort the extracted training images into one sub-folder per filename prefix
# (the part of the name before the first dot).
seperateData("./train")

In [0]:
!ls ./train/cat

So by reading the filename, we can tell whether an image shows a dog or a cat.

We could now read all the images, store them in a Python list, and feed them to the network one by one. But loading the entire dataset into the program would occupy too much RAM (probably several GB), and we don’t even need all the images at the same time.

So we will create a class that loads only a few of those images at a time as a mini-batch (let’s say 20 images), and we train our network on that batch.



The generator then collects the next set of images from the folder and creates another mini-batch. This continues until we have gone through all the images in the folder.



# Let’s Create a Dataset Generator


In [0]:
import cv2 # to load the images
import numpy as np # to do matrix mnupulations 
from os.path import isfile, join # to manupulate file paths
from os import listdir # get list of all the files in a directory
from random import shuffle # shuffle the data (file paths)

In [0]:
class DataSetGenerator:
    """Serve labelled image mini-batches from a folder-per-class dataset.

    ``data_dir`` is expected to contain one sub-directory per class (for
    example ``train/cat`` and ``train/dog``).  The sub-directory names are the
    class labels and every ``.jpg`` file inside one is a sample of that class.
    Images are only read from disk while a mini-batch is being assembled, so
    the whole dataset never has to fit in memory.

    Attributes:
        data_dir: Root directory of the dataset.
        data_labels: Sorted class names; the position of a name is the index
            of the ``1`` in that class's one-hot label.
        data_info: One list of image paths per class, parallel to
            ``data_labels`` and shuffled within each class.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.data_labels = self.get_data_labels()
        self.data_info = self.get_data_paths()

    def get_data_labels(self):
        """Return the class names: the visible sub-directories of ``data_dir``.

        The names are sorted because ``listdir`` returns entries in arbitrary
        order, which would otherwise make the label -> one-hot index mapping
        differ between runs.  Hidden entries (such as the
        ``.ipynb_checkpoints`` folder notebooks create) are skipped so they are
        not mistaken for a class.
        """
        return sorted(
            name
            for name in listdir(self.data_dir)
            if not name.startswith('.') and not isfile(join(self.data_dir, name))
        )

    def get_data_paths(self):
        """Return one shuffled list of ``.jpg`` paths per class.

        The outer list is parallel to ``self.data_labels``: entry ``i`` holds
        the image paths of class ``i``.  Each list is sorted before shuffling
        so that the result depends only on the state of ``random``.
        """
        data_paths = []
        for label in self.data_labels:
            class_dir = join(self.data_dir, label)
            img_lists = sorted(
                join(class_dir, filename)
                for filename in listdir(class_dir)
                if filename.endswith('.jpg')
            )
            shuffle(img_lists)
            data_paths.append(img_lists)
        return data_paths

    def get_mini_batches(self, batch_size=10, image_size=(200, 200), allchannel=True):
        """Yield ``(images, labels)`` mini-batches until every image was served once.

        Each round takes the next image from every class that still has one,
        so a batch holds about ``batch_size // number_of_classes`` images per
        class.  Classes that run out early are skipped while the others keep
        going, and images left over after the last full batch are yielded as a
        final, smaller batch.  Files that ``cv2.imread`` cannot decode are
        skipped with a warning.

        Args:
            batch_size: Target number of images per batch over all classes.
                Values below the number of classes are treated as one image
                per class.
            image_size: ``(width, height)`` every image is padded and resized to.
            allchannel: Keep the channels returned by ``cv2.imread`` when true;
                otherwise convert to grayscale with a trailing channel axis
                of length 1.

        Yields:
            Tuple ``(images, labels)`` of ``uint8`` arrays: the stacked images
            and the matching one-hot labels, one row per image.
        """
        import warnings

        num_classes = len(self.data_labels)
        if num_classes == 0:
            return

        # Rounds (one image per class) per batch.  At least 1, so a batch_size
        # smaller than the class count cannot cause a modulo by zero below.
        each_batch_size = max(1, batch_size // num_classes)
        # Number of rounds needed to exhaust the largest class.
        rounds = max(len(paths) for paths in self.data_info)

        images = []
        labels = []
        for counter in range(rounds):
            for i, paths in enumerate(self.data_info):
                if counter >= len(paths):
                    # This class is exhausted; the others may still have images.
                    continue

                img = cv2.imread(paths[counter])
                if img is None:
                    # imread signals an unreadable/corrupt file by returning None.
                    warnings.warn("Skipping unreadable image: %s" % paths[counter])
                    continue

                img = self.resizeAndPad(img, image_size)
                if not allchannel:
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                    img = np.reshape(img, (img.shape[0], img.shape[1], 1))

                label = np.zeros(num_classes, dtype=int)
                label[i] = 1
                images.append(img)
                labels.append(label)

            # Hand out a batch after every ``each_batch_size`` rounds.
            if (counter + 1) % each_batch_size == 0 and images:
                yield np.array(images, dtype=np.uint8), np.array(labels, dtype=np.uint8)
                images = []
                labels = []

        # Do not lose the images collected after the last full batch.
        if images:
            yield np.array(images, dtype=np.uint8), np.array(labels, dtype=np.uint8)

    def resizeAndPad(self, img, size):
        """Zero-pad ``img`` to a square, then resize it to ``size``.

        Padding first keeps the aspect ratio of the picture: a landscape image
        gets equal black bars above and below, a portrait image left and right.

        Args:
            img: Image array of shape ``(height, width)`` or
                ``(height, width, channels)``.
            size: Target ``(width, height)``, in the order ``cv2.resize``
                expects its ``dsize`` argument.

        Returns:
            The padded image resized to ``size``.
        """
        h, w = img.shape[:2]
        # Unpack in cv2.resize's (width, height) order so the comparison below
        # matches the resize that is actually performed.
        sw, sh = size

        # INTER_AREA is better for shrinking, INTER_CUBIC for enlarging.
        if h > sh or w > sw:
            interp = cv2.INTER_AREA
        else:
            interp = cv2.INTER_CUBIC

        if h == w:
            new_img = img.copy()
        else:
            side = max(h, w)
            # Keep any trailing channel axis so 2-D and 3-D images both work.
            new_img = np.zeros((side, side) + img.shape[2:], dtype=np.uint8)
            top = (side - h) // 2
            left = (side - w) // 2
            new_img[top:top + h, left:left + w, ...] = img

        return cv2.resize(new_img, size, interpolation=interp)
                