Follow these steps to connect to Kaggle (the source of the datasets). You will need a Kaggle account and an API token (the `kaggle.json` file). Learn more [here](https://www.kaggle.com/general/74235).

In [None]:
# Install the Kaggle package, which provides the `kaggle` command used below
# (-q keeps pip's output quiet)
! pip install -q kaggle

In [None]:
# Upload your Kaggle API access file (kaggle.json) here.
# The following cells expect the uploaded file to be available as `kaggle.json`
# in the current working directory.
from google.colab import files

files.upload()

In [None]:
# Save the API access file to the correct place
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
# Set permissions: mode 600 makes the token readable and writable by its owner only
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Test if your Kaggle access works by asking the Kaggle client for a dataset listing
! kaggle datasets list

Prepare functions for dataset download and preprocessing

In [None]:
# Imports
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from skimage.color import rgb2lab, lab2rgb, rgb2gray
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from PIL import Image
import numpy as np
import os
import cv2

In [None]:
# Helper function for debugging
def show_image(image_path):
    """Read the image file at ``image_path`` and draw it with matplotlib."""
    plt.imshow(mpimg.imread(image_path))

In [None]:
# Crop images and resize them: the function first shrinks the image, then throws away
# images that are too small, and finally cuts a square (224x224 pixels by default)
# from the middle
def image_crop_resize(img_path, crop_size=224, target_size=270, min_size=250):
    """Return a centred square crop of the image at ``img_path``.

    Args:
        img_path: Path of the image file to open.
        crop_size: Side length, in pixels, of the square that is cut out.
        target_size: Length the shorter image side is scaled down to before
            cropping.
        min_size: Minimum width and height the (possibly shrunk) image must
            have. Keep it at least as large as ``crop_size``; the difference
            is the edge margin that is discarded.

    Returns:
        A new PIL image of ``crop_size`` x ``crop_size`` pixels, or ``None``
        when the image is too small (a 'problem with ...' line is printed in
        that case).
    """
    # The context manager releases the file handle on every path, including
    # the early return for images that are rejected.
    with Image.open(img_path) as image:
        width, height = image.size
        # Scale factor that brings the shorter side to `target_size`
        scale = max(target_size / width, target_size / height)

        # thumbnail() never enlarges, so an image that is already smaller than
        # the requested size keeps its original dimensions
        image.thumbnail((scale * width, scale * height))
        mid = (image.size[0] / 2, image.size[1] / 2)

        # Throw away images too small, also make sure that there is some extra
        # edge space before cropping to avoid potential watermarks
        if mid[0] < min_size / 2 or mid[1] < min_size / 2:
            print('problem with ' + img_path)
            return None

        half = crop_size / 2
        # crop() returns a new, fully loaded image, so it stays usable after
        # the source file has been closed
        return image.crop((mid[0] - half, mid[1] - half, mid[0] + half, mid[1] + half))

In [None]:
# Manages the image processing
def process_images(input_folder, out_putfolder):
    """Crop/resize every JPEG in ``input_folder`` and save the results.

    Each file whose name ends in ``.jpg``/``.jpeg`` (case-insensitive) is
    passed to ``image_crop_resize``. Successful results are written to
    ``out_putfolder`` as ``<index>.jpg``, where the index is the position of
    the source file in the sorted list of JPEG names. Skipped images still
    consume an index, so the numbering can contain gaps.

    Args:
        input_folder: Folder containing the source images.
        out_putfolder: Folder the processed images are written to; it is
            created if it does not exist.

    Returns:
        None. The number of skipped images is printed.
    """
    # Create the destination up front so a missing folder cannot abort the run
    if out_putfolder:
        os.makedirs(out_putfolder, exist_ok=True)

    # Match the real file extension rather than any name that merely contains
    # '.jpg'; sort because os.listdir() order is arbitrary and the output
    # numbering should be reproducible
    jpeg_names = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(('.jpg', '.jpeg'))
    )

    non_suitable = 0
    for i, name in enumerate(jpeg_names):
        to_save = image_crop_resize(os.path.join(input_folder, name))
        if to_save is None:
            non_suitable += 1
            continue
        to_save.save(os.path.join(out_putfolder, str(i) + '.jpg'))

    # Number of images that were not used because they were too small
    print('Images skipped: ' + str(non_suitable))

Download and preprocess the landscapes dataset

In [None]:
# Download the landscapes dataset from Kaggle
# (the archive landscape-pictures.zip is unpacked in a later cell)
! kaggle datasets download -d arnaud58/landscape-pictures

In [None]:
# Create a lanscapes dataset folder
! mkdir landscapes

In [None]:
# Unzip the landscapes dataset into the landscapes/ folder
! unzip landscape-pictures.zip -d landscapes/ 

In [None]:
# Create a folder for the prepared images
! mkdir landscapes_cropped

In [None]:
# Preprocess images: crop/resize everything in landscapes/ and write the results
# to landscapes_cropped/
process_images('landscapes/','landscapes_cropped/')

Download and preprocess the Flickr8k dataset

In [None]:
# Download the Flickr8k dataset from Kaggle
# (the archive flickr8k.zip is unpacked in the next cell)
! kaggle datasets download -d adityajn105/flickr8k

In [None]:
# Unzip the dataset into the flicker/ folder (unzip -d creates it if necessary);
# the images are later read from flicker/Images/
! unzip flickr8k.zip -d flicker/ 

In [None]:
# Create a folder for the prepared images
! mkdir flicker_cropped

In [None]:
# Process the images: crop/resize everything in flicker/Images/ and write the results
# to flicker_cropped/
process_images('flicker/Images/','flicker_cropped/') 

Convert datasets to X and Y matrices (L and A+B channels)

In [None]:
# Takes an image from a file and turns it into a np.array with the LAB channels
def rgb2Lab(img):
    """Load the image file ``img`` and return it converted to the LAB colour space.

    Args:
        img: Path of the image file to read.

    Returns:
        The array produced by ``cv2.cvtColor`` with channels ordered L, a, b.
        Per the OpenCV documentation, for 8-bit input all three channels are
        scaled/offset into the 0..255 range.

    Raises:
        ValueError: If OpenCV cannot read ``img`` as an image.
    """
    img_bgr = cv2.imread(img, cv2.IMREAD_COLOR)
    # imread signals failure by returning None instead of raising
    if img_bgr is None:
        raise ValueError('Could not read image: ' + str(img))
    # imread delivers channels in B, G, R order, so the BGR conversion code is
    # required; COLOR_RGB2LAB would swap red and blue before the transform.
    # NOTE(review): code that converts predictions back to an image must use
    # the matching direction (COLOR_LAB2BGR or COLOR_LAB2RGB) for its consumer.
    return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)

In [None]:
# Gets the LAB channels for all the images in a selected folder
def get_LAB(dataset_path):
    """Convert every entry of ``dataset_path`` to LAB and stack the results.

    ``dataset_path`` is used as a plain string prefix, so it has to end with a
    path separator. Entries are processed in the order ``os.listdir`` returns
    them. The result is a float ``np.array`` built from the per-image arrays.
    """
    per_image = [rgb2Lab(dataset_path + entry) for entry in os.listdir(dataset_path)]
    return np.array(per_image, dtype=float)

In [None]:
# Returns the X and Y matrices in the correct shape for the model
def get_channels(LAB):
    """Split a stacked LAB array into model input and target.

    ``LAB`` is indexed as a 4-D array whose last axis holds the channels.
    X is channel 0 with a trailing axis of length 1 added back; Y is every
    remaining channel (index 1 onwards).
    """
    first_channel = LAB[:, :, :, 0]
    # Indexing with None appends the trailing singleton axis
    return (first_channel[..., None], LAB[:, :, :, 1:])

Get X and Y matrices for the landscapes dataset

In [None]:
# Load every cropped landscape image as LAB data
ls_LAB = get_LAB('landscapes_cropped/')

In [None]:
# Split into X (first channel, L) and Y (remaining channels, A+B)
ls_X, ls_Y = get_channels(ls_LAB)

Get X and Y matrices for the Flickr8k dataset

In [None]:
# Load every cropped Flickr8k image as LAB data
fl_LAB = get_LAB('flicker_cropped/')

In [None]:
# Split into X (first channel, L) and Y (remaining channels, A+B)
# NOTE(review): `fl_x` is lower-case while the landscapes counterpart is `ls_X` --
# confirm that later cells use the same spelling before renaming.
fl_x, fl_Y = get_channels(fl_LAB)