# Dataset Import, Analyisis

Dataset: https://www.kaggle.com/moltean/fruits  
High quality dataset of images containing fruits

### Import libraries

In [10]:
import pandas as pd
import numpy as np
import os
import glob
import cv2
from tqdm import tqdm
from joblib import Parallel, delayed
import multiprocessing
import gc
import pickle

### Define constants

In [11]:
DATASET_PATH = "..\\fruits-360"

TRAIN_DATA = '..\\fruits-360\\Training'
VALIDATION_DATA = '..\\fruits-360\\Test'
TEST_MULTIPLE = '..\\fruits-360\\test-multiple_fruits'


# dimensions of images.
IMG_WIDTH, IMG_HEIGHT = 224, 224

FRUIT_IMAGES = []
LABELS = []


### Save Object Function (Pickle)

In [12]:
def save_object(obj, filename):
    """Write a python object to file system using pickle"""
    import pickle
    
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
        
        print("file:",obj.name,"written")
        
    return True

### Read Object Function (Pickle)

In [13]:
def read_object(filename):
    """Read a pickle object file from file system"""
    import pickle
    
    with open(filename, 'rb') as input:
        obj = pickle.load(input)
        
        print("Loaded:",obj.name)

### Get the fuits and labels into python objects

In [20]:
def countFruits(dataset_type = "training"):
    """Count how many fruit there are into the trainingset"""
    fruit_images = FRUIT_IMAGES
    labels = LABELS
    
    num_cores = multiprocessing.cpu_count()
    
    if dataset_type.strip().lower() == ("training" or "trainingset" or "dataset"):
        path = TRAIN_DATA
    elif dataset_type.strip().lower() == ("test" or "validation"):
        path = VALIDATION_DATA
    elif dataset_type.strip().lower() == "multiple":
        path = TEST_MULTIPLE
    else:
        print("Error, please specify a correct dataset type you want to count")
    
    count_images = 0
    print("Searching in the path:",path)
    
    for f in tqdm(glob.glob(os.path.join(path,"*"))):

        p = os.path.join(f,"*.jpg")

        for image_path in glob.glob(p):
            
            count_images += 1

    return count_images

In [15]:
def getFruits():
    """Get the fruits and labels obtained from the dataset into python objects"""
    
    fruit_images = FRUIT_IMAGES
    labels = LABELS
    path = TRAIN_DATA

    for f in tqdm(glob.glob(os.path.join(path,"*"))):

        p = os.path.join(f,"*.jpg")

        for image_path in glob.glob(p):

            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            fruit_images.append(image)
            labels.append(f)
    
    
    print("Image vector created, try to create np.array with images")
    fruit_images = np.array(fruit_images)
    labels = np.array(labels)
    
    
    # Save the memory to disk and free the memory
    
    save_object(fruit_images, '../tmp/fruit_images.pkl')
    save_object(labels, '../tmp/labels.pkl')
    
    #Free the memory
    gc.collect()
    
    print("Job finished")
    
    return True

In [23]:
number_images = countFruits("training")
print("number of images inside the trainingset:", number_images)

Searching in the path: ..\fruits-360\Training


100%|███████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 353.91it/s]


number of images inside the trainingset: 53177


In [22]:
number_images = countFruits("test")
print("number of images inside the validation dataset:", number_images)

Searching in the path: ..\fruits-360\Test


100%|███████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 786.27it/s]


number of images inside the validation dataset: 17845


In [24]:
number_images = countFruits("multiple")
print("number of images inside the validation dataset:", number_images)

Searching in the path: ..\fruits-360\test-multiple_fruits


100%|█████████████████████████████████████████████████████████████████████████████| 106/106 [00:00<00:00, 21205.58it/s]


number of images inside the validation dataset: 0


In [8]:
getFruits()

100%|████████████████████████████████████████████████████████████████████████████████| 103/103 [04:25<00:00,  2.63s/it]


Image vector created, try to create np.array with images
file written
file written
Job finished


True