In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.read_json('labels.json')

# Collapse each list of annotation labels into one comma-separated string, so that
# every distinct combination of labels is treated as a single class by the encoder.
data['annotations'] = data['annotations'].apply(lambda x: ','.join(x))

# Map every distinct annotation string to an integer class id.
# LabelEncoder numbers the classes in sorted order of the strings; the resulting
# id -> name mapping is available afterwards as encoder.classes_.
encoder = LabelEncoder()
numeric_labels = encoder.fit_transform(data['annotations'])

# Store the numeric class ids next to the textual annotations.
data['labels'] = numeric_labels

# Strip only a trailing ".jpg" extension.
# The pattern is escaped and anchored on purpose: depending on the pandas version,
# str.replace('.jpg', '') is interpreted as a regular expression in which "." matches
# any character, and in every version it removes matches anywhere in the name, not
# just at the end. The image loader derives its lookup key with os.path.splitext,
# which drops the final extension only, so both sides must agree.
data['image'] = data['image'].str.replace(r'\.jpg$', '', regex=True)
data


In [None]:
import numpy as np
from PIL import Image
import os

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
folder_path = (r'C:\Users\ghost\Desktop\STUDIA\Semestr\ZPD\Projekt\Warzywa')

# Image name (without extension) -> numeric label, built once so that every file is
# resolved with a single index lookup instead of scanning the whole frame per file.
# drop_duplicates keeps the first row for each image name, i.e. the same row a
# "first match" lookup on the frame would return.
label_by_image = data.drop_duplicates(subset='image').set_index('image')['labels']

image_list = []
label_list = []

# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and everything derived from it) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.jpg'):  # loading only files with .jpg extension
        continue

    image_name = os.path.splitext(filename)[0]
    if image_name not in label_by_image.index:
        # Fail with the offending file name instead of an opaque IndexError.
        raise KeyError(f"No label found in 'data' for image file {filename!r}")

    image_path = os.path.join(folder_path, filename)
    # The context manager closes the underlying file handle as soon as the pixels
    # have been copied into the numpy array.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')  # force 3 channels
        image = image.resize((250, 250))  # required image size
        image = np.array(image)  # transforming an image into a numpy array

    image_list.append(image)
    label_list.append(label_by_image.loc[image_name])  # assigning a label to an image

if not image_list:
    # Without this check the failure only shows up later, in the reshape/bincount cells.
    raise FileNotFoundError(f"No .jpg files found in {folder_path!r}")

image_array = np.array(image_list)  # stacked images: (n_images, 250, 250, 3)
label_array = np.array(label_list)  # one numeric label per image, same order as image_array

In [None]:
# Report the number of loaded images and the number of loaded labels.
for caption, loaded in (("Ilość obrazów: ", image_array), ("Ilość etykiet: ", label_array)):
    print(caption, len(loaded))

In [None]:
# None 0, Broccoli 1, White onions 2, Red onions 3, Garlic 4, Carrots 5
# (legend as recorded by the author; the authoritative id -> name mapping is
# encoder.classes_)
#
# class_counts[i] is the number of loaded images whose numeric label is i.
# minlength pins the result to one slot per class known to the encoder: without it
# np.bincount stops at the largest label actually present, so classes with the
# highest ids and no images would silently vanish from the output.
class_counts = np.bincount(label_array, minlength=len(encoder.classes_))
print(class_counts)

In [None]:
from sklearn.preprocessing import StandardScaler

# Flatten every image into one feature row: (n_images, height * width * channels).
image_array_flat = image_array.reshape((image_array.shape[0], -1))

# Standardize every column (one pixel/channel position) to zero mean and unit
# variance across all images.
scaler = StandardScaler()
image_array_scaled = scaler.fit_transform(image_array_flat)

# Restore the original image layout. The target shape is taken from image_array
# itself rather than repeating the 250x250x3 literals, so this cell keeps working
# if the resize dimensions in the loading cell are changed.
# NOTE(review): the augmentation cell iterates over image_array, not
# image_array_scaled - confirm whether the scaled data was meant to be used there.
image_array_scaled = image_array_scaled.reshape(image_array.shape)

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# definition of image generator
datagen = ImageDataGenerator(
        rotation_range=20,  # image rotation range in degrees
        width_shift_range=0.2,  # horizontal shift range
        height_shift_range=0.2,  # vertical shift range
        horizontal_flip=True,  # horizontal mirroring
        vertical_flip=True)  # vertical mirroring

# Number of augmented variants generated for every original image.
AUGMENTATIONS_PER_IMAGE = 1

# generator application: keep every original image and add its augmented variants
augmented_images = []
augmented_labels = []
for image, label in zip(image_array, label_array):
    augmented_images.append(image)
    augmented_labels.append(label)
    generated_images = 0
    # flow() yields (images, labels) batches indefinitely, so the loop has to be
    # left explicitly once enough variants have been collected for this image.
    for batch in datagen.flow(np.expand_dims(image, axis=0), np.array([label]), batch_size=1):
        # Both batch members carry a leading batch axis of length 1. Index it away
        # for the image as well as the label: otherwise the list mixes
        # (1, 250, 250, 3) entries with the (250, 250, 3) originals and cannot be
        # stacked into one regular array.
        augmented_images.append(batch[0][0])
        augmented_labels.append(batch[1][0])
        generated_images += 1
        if generated_images >= AUGMENTATIONS_PER_IMAGE:
            break
image_array_augmented = np.array(augmented_images)
label_array_augmented = np.array(augmented_labels)

In [None]:
# None 0, Broccoli 1, White onions 2, Red onions 3, Garlic 4, Carrots 5
# (legend as recorded by the author; the authoritative id -> name mapping is
# encoder.classes_)
#
# class_counts_augmented[i] is the number of samples (originals plus augmented
# variants) whose numeric label is i.
# minlength guarantees one slot per class known to the encoder, so the positions
# always line up with the legend even when the highest class ids have no samples.
class_counts_augmented = np.bincount(label_array_augmented, minlength=len(encoder.classes_))
print(class_counts_augmented)

In [None]:
from sklearn.model_selection import train_test_split

train_s = 0.7 # percentage of training data
val_s = 0.15  # percentage of validation data
test_s = 0.15  # percentage of test data

# Fixed seed so that Restart & Run All reproduces exactly the same partition.
SPLIT_SEED = 42

# Two-stage split: first carve out the test set, then divide the remainder into
# training and validation. The second test_size is rescaled so that the validation
# set ends up being val_s of the *whole* data set, not of the remainder.
# NOTE(review): the split runs on the augmented set, so an original image and its
# augmented variant can land in different subsets - consider splitting before
# augmenting to keep the test data independent.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    image_array_augmented, label_array_augmented,
    test_size=test_s, random_state=SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=val_s/(train_s+val_s), random_state=SPLIT_SEED)

# Calculating the statistics for each set
train_size = len(X_train)
val_size = len(X_val)
test_size = len(X_test)

num_classes_train = len(np.unique(y_train))
num_classes_val = len(np.unique(y_val))
num_classes_test = len(np.unique(y_test))

# One slot per class known to the encoder, so the three count vectors always have
# the same length and can be compared position by position even when a subset
# happens to contain no sample of the highest class ids.
total_classes = len(encoder.classes_)
class_counts_train = np.bincount(y_train, minlength=total_classes)
class_counts_val = np.bincount(y_val, minlength=total_classes)
class_counts_test = np.bincount(y_test, minlength=total_classes)

class_proportions_train = class_counts_train / train_size
class_proportions_val = class_counts_val / val_size
class_proportions_test = class_counts_test / test_size

# Displaying the results: the same five-line report for every subset.
for title, size, n_classes, counts, proportions in (
        ("TRENING Dataset:", train_size, num_classes_train, class_counts_train, class_proportions_train),
        ("VALID Dataset:", val_size, num_classes_val, class_counts_val, class_proportions_val),
        ("TEST Dataset:", test_size, num_classes_test, class_counts_test, class_proportions_test)):
    print(title)
    print("Number of images:", size)
    print("Number of classes:", n_classes)
    print("Number of images from each class:", counts)
    print("Class proportions:", proportions)