## Imports

### modules

In [90]:
import os
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from os import path, listdir
import tensorflow as tf 
import matplotlib.pyplot as plt 

In [5]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Create the session with `allow_growth` enabled so GPU memory is requested
# on demand rather than all at once.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# Execute tf.function-decorated code eagerly (simpler to debug).
tf.config.run_functions_eagerly(True)

# Restrict TensorFlow to the first GPU when one is present. On a machine
# without a visible GPU the list is empty, and indexing it would raise
# IndexError, so the call is skipped in that case.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

### data

In [7]:
# Location of the labeled metadata CSV; its first column becomes the row index.
dataset_path = "../../data/meta/final_dataset_labeled.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path, index_col=0)
df

Unnamed: 0,name,car_name,car_type,is_test
1,00002_Acura TL Sedan 2012.jpg,Acura TL Sedan 2012,Midsize,0
2,00003_Dodge Dakota Club Cab 2007.jpg,Dodge Dakota Club Cab 2007,Large,0
3,00004_Hyundai Sonata Hybrid Sedan 2012.jpg,Hyundai Sonata Hybrid Sedan 2012,Midsize,0
4,00005_Ford F-450 Super Duty Crew Cab 2012.jpg,Ford F-450 Super Duty Crew Cab 2012,Large,0
6,00007_Dodge Journey SUV 2012.jpg,Dodge Journey SUV 2012,Midsize,0
...,...,...,...,...
26212,RamCVCargoVanMinivan201294.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26213,RamCVCargoVanMinivan201296.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26214,RamCVCargoVanMinivan201297.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1
26215,RamCVCargoVanMinivan201298.jpeg,Ram C/V Cargo Van Minivan 2012,Large,1


## Prepare data

### Split data into train, validation and test sets

In [8]:
def split2(dataset: pd.DataFrame, val: float, test: float, random_state=1) -> dict:
    """Split image names and labels into train, validation and test subsets.

    The split is done in two steps: first ``val + test`` of the rows are held
    out from the training data, then the held-out rows are divided between
    validation and test in the ratio ``val : test``.

    Args:
        dataset: Frame with a ``"name"`` column (image file names) and a
            ``"car_type"`` column (labels).
        val: Share of the data to place in the validation subset (> 0).
        test: Share of the data to place in the test subset (> 0).
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            split is reproducible. Defaults to 1, the value that was
            previously hard-coded.

    Returns:
        Dict with the keys ``"train_img"``, ``"train_labels"``, ``"val_img"``,
        ``"val_labels"``, ``"test_img"`` and ``"test_labels"``, each mapping to
        a NumPy array.

    Raises:
        ValueError: If ``val`` or ``test`` is not strictly positive.
    """
    images = dataset["name"].to_numpy()
    labels = dataset["car_type"].to_numpy()

    # Fail early with a readable message instead of an opaque error from
    # scikit-learn about an invalid ``test_size``.
    if val <= 0 or test <= 0:
        raise ValueError(
            f"val and test must both be positive, got val={val!r}, test={test!r}"
        )

    held_out_ratio = val + test

    # Step 1: separate the training data from everything that is held out.
    training_images, other_images, training_labels, other_labels = train_test_split(
        images, labels, test_size=held_out_ratio, random_state=random_state
    )

    # Step 2: divide the held-out part; ``test / (val + test)`` is the share
    # of the held-out rows that goes to the test subset.
    validation_images, testing_images, validation_labels, testing_labels = train_test_split(
        other_images,
        other_labels,
        test_size=test / held_out_ratio,
        random_state=random_state,
    )

    return {
        "train_img": training_images,
        "train_labels": training_labels,
        "val_img": validation_images,
        "val_labels": validation_labels,
        "test_img": testing_images,
        "test_labels": testing_labels,
    }

In [9]:
# 15% of the rows go to validation, 5% to test, the remainder to training.
sets = split2(dataset=df, val=0.15, test=0.05)

In [11]:
# Pull the training and validation subsets out of the split dictionary.
x_train, y_train = sets["train_img"], sets["train_labels"]
x_val, y_val = sets["val_img"], sets["val_labels"]

### load actual images

In [128]:
def preprocess_dataset(image: str, size: tuple = (256, 256)) -> tf.Tensor:
    """Load one image file and return it as a resized, rescaled RGB tensor.

    Args:
        image: Path of the image file to read.
        size: Target ``(height, width)`` passed to ``tf.image.resize``.

    Returns:
        The image resized to ``size`` with three channels and pixel values
        divided by 255.
    """
    raw_bytes = tf.io.read_file(image)
    # Force three channels so grayscale or RGBA files still match the RGB
    # shape declared below, and keep animated GIFs from decoding to a 4-D
    # (frames, height, width, channels) tensor.
    pixels = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
    # decode_image does not provide a static shape; declare the rank and
    # channel count so tf.image.resize can be applied.
    pixels.set_shape([None, None, 3])
    pixels = tf.image.resize(pixels, size)
    return pixels / 255.0

def generate_tf_dataset(file_names: np.ndarray, labels: np.ndarray, batch_size=32, categories=None):
    """Build a ``tf.data.Dataset`` of ``(image, one-hot label)`` pairs.

    Args:
        file_names: Image paths, one per sample. Each entry is handed to
            ``preprocess_dataset`` unchanged, so it must be readable by
            ``tf.io.read_file`` as given.
        labels: Class label of each sample, in the same order as
            ``file_names``.
        batch_size: Currently unused. The returned dataset is NOT batched,
            so callers must apply ``.batch(...)`` themselves. The parameter
            is kept so existing calls keep working.
        categories: Optional sequence of all class labels. When given, it
            fixes the one-hot column order and width, so datasets built from
            different splits (train / validation / test) share the same
            encoding even if a split lacks some class. When ``None`` the
            categories are inferred from ``labels`` alone.

    Returns:
        A dataset whose elements are ``(image_tensor, one_hot_label)`` tuples.
    """
    if categories is None:
        encoder = OneHotEncoder()
    else:
        # OneHotEncoder expects one list of categories per input feature.
        encoder = OneHotEncoder(categories=[list(categories)])

    # The encoder wants a 2-D input: one row per sample, one label column.
    label_column = np.asarray(labels).reshape(-1, 1)
    encoded_labels = encoder.fit_transform(label_column).toarray()
    label_dataset = tf.data.Dataset.from_tensor_slices(encoded_labels)

    # Files are read and decoded when the dataset is iterated, not here.
    image_dataset = tf.data.Dataset.from_tensor_slices(list(file_names))
    image_dataset = image_dataset.map(preprocess_dataset)

    return tf.data.Dataset.zip((image_dataset, label_dataset))

In [138]:
# Build the training dataset and show how many elements it contains.
train_data = generate_tf_dataset(file_names=x_train, labels=y_train)
len(train_data)



14669