# Set up a basic CNN to use for SSL later

In [8]:
# Imports

# Data Handling
import pandas as pd
import numpy as np
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

# Efficient Loops
import itertools

# Traceback for Diagnosis
import traceback

# Data Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from IPython.display import display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Statistics and Mathematics
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import shapiro, skew, anderson, kstest
import math

# Feature Selection
from sklearn.feature_selection import RFECV, SelectKBest, chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression

# Machine Learning Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# Preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer

# Model Selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RepeatedStratifiedKFold,  RepeatedKFold, TimeSeriesSplit

# Machine Learning Metrics
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,cohen_kappa_score,make_scorer,roc_curve,auc,accuracy_score,f1_score, precision_score,recall_score,confusion_matrix

# Machine Learning Regressors/Classifiers
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor, Ridge, Lasso, ElasticNet, LogisticRegression, RidgeClassifier
from sklearn.svm import SVR, NuSVR, NuSVC, LinearSVR, LinearSVC
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, StackingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

# Clustering algorithms
from sklearn.cluster import KMeans

# Fine Tuning
import optuna

# Randomizer
import random

# Encoder
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# OS
import os

# File Handling
import pickle

# Image Package
from PIL import Image

# Hide Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import Keras (Tensorflow)
from keras.models import Sequential # Neural Network model as a sequence of layers
from keras.layers import Conv2D # Convolutional Layer
from keras.layers import MaxPooling2D # MaxPooling Layer
from keras.layers import Flatten # Layer used to flatten 2D arrays for fully connected layers
from keras.layers import Dense # Layer adds fully connected layers to the neural network
from keras.layers import Dropout # Layer used to prevent overfitting by randomly setting a fraction of input units to 0 at each update during training time
from keras.layers import BatchNormalization # Layer used to normalize the activations of the neurons
from keras.layers import Activation # Layer used to apply an activation function to the output
from keras.callbacks import EarlyStopping, ModelCheckpoint # Classes used to save weights and stop training when improvements reach a limit
from keras.models import load_model # Function used to load a trained model
from keras.layers import Rescaling # Layer used to rescale the pixel values

import tensorflow as tf

## Set up the Notebook

In [5]:
# Configure Notebook
seed = 73
bg_color = "#EEF6FF"
paper_color = '#EEF6FF'

In [18]:
# Common Methods
def image_resizer(images):
    """
    This function resizes CIFAR-10 images to 32x32 pixels if they are not already in that size.
    """
    resized_images = []
    for img in images:
        if img.size != (32, 32):
            img = img.resize((32, 32), Image.ANTIALIAS)  # Resize non-32x32 images to 32x32
        resized_images.append(img)
    return resized_images

def plot_images_list(images, title, subtitle):
    """
    This function helps to plot a matric of images in a list
    """
    fig = sp.make_subplots(rows=3, cols=3)
    images = image_resizer(images)

    traces = []

    for i in range(min(9, len(images))):
        img = go.Image(z=np.array(images[i]))
        traces.append((img, i//3+1, i%3+1))

    fig.add_traces([trace[0] for trace in traces], rows=[trace[1] for trace in traces], cols=[trace[2] for trace in traces])
    fig.update_layout(
        title={'text': f'<b>{title} <br> <i><sub>{subtitle}</sub></i></b>', 'font': dict(size=22)},
        height=800,
        width=800,
        margin=dict(t=110, l=80),
        plot_bgcolor=bg_color, 
        paper_bgcolor=paper_color
    )

    fig.show()

In [7]:
# Use GPU if available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        print('\nGPU found! Using GPU...')
    except RuntimeError as e:
        print(e)
else:
    strategy = tf.distribute.get_strategy()
    print('Number of replicas: ', strategy.num_replicas_in_sync)

Number of replicas:  1


## Load and Explore the Data (EDA)

In [10]:
# Load data
data_dir = '../data/cifar-10-batches-py'

def load_cifar_batch(batch_filename):
    with open(batch_filename, 'rb') as f:
        batch = pickle.load(f, encoding='latin1')
    return batch


train_dir = [os.path.join(data_dir, f'data_batch_{i}') for i in range(1, 5)]
test_dir = os.path.join(data_dir, 'test_batch')
val_dir = os.path.join(data_dir, 'data_batch_5')

Exploration of the individual directory sizes and measure of degree of imbalance per class.

In [11]:
def count_class_distributions(data_batch):
    counts = np.zeros(10, dtype=int)
    for label in data_batch['labels']:
        counts[label] += 1
    return counts

def analyze_data(data_dir, dataset_name):
    print(f'Analyzing {dataset_name}')
    class_counts = np.zeros(10, dtype=int)
    for i, dir in enumerate(data_dir):
        data_batch = load_cifar_batch(dir)
        class_counts += count_class_distributions(data_batch)
    
    fig = go.Figure(
        data=[
            go.Bar(
                x=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'],
                y=class_counts)
            ]
        )
    
    fig.update_layout(
        title = f'Class Distribution of {dataset_name}',
        xaxis_title = 'Classes',
        yaxis_title = 'Count',
        xaxis_tickangle=-45,
    )

    fig.show()

analyze_data(train_dir, 'Training Data')
analyze_data([val_dir], 'Validation Data')
analyze_data([test_dir], 'Test Data')

Analyzing Training Data


Analyzing Validation Data


Analyzing Test Data


The largest imbalances are inside the validation set. As the balance is not perfect this may affect the outcome. 

CNNs require a fixed size for all images fed into it. 
The next check is for this requirement. If this condition is not met some sort of preprocessing will be necessary.

In [12]:
# Check image sizes
def check_image_sizes(data_dir, dataset_name):
    print(f'Checking {dataset_name}')
    
    size_count = defaultdict(int)

    for dir in data_dir:
        batch = load_cifar_batch(dir)
        images = batch['data'].reshape(-1, 3, 32, 32) #reshape to (num_images, channels, height, width)

        for image in images:
            image_size = tuple(image.shape[1:]) # Get the size of the image (height, width)
            size_count[image_size] += 1
    
    print(f'\n********** Image Sizes in {dataset_name} **********\n')
    for size, count in size_count.items():
        print(f'Image size {size} : {count}')

check_image_sizes(train_dir, 'Training Data')
check_image_sizes([val_dir], 'Validation Data')
check_image_sizes([test_dir], 'Test Data')

Checking Training Data

********** Image Sizes in Training Data **********

Image size (32, 32) : 40000
Checking Validation Data

********** Image Sizes in Validation Data **********

Image size (32, 32) : 10000
Checking Test Data

********** Image Sizes in Test Data **********

Image size (32, 32) : 10000


All images are of size 32x32 pixels.

Next some random images of each class are displayed to get a glimpse at the images of the dataset.

## Display random images

In [19]:
def select_random_images(data_dir, num_images_per_class):
    selected_images = {i: [] for i in range(10)} # 10 classes in CIFAR-10

    for dir in data_dir:
        batch = load_cifar_batch(dir)
        images = batch['data'].reshape(-1, 3, 32, 32) # Reshape to (num_images, channels, height, width)
        labels = batch['labels']

        for class_index in range(10):
            class_indices = [i for i, label in enumerate(labels) if label == class_index]
            random_indices = random.sample(class_indices, num_images_per_class)

            for idx in random_indices:
                selected_images[class_index].append(cifar_to_pil_image(images[idx]))
    
    return selected_images


def cifar_to_pil_image(image_array):
    image_array = np.transpose(image_array, (1, 2, 0)) # Reshape from (3, 32, 32) to (32, 32, 3)
    return Image.fromarray(image_array.astype('uint8'))


selected_images = select_random_images(train_dir, 3)
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for class_index in range(10):
    class_name = class_names[class_index]
    plot_images_list(selected_images[class_index], title=f'Random Images from CIFAR-10 Class {class_name}', subtitle=f'Class {class_name}')

## Preprocessing

In [25]:
def save_images_to_directory(images, labels, directory, class_names):
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for class_name in class_names:
        class_dir = os.path.join(directory, class_name)
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)
    
    for idx, (image, label) in enumerate(zip(images, labels)):
        class_name = class_names[label]
        image_path = os.path.join(directory, class_name, f'{idx}.png')
        tf.keras.preprocessing.image.save_img(image_path, image)

# Create Tensorflow Datasets
def prepare_data_directory(data_dir, save_dir, class_names):
    all_images = []
    all_labels = []

    for dir in data_dir:
        batch = load_cifar_batch(dir)
        imgs = batch['data'].reshape(-1, 3, 32, 32) # Reshape to (num_images, channels, height, width)
        imgs = np.transpose(imgs, (0, 2, 3, 1)) # Transpose to (num_images, height, width, channels)
        lbls = np.array(batch['labels'])

        all_images.append(imgs)
        all_labels.append(lbls)
    
    images = np.concatenate(all_images, axis=0)
    labels = np.concatenate(all_labels, axis=0)

    # Normalize pixel values
    images = images.astype('float32') / 255.0

    save_images_to_directory(images, labels, save_dir, class_names)

prepare_data_directory(train_dir, 'train_data_dir', class_names)
prepare_data_directory([val_dir], 'val_data_dir', class_names)
prepare_data_directory([test_dir], 'test_data_dir', class_names)

In [23]:
# Creating a Dataset for the Training Data
train = tf.keras.utils.image_dataset_from_directory(
    train_dataset, # the training data
    labels='inferred', # class labels are inferred from subdirectory names
    label_mode='categorical', 
    class_names=class_names,
    batch_size=16, # number of processed samples before updating the models weights
    image_size=(32, 32), # fixed dimensions for all images
    shuffle=True, 
    seed=seed, # random seed for shuffling and transformations
    validation_split=0, # no validation split as this is already done
    crop_to_aspect_ratio=True # resize images without aspect ratio distortion
)

TypeError: Expected binary or unicode string, got <_TensorSliceDataset element_spec=(TensorSpec(shape=(3, 32, 32), dtype=tf.float32, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>