# Preprocessing the Dataset

In [1]:
import numpy as np
from PIL import Image
import pandas as pd

In [2]:
# Reading the csv file
# Metadata table for the images; later cells read its 'id' and 'usage' columns.
df = pd.read_csv("../2_SeparatingMaleDataset/maleDataset.csv")

# Directory of the images
# Image paths are built by plain string concatenation (<src_path><id>.jpg),
# so the trailing slash is required.
src_path = "../2_SeparatingMaleDataset/maleDatasetImages/"

In [3]:

# Split the image ids into usable and defective ones, then remove the metadata
# rows of every defective image from df.
defectedImagesIDs = []
goodImagesIDs = []
folderPathOfImage = src_path

# Ids rejected up front, without opening their image file.
knownBadIDs = {39403, 39410, 39401, 39425, 12347}
# Only images whose pixel array has exactly this shape are kept.
expectedShape = (80, 60, 3)

# Iterate over a snapshot of the ids so the loop never reads from a frame that
# is being modified, and avoid shadowing the builtin `id`.
for imageID in df['id'].tolist():
    if imageID in knownBadIDs:
        defectedImagesIDs.append(imageID)
        continue

    imagePathVariable = folderPathOfImage + str(imageID) + '.jpg'
    # The context manager closes the file handle once the pixels are copied
    # into the array; otherwise one handle per image stays open.
    with Image.open(imagePathVariable) as image:
        numpyArrayImg = np.array(image)

    if numpyArrayImg.shape == expectedShape:
        goodImagesIDs.append(imageID)
    else:
        defectedImagesIDs.append(imageID)

# Drop all defective rows in a single pass instead of scanning and rebuilding
# the frame once per defective id.
df.drop(df[df['id'].isin(defectedImagesIDs)].index, inplace=True)

In [4]:
# Report accepted images, rejected images, and their combined total (one per line).
goodCount = len(goodImagesIDs)
defectedCount = len(defectedImagesIDs)
for count in (goodCount, defectedCount, goodCount + defectedCount):
    print(count)

24761
304
25065


In [5]:
# Non-null entries per column of df after the defective rows were dropped;
# every column should match len(goodImagesIDs).
df.count()

id                    24761
gender                24761
masterCategory        24761
subCategory           24761
articleType           24761
baseColour            24761
season                24761
year                  24761
usage                 24761
productDisplayName    24761
dtype: int64

In [6]:
# Load the pixels of every accepted image into one array.
NumpyArrays = []
for goodID in goodImagesIDs:
    imagePathVariable = folderPathOfImage + str(goodID) + '.jpg'
    # Close each file as soon as its pixels are copied into the array.
    with Image.open(imagePathVariable) as image:
        # Append the array itself: wrapping it in a list would add a spurious
        # length-1 axis and turn the result into a 5D array.
        NumpyArrays.append(np.array(image))
# Every accepted image passed the (80, 60, 3) shape check, so the result has
# shape (number of images, 80, 60, 3).
df_images = np.array(NumpyArrays)
type(df_images)

numpy.ndarray

In [7]:
# Encoding the usage column into Casual and Formal
df_usage = df['usage']
df_usage.head()
df_usage.shape # (24761,)

(24761,)

In [8]:
# Defining Lambda Function to encode the values of the column
encoderFunction = lambda valueOfColumn: 1 if (valueOfColumn == 'Formal') else 0
df_usageEncoded = np.vectorize(encoderFunction)(df_usage)
print(type(df_usageEncoded))
df_usageEncoded.shape # (24761,)

<class 'numpy.ndarray'>


(24761,)

In [9]:
# Tally each label with a vectorised comparison over the encoded array.
formalCount = int(np.count_nonzero(df_usageEncoded == 1))
casualCount = int(np.count_nonzero(df_usageEncoded == 0))
print(f'formal count (1): {formalCount}') # 2,205 i.e. correct number of 1s (formal)
print(f'casual count (0): {casualCount}') # 22,556 i.e. correct number of 0s

formal count (1): 2205
casual count (0): 22556


So now we have two `numpy.ndarray` objects:
1) df_images: contains the pixel arrays of all accepted images
2) df_usageEncoded: contains the label of each image, encoded as 0 or 1

So df_images is the input x and df_usageEncoded is the target y.

# Implementing ANN

In [10]:
import os
import numpy as np
import tensorflow as tf
import sklearn

from tensorflow.keras.utils import array_to_img
from PIL import Image

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import TensorBoard

from time import strftime # gives hours and minutes of current time.

from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'tensorflow'

### Scaling and flattening the data

In [None]:
#Scaling the data between 0 and 1 (Normalization)
# Cast to float32 before dividing: plain division would promote the pixels to
# float64 and double the memory of this large array, while Keras uses float32
# by default anyway.
x_scaled = df_images.astype(np.float32) / 255.0

In [None]:
#Let us see the data again after scaling.
x_scaled
# Scaling is element-wise, so x_scaled has exactly the shape of df_images
# (one entry per image along axis 0); check x_scaled.shape for the rank.

In [None]:
# Flatten each image into one row: height * width * channels values per image.
TOTAL_INPUTS = 80 * 60 * 3
n_images = x_scaled.shape[0]
x_scaled_flat = x_scaled.reshape((n_images, TOTAL_INPUTS))

In [None]:
#Let us see how this flat array looks like
x_scaled_flat
#Now it is a 2D tensor: one row per image, TOTAL_INPUTS columns.

In [None]:
# Expected: (number of images, TOTAL_INPUTS)
x_scaled_flat.shape
#Note that 14400 = 80 x 60 x 3

### Creating train, test and validation Dataset

In [None]:
TEST_SIZE = 5000

# The test set is the leading TEST_SIZE rows of the flattened images and their labels.
test_rows = slice(None, TEST_SIZE)
x_test = x_scaled_flat[test_rows]
y_test = df_usageEncoded[test_rows]
x_test.shape

In [None]:
VAL_SIZE = 1000

##Creating validation set
# Take the VAL_SIZE rows that come right after the test rows. Slicing from 0
# would reuse the first test samples, so validation and test data would overlap.
# The training set starts at TEST_SIZE + VAL_SIZE, so the three sets are disjoint.
x_val = x_scaled_flat[TEST_SIZE:TEST_SIZE + VAL_SIZE]
y_val = df_usageEncoded[TEST_SIZE:TEST_SIZE + VAL_SIZE]
x_val.shape

In [None]:
# Everything after the rows reserved for the test and validation sets is used for training.
train_start = TEST_SIZE + VAL_SIZE
x_train = x_scaled_flat[train_start:]
y_train = df_usageEncoded[train_start:]
x_train.shape

So now we have three scaled and flattened datasets:
- The train set having 18761 samples
- The test set having 5000 samples
- The validation set having 1000 samples

## Define the Neural Network using Keras

### Model 1

In [None]:
# Fully connected classifier on the flattened pixels: three ReLU hidden layers
# followed by a softmax output over the usage classes.
model_1 = Sequential([
    Dense(units=128, input_dim=TOTAL_INPUTS, activation='relu', name='m1_hidden1'),
    Dense(units=64, activation='relu', name='m1_hidden2'),
    Dense(16, activation='relu', name='m1_hidden3'),
    # One output unit per class. The labels are only 0 (not formal) and
    # 1 (formal), so two units are needed; ten would leave eight classes that
    # never occur. This layer therefore has (16+1)*2 parameters.
    Dense(2, activation='softmax', name='m1_output')
])
# Explicit layer names keep them stable; auto-generated names change on every run.

# sparse_categorical_crossentropy takes the integer labels directly, and the
# softmax output keeps np.argmax-based prediction working.
model_1.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy', 
                metrics=['accuracy'])

In [None]:
# type(...) is not the last expression of the cell, so it is not displayed.
type(model_1)
# Prints the layers of model_1 with their output shapes and parameter counts.
model_1.summary()

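The summary reports trainable parameters (weights + biases) per layer, not neurons. Each layer has (inputs + 1) * units parameters, where the +1 is the bias:
- Total parameters in layer m1_hidden1 = (TOTAL_INPUTS+1)*128 = ((80*60*3)+1)*128 = 1843328
- Total parameters in layer m1_hidden2 = (128+1)*64  
- Total parameters in layer m1_hidden3 = (64+1)*16
- Total parameters in layer m1_output = (16+1)*n_out, where n_out is the number of units in m1_output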

## Tensorboard (visualising learning)

In [None]:
#Setting main folder and subfolders for tensorboard
LOG_DIR = 'tensorboard_cifar_logs/'

def get_tensorboard(model_name):
    """Create a log folder for one training run and return its TensorBoard callback.

    The folder is LOG_DIR/<model_name>_at_<HH_MM>, using the current local time.

    Args:
        model_name: label used as the prefix of the run's sub-folder name.

    Returns:
        A TensorBoard callback that writes its logs to the created folder.
    """
    sub_folder_name = f'{model_name}_at_{strftime("%H_%M")}'
    dir_paths = os.path.join(LOG_DIR, sub_folder_name)
    # exist_ok=True: the name only has minute resolution, so re-running the
    # cell within the same minute would otherwise raise FileExistsError.
    os.makedirs(dir_paths, exist_ok=True)
    return TensorBoard(log_dir=dir_paths)

### Loading tensor board in notebook

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Point TensorBoard at the same folder that LOG_DIR names, so runs created by
# get_tensorboard show up here.
%tensorboard --logdir=tensorboard_cifar_logs

## Fitting the Model

In [None]:
# Training hyper-parameters passed to model_1.fit below.
samples_per_batch = 1000  # batch_size: samples per gradient update
nr_epochs = 150  # epochs: full passes over the training set

In [None]:
%%time
model_1.fit(x_train, y_train, batch_size=samples_per_batch, epochs=nr_epochs,
            callbacks=[get_tensorboard('Model_1')], verbose=0, validation_data=(x_val, y_val))

## Making Predictions on Individual Images

- In the following code model_1 is used for prediction.
- You may use model_2 or model_3 too by making necessary alterations in the model name.

In [None]:
# Index of the validation sample used for the single-image prediction below.
image_nr=10
# One flattened image: a 1D vector of TOTAL_INPUTS values.
x_val[image_nr].shape

In [None]:
##Adding a dimension as per requirement of predict method
test = np.expand_dims(x_val[image_nr], axis=0)
test.shape

In [None]:
# Softmax output for the single sample: one row with one probability per output unit.
model_1.predict(test)

In [None]:
#Picking the highest probability class
predicted_value=np.argmax(model_1.predict(test), axis=1)
actual_value=y_val[image_nr]

print(f'Actual value: {actual_value} vs. predicted: {predicted_value[0]}')

## Evaluation

In [None]:
#Recalling the metrics that we set during compilation of the model.
model_1.metrics_names

In [None]:
# Let us print the loss funcstion value and overall accuracy of our model on test data.
test_loss, test_accuracy = model_1.evaluate(x_test, y_test)
print(f'Test loss is {test_loss:0.3} and test accuracy is {test_accuracy:0.1%}')