# Set up

In [1]:
# General import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
from random import random
import scipy.signal
import cv2
import os

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# CNN import 
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras import datasets, layers, models, Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import LearningRateScheduler

In [None]:
pip freeze > requirements.txt  # Python2

In [6]:
# Paths: `data_path` is the input directory the pickled dataset is read from.
# NOTE(review): `submit_path` is not referenced anywhere in the visible notebook.
data_path = '../input/fall2021-inf8245e-machine-learning/'
submit_path = './'

In [3]:
# Check whether we have access to a GPU
# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

In [4]:
def fix_seed_random(seed_value):
    """
    Seed every random number generator used by this notebook.

    Sets PYTHONHASHSEED, seeds Python's `random`, NumPy and TensorFlow, then
    installs a single-threaded TF1-compat session as the Keras backend session.

    seed_value: integer seed shared by all generators.
    """
    # Imported locally on purpose: at notebook level the name `random` is bound
    # to the function `random.random` (see `from random import random`), not to the module.
    import random as py_random

    # 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
    # NOTE(review): changing it at runtime presumably only affects interpreters
    # started afterwards, not the hash seed of this kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # 2. Python built-in pseudo-random generator
    py_random.seed(seed_value)

    # 3. NumPy global pseudo-random generator
    np.random.seed(seed_value)

    # 4. TensorFlow pseudo-random generator (TF2 API and TF1-compat API)
    tf.random.set_seed(seed_value)
    tf.compat.v1.set_random_seed(seed_value)

    # 5. New global TensorFlow session limited to one thread per op pool
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)

fix_seed_random(0)

In [7]:
# Import the raw data (training images, training labels, test images)
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
X_raw = pd.read_pickle(data_path + "x_train.pkl")
y_raw = pd.read_pickle(data_path + "y_train.pkl")
x_test_raw = pd.read_pickle(data_path + "x_test.pkl")

# Data exploration

In [8]:
# Display one randomly chosen training image together with its label and size
i = int(random() * len(y_raw))  # `random` is the function from `from random import random`
print(i)
img = X_raw[i]
y = y_raw[i]
plt.imshow(img, interpolation='nearest')
plt.show()
img_height, img_width = img.shape  # two-value unpacking: the image is 2-D
print(f"Image of a {y} with the shape: {img_height}x{img_width}")

In [9]:
# Count how many pictures each class has in the raw training labels
unique, counts = np.unique(y_raw, return_counts=True)
result = np.column_stack((unique, counts))

# One row per class, so the row count is the number of classes
num_class = result.shape[0]
print(f"The dataset have {num_class} class")
print("Number of occ of each class in your data set")
for label, count in result:
  print(f"{label}: {count} pictures")

## Filter 

In [10]:
def mean_filter(img):
    """
    Smooth `img` with a 3x3 box kernel (every weight 1/9).

    return: 2-D array with the same shape as `img` (mode='same')
    """
    box_kernel = np.ones((3, 3)) / 9.0
    return scipy.signal.convolve2d(img, box_kernel, mode='same')

def median_filter(img):
    """
    Apply OpenCV's median blur with aperture size 3 to `img`.

    NOTE(review): cv2.medianBlur presumably restricts the accepted dtypes
    (e.g. uint8 / float32) - confirm before passing float64 normalized images.

    return: the blurred image as returned by cv2.medianBlur
    """
    return cv2.medianBlur(img, 3)

def gaussian_filter(img):
    """
    Blur `img` with a 20x20 Gaussian-shaped kernel and derive a sharpened image.

    The kernel value at (i, j) is exp(-((i-10)^2 + (j-10)^2) / 10). It is NOT
    normalized to sum to 1, so the blurred image is scaled by the kernel sum;
    this is kept as is because models were trained on these values.

    return: array stacking three images of the same shape as `img`:
            [0] blurred image ('Filtered'),
            [1] high components = img - blurred,
            [2] enhanced image = img + 0.025 * high components
    """
    # Distance of each of the 20 kernel rows/columns from the kernel centre (index 10)
    offsets = np.arange(20) - 10
    Hg = np.exp(-(offsets[:, None] ** 2 + offsets[None, :] ** 2) / 10)

    gaussian_blur = scipy.signal.convolve2d(img, Hg, mode='same')
    gray_high = img - gaussian_blur
    gray_enhanced = img + 0.025 * gray_high

    return np.array([gaussian_blur, gray_high, gray_enhanced])

def edge_filter(img):
    """
    Sobel-style edge detection.

    return: array stacking [Gx, Gy, G], where Gx and Gy are the responses to the
            two 3x3 kernels below and G = sqrt(Gx^2 + Gy^2) is the magnitude.
    """
    kernel_x = np.array([[1, 0, -1],
                         [2, 0, -2],
                         [1, 0, -1]], dtype=np.float32)
    kernel_y = np.array([[-1, -2, -1],
                         [0, 0, 0],
                         [1, 2, 1]], dtype=np.float32)

    grad_x = scipy.signal.convolve2d(img, kernel_x, mode='same')
    grad_y = scipy.signal.convolve2d(img, kernel_y, mode='same')
    magnitude = (grad_x * grad_x + grad_y * grad_y) ** 0.5

    return np.array([grad_x, grad_y, magnitude])

In [12]:


def _show_panels(suptitle, panels):
    """Draw `panels` - a list of (title, image) pairs - side by side under one figure title."""
    # plt.subplots already creates its figure; a separate plt.figure() call
    # would only leave an extra empty figure in the output.
    fig, axes = plt.subplots(1, len(panels))
    fig.suptitle(suptitle, fontsize=14)
    for ax, (title, image) in zip(axes, panels):
        ax.imshow(image)
        ax.set_title(title)
    plt.show()


# Fixed sample so the figures are reproducible; uncomment for a random image
# i = int(random() * len(y_raw))
i = 259
img = X_raw[i]

# Mean filter
filtered_img = mean_filter(img)
_show_panels("Mean filter", [('Original', img), ('Mean filter', filtered_img)])

# Median filter
filtered_img = median_filter(img)
_show_panels("Median filter", [('Original', img), ('Median filter', filtered_img)])

# Gaussian filter
imgs = gaussian_filter(img)
_show_panels("Gaussian filter", [('Filtered', imgs[0]), ('High Components', imgs[1]), ('Enhanced', imgs[2])])

# Edge filter
imgs = edge_filter(img)
_show_panels("Edge filter", [('Horizontal', imgs[0]), ('Vertical', imgs[1]), ('Filtered', imgs[2])])

# Data Pre-processing 

In [13]:
# Lookup tables to convert a class name to its integer id (id_ref) and back (name_ref)
id_ref = {
    'big_cats': 0,
    'butterfly': 1,
    'cat': 2,
    'chicken': 3,
    'cow': 4,
    'dog': 5,
    'elephant': 6,
    'goat': 7,
    'horse': 8,
    'spider': 9,
    'squirrel': 10,
}
# Inverse mapping, derived so the two tables cannot drift apart
name_ref = {class_id: class_name for class_name, class_id in id_ref.items()}

# 
def find_original_y(Y):
    """
    Convert one-hot rows back to their original integer class ids.

    Y: 2-D array-like, one one-hot row per sample.
    return: 1-D array holding, for each row, the index of its entry equal to 1.
    raises: ValueError if Y is not 2-D or a row does not contain exactly one 1.
            The per-row check also catches a row with two ones offset by a row
            with none, which a total-length comparison would let through.
    """
    onehot = np.asarray(Y)
    if onehot.size == 0:
        return np.array([])

    hot = onehot == 1
    if onehot.ndim != 2 or not (hot.sum(axis=1) == 1).all():
        raise ValueError("Each row of Y must contain exactly one entry equal to 1")
    return hot.argmax(axis=1)

In [14]:
# Useful functions for image processing
def process_image_flatt(img: np.ndarray):
    """
    Divide the pixel values by 255, then flatten the image to 1-D.
    """
    scaled = img / 255
    return scaled.flatten()


def process_image(img: np.ndarray):
    """
    Divide the pixel values by 255; the shape of the image is kept.
    """
    scaled = img / 255
    return scaled

def resaphe(x):
    """
    Flatten a 3-D array (samples, rows, cols) to 2-D (samples, rows * cols).
    """
    n_samples, n_rows, n_cols = x.shape
    flat_shape = (n_samples, n_rows * n_cols)
    return x.reshape(flat_shape)

def process_images(imgs, process_func=process_image_flatt):
    """
    Apply `process_func` to every image of `imgs` and stack the results in one array.
    """
    return np.array([process_func(one_img) for one_img in imgs])

In [15]:
# Process the images: scale pixel values by 1/255 (process_image), keeping the 2-D shape

#### MODIFICATION HERE TO ADD FILTER ####
# Uncomment ONE of the two lines below (instead of the active one) to train on
# edge-filtered or Gaussian-enhanced images; index [2] is the last output of each filter.
# process_X =  np.array([edge_filter(img)[2] for img in process_images(X_raw, process_image)])
# process_X =  np.array([gaussian_filter(img)[2] for img in process_images(X_raw, process_image)])
process_X =  np.array([img for img in process_images(X_raw, process_image)])
process_X.shape

# Process y: map every class name to its integer id
process_y = np.array([id_ref[name] for name in y_raw])
# Encode the y vector in one-hot format
# NOTE(review): `num_class` comes from the class-count cell and is reassigned by
# later cells; re-running this cell out of order may use a different value.
encoder = LabelEncoder()
encoder.fit(process_y)
encoded_Y = encoder.transform(process_y)
process_Y_onehot = to_categorical(encoded_Y,num_classes=num_class)

# Create a train and a validation set (upper-case X / Y because they have more than one dimension)
# Partition: 80% train, 20% held out
X_train, X_valid, Y_train, Y_valid = train_test_split(process_X, 
                                                      process_Y_onehot, 
                                                      test_size=0.20,  
                                                      random_state=1)

# Keep 12% of the held-out data aside to evaluate the final model for the report
X_valid, X_valid_2, Y_valid, Y_valid_2 = train_test_split(X_valid, Y_valid, test_size=0.12, random_state=1)

# Integer class ids (lower-case y) recovered from the one-hot targets
y_valid = find_original_y(Y_valid)
y_train = find_original_y(Y_train)
y_valid_2 = find_original_y(Y_valid_2)

In [16]:
# Class distribution of the validation split
unique, counts = np.unique(y_valid, return_counts=True)

result = np.column_stack((unique, counts))
# The global `num_class` is deliberately NOT reassigned here: it sizes the
# one-hot encoding and must keep the class count of the full dataset.
print(f"VALIDATION SET: {len(y_valid)} datapoint")
print(f"The dataset have {len(result)} class")
print("Number of occ of each class in your data set")
for r in result:
  print(f"{r[0]}: {r[1]} pictures")

In [17]:
# Class distribution of the training split
unique, counts = np.unique(y_train, return_counts=True)

result = np.column_stack((unique, counts))
# The global `num_class` is deliberately NOT reassigned here: it sizes the
# one-hot encoding and must keep the class count of the full dataset.
print(f"TRAINING SET: {len(y_train)} datapoint")
print(f"The dataset have {len(result)} class")
print("Number of occ of each class in your data set")
for r in result:
  print(f"{r[0]}: {r[1]} pictures")

In [18]:
# Class distribution of the small set held out for the report (y_valid_2)
unique, counts = np.unique(y_valid_2, return_counts=True)

result = np.column_stack((unique, counts))
# The global `num_class` is deliberately NOT reassigned here: this small set may
# miss a class, and `num_class` sizes the one-hot encoding of the full dataset.
print(f"REPORT SET: {len(y_valid_2)} datapoint")
print(f"The dataset have {len(result)} class")
print("Number of occ of each class in your data set")
for r in result:
  print(f"{r[0]}: {r[1]} pictures")

# Testing function

In [20]:
def f1_bench(clf, X, y, print_=True):
    """
    Micro-averaged F1 score of `clf.predict(X)` against `y`.

    print_: when True the score is printed and nothing is returned;
            when False the score is returned without printing.
    """
    score = f1_score(y, clf.predict(X), average='micro')
    if not print_:
        return score
    print(f"f1 score: {score}")
    
def f1_bench_nn(clf, X, y, print_=True):
    """
    Micro-averaged F1 score for a model whose `predict` returns one score vector
    per sample; the predicted class is the argmax of each vector.

    print_: when True the score is printed and nothing is returned;
            when False the score is returned without printing.
    """
    class_ids = [np.argmax(scores) for scores in clf.predict(X)]
    score = f1_score(y, class_ids, average='micro')
    if not print_:
        return score
    print(f"f1 score: {score}")

# Model SVM

In [None]:
def data_split(X_t, y_t, X_v, y_v):
    """
    Merge a training and a validation set and describe the split for GridSearchCV.

    return: (X, y, split) where X / y are the training rows followed by the
            validation rows and split is a PredefinedSplit with a single fold
            that is fitted on the training rows and scored on the validation rows.
    """
    # Training rows first, validation rows after
    X = np.concatenate((X_t, X_v), axis=0)
    y = np.concatenate((y_t, y_v), axis=0)

    # test_fold semantics: -1 = never part of a test set, k >= 0 = member of test fold k.
    # Training rows must be -1; any other label would create a second fold that
    # is scored on the training rows.
    split_index = np.concatenate((np.full(X_t.shape[0], -1), np.zeros(X_v.shape[0], dtype=int)), axis=0)

    return X, y, PredefinedSplit(test_fold=split_index)

## Test on a sub-set 

### spider and cat

In [None]:
def get_sub_set(X, y, animals):
    """
    Keep only the samples whose label is one of `animals`.

    X: array-like of samples, y: array-like of labels (same length as X),
    animals: iterable of the class ids to keep.
    return: (x_sub, y_sub) arrays in the original order. An empty selection
            keeps the trailing dimensions of X, e.g. shape (0, rows, cols).
    raises: ValueError if X and y do not have the same length.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

    # list() so that sets / generators are compared element-wise by np.isin
    keep = np.isin(y, list(animals))
    return X[keep], y[keep]

def print_parity(y):
    """Print a two-column table: each distinct label of `y` and how often it occurs."""
    labels, occurrences = np.unique(y, return_counts=True)
    print(np.column_stack((labels, occurrences)))

# Spider (id 9) and cat (id 2) only
animals = [9,2]

# Training subset
x_sub, y_sub = get_sub_set(X_train, y_train, animals)

# Evaluation subset, taken from the validation split
x_sub_t, y_sub_t = get_sub_set(X_valid, y_valid, animals)

print(f"Number of train data: {len(x_sub)}")
print("training: ")    
print_parity(y_sub)

print(f"\nNumber of test data: {len(x_sub_t)}")
print("testing: ")
print_parity(y_sub_t)

# Flatten every image to one feature row for the scikit-learn models
X_sub = resaphe(x_sub)
X_sub_t = resaphe(x_sub_t)

In [None]:
from sklearn.linear_model import SGDClassifier  # NOTE(review): not used in the visible notebook
# Linear SVM on standardized pixels, with fixed hyperparameters (no search is run in this cell)
clf = make_pipeline(StandardScaler(), LinearSVC(C=0.5, tol=1e-3))
clf.fit(X_sub, y_sub)

# Predict on the evaluation subset
y_pred = clf.predict(X_sub_t)

# Print the accuracy as a percentage
print(f"The model is {accuracy_score(y_pred,y_sub_t)*100}% accurate")

In [None]:
# SVC on standardized pixels, with fixed hyperparameters (no search is run in this cell)
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',  C=100))
clf.fit(X_sub, y_sub)

# Predict on the evaluation subset
y_pred = clf.predict(X_sub_t)

# Print the accuracy as a percentage
print(f"The model is {accuracy_score(y_pred,y_sub_t)*100}% accurate")

### Spider, cat, horse and goat

In [None]:
# Spider (9), cat (2), goat (7) and horse (8)
animals = [9, 2, 7, 8]

# Training subset
x_sub, y_sub = get_sub_set(X_train, y_train, animals)

# Evaluation subset, taken from the validation split
x_sub_t, y_sub_t = get_sub_set(X_valid, y_valid, animals)

print(f"Number of train data: {len(x_sub)}")
print("training: ")    
print_parity(y_sub)

print(f"\nNumber of test data: {len(x_sub_t)}")
print("testing: ")
print_parity(y_sub_t)
# Flatten every image to one feature row for the scikit-learn models
X_sub = resaphe(x_sub)
X_sub_t = resaphe(x_sub_t)
# SVC on standardized pixels with gamma='auto' and otherwise default hyperparameters
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_sub, y_sub)

# Predict on the evaluation subset
y_pred = clf.predict(X_sub_t)

# Print the accuracy as a percentage
print(f"The model is {accuracy_score(y_pred,y_sub_t)*100}% accurate")

## With all the animals

In [None]:
# All eleven classes (ids 0 to 10)
animals = [0,1,2,3,4,5,6,7,8,9,10]

# Training subset
x_sub, y_sub = get_sub_set(X_train, y_train, animals)

# Evaluation subset, taken from the validation split
x_sub_t, y_sub_t = get_sub_set(X_valid, y_valid, animals)

print(f"Number of train data: {len(x_sub)}")
print("training: ")    
print_parity(y_sub)

print(f"\nNumber of test data: {len(x_sub_t)}")
print("testing: ")
print_parity(y_sub_t)

In [None]:
# Flatten every image to one feature row
X_sub, X_sub_t = resaphe(x_sub), resaphe(x_sub_t)

# Polynomial-kernel SVC on standardized pixels
poly_svc = SVC(kernel='poly')
clf = make_pipeline(StandardScaler(), poly_svc)
clf.fit(X_sub, y_sub)

# Predict on the evaluation subset and report the accuracy as a percentage
y_pred = clf.predict(X_sub_t)
print(f"The model is {accuracy_score(y_pred,y_sub_t)*100}% accurate")

In [None]:
# Grid search over the SVC kernel, scored on the predefined validation fold
X_sub = resaphe(x_sub)
X_sub_t = resaphe(x_sub_t)
X, y, split = data_split(X_sub, y_sub, X_sub_t, y_sub_t)
# Wider grid, kept for reference:
# test_params = {'C':[0.1,1,10,100],'gamma':[0.0001,0.001,0.1,1],'kernel':['rbf','poly']}
test_params = {'kernel':['rbf','poly', 'linear', 'sigmoid']}

svc = svm.SVC(probability=True)

# refit=False: the search only ranks the candidates; the final model is fitted below
grid_search = GridSearchCV(svc, param_grid=test_params, cv=split, refit=False, verbose=3, n_jobs=-1)
grid_search.fit(X, y)
print(grid_search.best_params_)

# The best parameters were searched for SVC (they contain `kernel`), so the final
# model must be an SVC too: LinearSVC has no `kernel` argument and would raise TypeError.
SVM_model = svm.SVC(**grid_search.best_params_).fit(X, y)

# Random forest

In [None]:
def get_sub_set(X, y, animals):
    """
    Keep only the samples whose label is one of `animals`.

    NOTE(review): this function is also defined in the SVM section of the
    notebook; keep the two definitions in sync or delete one of them.

    X: array-like of samples, y: array-like of labels (same length as X),
    animals: iterable of the class ids to keep.
    return: (x_sub, y_sub) arrays in the original order. An empty selection
            keeps the trailing dimensions of X, e.g. shape (0, rows, cols).
    raises: ValueError if X and y do not have the same length.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

    # list() so that sets / generators are compared element-wise by np.isin
    keep = np.isin(y, list(animals))
    return X[keep], y[keep]

def print_parity(y):
    """Print a two-column table: each distinct label of `y` and how often it occurs."""
    labels, occurrences = np.unique(y, return_counts=True)
    print(np.column_stack((labels, occurrences)))

# All eleven classes: valid ids are 0 to 10 (see id_ref); 11 is not a class
animals = [0,1,2,3,4,5,6,7,8,9,10]

# Training subset
x_sub, y_sub = get_sub_set(X_train, y_train, animals)

# Evaluation subset, taken from the validation split
x_sub_t, y_sub_t = get_sub_set(X_valid, y_valid, animals)

print(f"Number of train data: {len(x_sub)}")
print("training: ")
print_parity(y_sub)

print(f"\nNumber of test data: {len(x_sub_t)}")
print("testing: ")
print_parity(y_sub_t)

# Flatten every image to one feature row for the scikit-learn models
X_sub = resaphe(x_sub)
X_sub_t = resaphe(x_sub_t)

In [None]:
# Hyperparameter sweeps. Every iteration fits two models and prints the micro-F1
# of each on the evaluation subset, followed by the value being swept.
Depths = [50,45,40,35,30,25,None]
Pruning_Alpha = [10,100,1000]
# Sweep 1: cost-complexity pruning strength (ccp_alpha)
# NOTE(review): the second model here is a DecisionTreeClassifier (entropy), not a random forest.
for j in range(len(Pruning_Alpha)):
    Model = RandomForestClassifier(criterion='gini',random_state=0,ccp_alpha=Pruning_Alpha[j])
    Model.fit(X_sub, y_sub)
    f1_bench(Model, X_sub_t, y_sub_t, print_=True)
    Model = DecisionTreeClassifier(criterion='entropy',random_state=0,ccp_alpha=Pruning_Alpha[j])
    Model.fit(X_sub, y_sub)
    f1_bench(Model,X_sub_t, y_sub_t, print_=True)
    print('alpha value =', Pruning_Alpha[j])
# Sweep 2: maximum tree depth (None = no depth limit passed to the estimator)
# NOTE(review): again a random forest (gini) versus a single decision tree (entropy).
for j in range(len(Depths)):
    Model = RandomForestClassifier(criterion='gini',random_state=0,max_depth = Depths[j])
    Model.fit(X_sub, y_sub)
    f1_bench(Model, X_sub_t, y_sub_t, print_=True)
    Model = DecisionTreeClassifier(criterion='entropy',random_state=0,max_depth=Depths[j])
    Model.fit(X_sub, y_sub)
    f1_bench(Model, X_sub_t, y_sub_t, print_=True)
    print('Depths Value =', Depths[j])
# Sweep 3: number of trees, random forest with gini versus random forest with entropy
n_estimator = [10,20,50,100,200, 500, 1000, 2000]
for j in range(len(n_estimator)):
    Model = RandomForestClassifier(n_estimators = n_estimator[j], criterion='gini',random_state=0)
    Model.fit(X_sub, y_sub)
    f1_bench(Model, X_sub_t, y_sub_t, print_=True)
    Model = RandomForestClassifier(n_estimators = n_estimator[j], criterion='entropy',random_state=0)
    Model.fit(X_sub, y_sub)
    f1_bench(Model, X_sub_t, y_sub_t, print_=True)
    print('n value =', n_estimator[j])

# Hard-coded scores used for the plots below.
# NOTE(review): presumably copied by hand from a previous run of the loops above;
# they are not recomputed here - TODO confirm they match the current data.
# The two Vector_alpha_* lists are not plotted.
Depths = [50,45,40,35,30,25,None]
Vector_alpha_gini = [0.16610597140454164, 0.16610597140454164, 0.16610597140454164]
Vector_alpha_entropy = [0.16610597140454164, 0.16610597140454164, 0.16610597140454164]
Vector_Depths_gini = [0.33936080740117747, 0.33936080740117747, 0.34314550042052144, 0.340201850294365, 0.3435660218671152, 0.32253994953742643, 0.33936080740117747]
Vector_Depths_entropy = [0.19091673675357446, 0.19091673675357446, 0.19091673675357446, 0.19091673675357446, 0.19091673675357446, 0.19091673675357446, 0.19091673675357446]
Vector_n_gini = [0.255677039529016, 0.2994112699747687, 0.3301093355761144, 0.33936080740117747, 0.35407905803195955, 0.35912531539108494, 0.36206896551724144, 0.3549201009251472]
Vector_n_entropy = [0.255677039529016, 0.27544154751892347, 0.3036164844407065, 0.3288477712363331, 0.34146341463414637, 0.34440706476030275, 0.3528174936921783, 0.34524810765349034]

# F1 score against maximum depth (the last x value is None, i.e. the unlimited-depth run)
plt.plot(Depths, Vector_Depths_gini, label = "Gini")
plt.plot(Depths, Vector_Depths_entropy, label = "Entropy")
plt.xlabel('Depths')
plt.ylabel('f1 score')
plt.legend()
plt.show()
# F1 score against number of trees
plt.plot(n_estimator, Vector_n_gini, label = "Gini")
plt.plot(n_estimator, Vector_n_entropy, label = "Entropy")
plt.xlabel('n_estimators')
plt.ylabel('f1 score')
plt.legend()
plt.show()

# CNN

Creation of the model architecture, using a Convolutional Neural Network approach

In [21]:
# Model Architecture 
def _add_conv_bn_relu(model, filters, **conv_kwargs):
    """Append Conv2D(3x3) -> BatchNormalization -> ReLU to `model`."""
    model.add(layers.Conv2D(filters, (3, 3), **conv_kwargs))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))


def create_model(init_conv_size, conv_size, dense_size, num_classes=11, input_shape=(96, 96, 1)):
    """
    Build the (uncompiled) CNN classifier.

    init_conv_size: number of filters of the first, stride-2 convolution.
    conv_size: number of filters of the second stage; later stages use conv_size * 2.
    dense_size: units of the first dense layer; the second dense layer has dense_size // 2.
    num_classes: size of the softmax output (default 11, the value previously hard-coded).
    input_shape: shape of one input image (default (96, 96, 1), as previously hard-coded).
    return: a keras Sequential model with the same layer sequence as before.
    """
    model = models.Sequential()

    # Stem: the only strided convolution (default padding)
    _add_conv_bn_relu(model, init_conv_size, strides=2, input_shape=input_shape)

    # Each stage: 2x2 max-pooling followed by `n_convs` same-padded convolutions
    stages = ((conv_size, 2), (conv_size * 2, 2), (conv_size * 2, 2), (conv_size * 2, 1))
    for filters, n_convs in stages:
        model.add(layers.MaxPooling2D((2, 2), strides=2))
        for _ in range(n_convs):
            _add_conv_bn_relu(model, filters, strides=1, padding='same')
    model.add(layers.MaxPooling2D((2, 2), strides=2))

    # Classifier head
    model.add(layers.Flatten())
    model.add(layers.Dense(dense_size, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.5))
    # Integer division: the number of units must be an int (dense_size / 2 is a float)
    model.add(layers.Dense(dense_size // 2, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.65))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

### Model training

In [22]:
def lr_decay(epoch, lr):
  """
  Exponentially decayed learning rate for `epoch`.

  The result is init_lr * exp(-0.02 * epoch), where `init_lr` is a notebook-level
  variable that must be set before training starts. The `lr` argument is not used.
  """
  decay_rate = 0.02
  decay_factor = math.exp(-decay_rate * epoch)
  return init_lr * decay_factor


To reproduce our Kaggle results, follow the procedure of this section for models 1, 2 and 3.

1. Copy and paste the configuration of each model into the code cell that follows the model descriptions below
2. Run the code for each section. Stop the model training at the specified epoch.


Model 1:

```
# Trained with:
train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
train_generator = train_datagen.flow(X_train.reshape((len(X_train), 96, 96, 1)), Y_train, batch_size=16)

# Random Seed
seed = 100
fix_seed_random(seed)

# Model Name
model_name = 'model_1'

# Stoped at
# Epoch=00069
```

Model 2:

```
# Trained with:
train_datagen = ImageDataGenerator(
        shear_range=0.18,
        zoom_range=0.18,
        horizontal_flip=True)
train_generator = train_datagen.flow(X_train.reshape((len(X_train), 96, 96, 1)), Y_train, batch_size=16)

# Random Seed
seed = 200
fix_seed_random(seed)

# Model Name
model_name = 'model_2'

# Stoped at
# Epoch=96
```

Model 3: With Gaussian filtered data !

```
# Trained with:
train_datagen = ImageDataGenerator(
        shear_range=0.22,
        zoom_range=0.22,
        horizontal_flip=True)
train_generator = train_datagen.flow(X_train.reshape((len(X_train), 96, 96, 1)), Y_train, batch_size=16)

# Random Seed
seed = 300
fix_seed_random(seed)

# Model Name
model_name = 'model_3'

# Stoped at
# Epoch=60
```

Model 4: With Edge filtered data !

```
# Trained with:
train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
train_generator = train_datagen.flow(X_train.reshape((len(X_train), 96, 96, 1)), Y_train, batch_size=16)

# Random Seed
seed = 400
fix_seed_random(seed)

# Model Name
model_name = 'model_4'

# Stoped at
# Epoch=85
```


In [23]:
# Training configuration of the model to train (this is the Model 1 configuration
# from the list above; replace it to train another model).
# Data augmentation: random shear, random zoom and horizontal flips
train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
# Training images get an explicit channel axis: (n, 96, 96, 1); batches of 16
train_generator = train_datagen.flow(X_train.reshape((len(X_train), 96, 96, 1)), Y_train, batch_size=16)

# Random Seed
seed = 100
fix_seed_random(seed)

# Model Name (used to build the directories the model is saved to)
model_name = 'model_1'



In [24]:
# Model creation: layer sizes passed to create_model
init_conv_sizes = 64
conv_sizes = 128
dense_sizes = 1024
# Validation micro-F1 of every training run of this session
f1_scores = []

model = create_model(init_conv_sizes, conv_sizes, dense_sizes)
# Categorical cross-entropy matches the one-hot targets (Y_*)
model.compile(optimizer='adam',
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=['accuracy'])

model.summary()

In [None]:
# Random Seed
fix_seed_random(seed)

# Initial learning rate; lr_decay reads it through this notebook-level variable
init_lr = 0.001

# Validation images with an explicit channel axis, like the images fed by train_generator.
# (This name was used below without ever being defined, which raised a NameError.)
X_valid_cnn = X_valid.reshape((len(X_valid), 96, 96, 1))

model.fit(train_generator, steps_per_epoch = len(X_train) // 16, epochs=150, validation_data=(X_valid_cnn, Y_valid), callbacks=[LearningRateScheduler(lr_decay, verbose=1)])
micro_f1 = f1_bench_nn(model,X_valid_cnn,y_valid, False)
f1_scores.append(micro_f1)

In [None]:
# Micro-F1 on the training set, the validation set and the held-out report set
f1_bench_nn(model,X_train, y_train, True)
f1_bench_nn(model,X_valid,y_valid, True)
f1_bench_nn(model,X_valid_2,y_valid_2, True)

# MAKE PREDICTION WITH THE DATA

### Save The model

In [None]:
# Save the model trained on the training set only (before any training on validation data)
from datetime import datetime  # NOTE(review): not used in the visible notebook
model_path_pre_valid = f'saved_{model_name}/model_without_validation'
model.save(model_path_pre_valid)

### Import an old model

In [None]:
# Path of the saved model to continue from (the one saved before training on validation data)
path_model_to_finish = f'saved_{model_name}/model_without_validation'
model_ = tf.keras.models.load_model(path_model_to_finish)

# Check the scores before training on the validation data:
# training set, validation set and held-out report set
f1_bench_nn(model_ , X_train, y_train, True)
f1_bench_nn(model_ , X_valid, y_valid, True)
f1_bench_nn(model_,X_valid_2,y_valid_2, True)

### Add the validation Data 

In [None]:
# ~~~~~~~~~~ TRAIN WITH GENERATOR ~~~~~~~~~~
# Continue training `model_` on the augmented validation set.
fix_seed_random(seed)
valid_generator = train_datagen.flow(X_valid.reshape((len(X_valid), 96, 96, 1)), Y_valid, batch_size=16)
# lr_decay reads `init_lr`, so the decay restarts from this smaller value
init_lr = 0.0003
epochs_valid = 12
# The roles are swapped on purpose here: fit on the validation data, monitor on the training data
model_.fit(valid_generator, steps_per_epoch = len(X_valid) // 16, epochs=epochs_valid, validation_data=(X_train, Y_train), callbacks=[LearningRateScheduler(lr_decay, verbose=1)])

# Only the last score (X_valid_2) is measured on data the model never trained on
f1_bench_nn(model_,X_train, y_train, True)
f1_bench_nn(model_,X_valid,y_valid, True)
f1_bench_nn(model_,X_valid_2,y_valid_2, True)
model_.save(f'saved_{model_name}/model_generator_validation')

In [None]:
# ~~~~~~~~~~ TRAIN WITH VALIDATION ~~~~~~~~~~
# Alternative to the generator variant: reload the model saved before any
# validation training and fit it on the validation arrays directly
# (no augmentation generator and no learning-rate scheduler callback).
fix_seed_random(seed)
path_model_to_finish = f'saved_{model_name}/model_without_validation'
model_ = tf.keras.models.load_model(path_model_to_finish)

model_.fit(X_valid, Y_valid, epochs=8, batch_size=16)

# Only the last score (X_valid_2) is measured on data the model never trained on
f1_bench_nn(model_,X_train, y_train, True)
f1_bench_nn(model_,X_valid,y_valid, True)
f1_bench_nn(model_,X_valid_2,y_valid_2, True)

model_.save(f'saved_{model_name}/model_validation')

### Make prediction

In [None]:
def create_pred_file(preds, file_name='df_remise.csv'):
    """
    Write the predictions to a CSV file with the columns `Id` and `class`.

    preds: sequence of predicted classes; `Id` is the position of each prediction.
    file_name: path of the CSV file to write (comma separated, no index column).
    """
    submission = pd.DataFrame({
        'Id': list(range(len(preds))),
        'class': preds,
    })
    submission.to_csv(file_name, sep=',', index=False)

# USE THE SAME PREPROCESSING AS THE DATA USE FOR TRAINING !!!!!!!
# X_test =  np.array([edge_filter(img)[2] for img in process_images(x_test_raw, process_image)])
# X_test =  np.array([gaussian_filter(img)[2] for img in process_images(x_test_raw, process_image)])
# X_test = np.array([img for img in process_images(x_test_raw, process_image)])

In [None]:
# Predictions of the models trained on unfiltered (only normalized) images
files = ['saved_model_1/model_validation', 'saved_model_2/model_validation']

# Same preprocessing as for training: scale by 1/255, then add the channel axis
X_test = np.array(process_images(x_test_raw, process_image))
X_test_cnn = np.array([x.reshape((96,96,1)) for x in X_test])

# One CSV file per model, written next to the saved model
for model_file in files:
  model_ = tf.keras.models.load_model(model_file)
  preds = model_.predict(X_test_cnn)
  y_pred = [np.argmax(p) for p in preds]
  pred_file_name = f'{model_file}.csv'
  create_pred_file(y_pred, pred_file_name)

In [None]:
# Predictions of the models trained on Gaussian-enhanced images
files = [
      'saved_model_3/model_validation',
      'saved_model_3/model_without_validation',
      'saved_model_3/model_generator_validation',
]

# Same preprocessing as for training: scale by 1/255, keep the enhanced image
# (output [2] of gaussian_filter), then add the channel axis
normalized_test = process_images(x_test_raw, process_image)
X_test =  np.array([gaussian_filter(img)[2] for img in normalized_test])
X_test_cnn = np.array([x.reshape((96,96,1)) for x in X_test])

# One CSV file per model, written next to the saved model
for model_file in files:
  model_ = tf.keras.models.load_model(model_file)
  preds = model_.predict(X_test_cnn)
  y_pred = [np.argmax(p) for p in preds]
  pred_file_name = f'{model_file}.csv'
  create_pred_file(y_pred, pred_file_name)

# Model Ensemble

Testing the ensemble model

In [None]:
# Test on the held-out report set (valid set 2): example with Model_3
# NOTE(review): Model 3 is described above as trained on Gaussian-filtered data;
# X_valid_2 must have gone through the same preprocessing - confirm.
model_files = [
      'saved_model_3/model_without_validation',
      'saved_model_3/model_validation',
      'saved_model_3/model_generator_validation',
]

# preds[k] collects the class voted by every model for sample k
preds = [[] for _ in range(len(y_valid_2))]
for model_file in model_files:
  model_ = tf.keras.models.load_model(model_file)
  pred = model_.predict(X_valid_2)
  for votes, scores in zip(preds, pred):
    votes.append(np.argmax(scores))

print(preds)

def most_frequent(List):
    """
    Majority vote over `List`.

    return: the element that occurs most often; when several elements share the
            highest count, the one that appears first in `List`. When no element
            occurs more than once, a randomly chosen element.
    raises: IndexError if `List` is empty (raised by random.choice).
    """
    # Imported inside the function instead of `import random` at cell level:
    # the notebook-level name `random` is bound to the function random.random
    # by the imports cell and must not be replaced by the module.
    from collections import Counter
    from random import choice

    # Drawn unconditionally so the random stream is consumed once per call
    fallback = choice(List)

    # Counter keeps first-occurrence order and max() returns the first maximum,
    # so ties go to the element seen first.
    occurrences = Counter(List)
    winner, top_count = max(occurrences.items(), key=lambda item: item[1])
    return winner if top_count > 1 else fallback
# Majority vote per sample. `y_pred` is rebuilt from scratch here: appending to
# the existing global would extend the predictions left over from the test-set
# cells and misalign every comparison below.
y_pred = [most_frequent(preds[i]) for i in range(len(y_valid_2))]

# Accuracy of the ensemble on the held-out report set
good_pred = sum(1 for i in range(len(y_valid_2)) if y_valid_2[i] == y_pred[i])

print(good_pred/len(y_valid_2))

Using the model ensemble on the test set

In [None]:
# Prediction files of the models that vote in the ensemble (one DataFrame per model),
# as written next to each saved model by the prediction cells.
model_pred_df = [
    # Alternative location of the same kind of files, kept for reference:
    # pd.read_csv("Pred/model_1_validation.csv"),
    # pd.read_csv("Pred/model_2_validation.csv"),
    # pd.read_csv("Pred/model_3_validation.csv"),
    # pd.read_csv("Pred/model_3_without_validation.csv"),

    pd.read_csv('saved_model_1/model_validation.csv'), 
    pd.read_csv('saved_model_2/model_validation.csv'),
    pd.read_csv('saved_model_3/model_validation.csv'),
    pd.read_csv('saved_model_3/model_without_validation.csv')
]

def get_pred(i):
  """Return the list of classes predicted for row `i`, one entry per DataFrame of model_pred_df."""
  return [model_df.iloc[i]['class'] for model_df in model_pred_df]

In [None]:
# Exploration: votes of all models for one test sample
i = 1
pred_i = get_pred(i)
print(pred_i)

# Check that the majority vote behaves as expected on this sample
print(most_frequent(pred_i))

In [None]:
# Generate the submission file from the combination (majority vote) of all models
n_samples = len(model_pred_df[0])
preds = [most_frequent(get_pred(row)) for row in range(n_samples)]

create_pred_file(preds, "df_remise_multiple_model.csv")

In [None]:
# Archive /content and download it through the google.colab helper (Colab-only cell).
# NOTE(review): the archive is written inside the directory being zipped, and this
# import rebinds the name `files` used as a list of model paths in earlier cells.
!zip -r /content/file.zip /content
from google.colab import files
files.download("/content/file.zip")