# This notebook provides the functionality to build, train, and test a CNN for predicting mosquito age, grouped age, species, and status.

## Structure:
* Import packages to be used.
* Load mosquito data.
* Define functions for plotting, visualisation, and logging.
* Define a function to build the CNN.
* Define a function to train the CNN.
* Main section to organise data, define the CNN, and call the building and training of the CNN.


In [1]:
import pylab as pl
import datetime
import pandas as pd
import itertools
from itertools import cycle
import pickle
import random as rn
import os
from time import time
from tqdm import tqdm

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils import resample

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras import layers, metrics
from keras.layers import Input
from keras.layers.merge import Concatenate
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import model_from_json, load_model
from keras.regularizers import *
from keras.callbacks import CSVLogger
from keras import backend as K

Using TensorFlow backend.


In [2]:
# Fixed seed kept in the module-level name `rand_seed` so that runs are repeatable.
# To draw a fresh seed instead, use:
# rand_seed = np.random.randint(low=0, high=100)
rand_seed = 16
print(rand_seed)

16


In [3]:
## NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
## here probably does not change hashing in this already-running kernel -- confirm.
os.environ['PYTHONHASHSEED'] = '0'

## Seed NumPy's global random number generator so it starts in a well-defined initial state.
np.random.seed(42)

## Seed core Python's `random` module (imported as `rn`) so it starts in a well-defined state.
rn.seed(12345)

## NOTE: the session configurations below, which would restrict TensorFlow's threading / device use,
## are commented out, so TensorFlow is NOT forced to a single thread in this notebook.
## Multiple threads are a potential source of non-reproducible results.
## For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
# session_conf = tf.ConfigProto(device_count = {'GPU':0}, intra_op_parallelism_threads=4) #session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# session_conf = tf.ConfigProto(device_count = {'GPU':0}) #session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
#session_conf.gpu_options.per_process_gpu_memory_fraction = 0.5
## The below tf.set_random_seed() will make random number generation
## in the TensorFlow backend have a well-defined initial state.
## For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.set_random_seed(1234)

## Configure the session with per_process_gpu_memory_fraction=0.5.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

## Create a session on the default graph with those GPU options and register it with the Keras backend.
sess = tf.Session(graph=tf.get_default_graph(), config=tf.ConfigProto(gpu_options=gpu_options))
K.set_session(sess)

## Load the data

The data file is created using Loco Mosquito:
https://github.com/magonji/MIMI-project/blob/master/Loco%20mosquito%204.0.ipynb

### The data file has headings: Species - Status - RearCnd - Age - Country - Frequencies

In [4]:
# df = pd.read_csv("/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0.dat", '\t')

# df['AgeGroup'] = 0
# df['AgeGroup'] = np.where(df['Age']>10, 2, np.where(df['Age']>4, 1, 0))

# df['Temp'] = 0
# df['Humidity'] = 0

# cols = df.columns.tolist()
# cols = cols[:6] + cols[-3:] + cols[6:-3]

# df = df[cols]

# for index, rows in tqdm(df.iterrows()):
#     if rows['RearCnd']=='TL':    
#         if rows['Country']=='S':
#             if rows['Species']=='AA':
#                 rows['Temp']=26
#                 rows['Humidity']=78
#             if rows['Species']=='AC':
#                 rows['Temp']=26
#                 rows['Humidity']=80
#             if rows['Species']=='AG':
#                 rows['Temp']=26
#                 rows['Humidity']=83
#         if rows['Country']=='B':
#             if rows['Species']=='AC':
#                 rows['Temp']=27.5
#                 rows['Humidity']=86.1
#             if rows['Species']=='AG':
#                 rows['Temp']=27.5
#                 rows['Humidity']=86.1
#         if rows['Country']=='T':
#             if rows['Species']=='AA':
#                 rows['Temp']=27
#                 rows['Humidity']=80
#             if rows['Species']=='AG':
#                 rows['Temp']=27
#                 rows['Humidity']=80
#     if rows['RearCnd']=='TF':    
#         if rows['Country']=='B':
#             if rows['Species']=='AC':
#                 rows['Temp']=30.3
#                 rows['Humidity']=64.3
#             if rows['Species']=='AG':
#                 rows['Temp']=30.3
#                 rows['Humidity']=64.3
#         if rows['Country']=='T':
#             if rows['Species']=='AA':
#                 rows['Temp']=27
#                 rows['Humidity']=80
#             if rows['Species']=='AG':
#                 rows['Temp']=27
#                 rows['Humidity']=80
#     if rows['RearCnd']=='TV':    
#         if rows['Country']=='B':
#             if rows['Species']=='AC':
#                 rows['Temp']=28.9
#                 rows['Humidity']=76.3
#             if rows['Species']=='AG':
#                 rows['Temp']=29
#                 rows['Humidity']=73.7
#         if rows['Country']=='T':
#             if rows['Species']=='AA':
#                 rows['Temp']=28.3
#                 rows['Humidity']=84.5
#             if rows['Species']=='AG':
#                 rows['Temp']=28.3
#                 rows['Humidity']=84.5
#     df.loc[index] = rows
                
                
# print(df.head(10))




In [5]:
# df.to_csv("/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0_inc_temp.dat", '\t')


In [6]:
# df = pd.read_csv("/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0.dat", '\t')
# df.head()
# print(df.groupby('RearCnd').size())

In [7]:
class data_loader_class():
    """Build a training set and a held-out field (VF) test set from the mosquito data file.

    The tab-separated data file is read, an ``AgeGroup`` column is added
    (0: Age <= 4, 1: 4 < Age <= 10, 2: Age > 10) and the rows are split into

    * field rows: ``RearCnd == 'VF'`` and ``Status == 'UN'``;
    * the remaining rows: ``RearCnd != 'VF'`` and ``Status != 'UN'``.

    From every (RearCnd, Country, Species) group listed in the sampling plan, up to a
    fixed number of rows per age group is drawn for training (400 for 'TL' groups,
    300 for 'TF' groups). For the field groups, ``valid_perc`` controls how many rows per
    age group are moved into the training set; every field row that is not moved forms
    the test set.

    NOTE(review): 'TL' rows with ``Country == 'S'`` are not part of the sampling plan
    and therefore never reach the training set -- confirm this is intended.

    Parameters
    ----------
    valid_perc : float
        Fraction used to size the per-age-group draw from each field group:
        ``int(valid_perc * n_country_field_rows / 2 / 3)``.
    data_path : str, optional
        Location of the tab-separated data file. Defaults to ``DEFAULT_DATA_PATH``.

    Attributes
    ----------
    X, y_age_groups, y_species : numpy.ndarray
        Features (columns ``6:-1`` of the frame), age group and species of the training rows.
    X_vf, y_age_groups_vf, y_species_vf : numpy.ndarray
        The same three arrays for the held-out field rows.
    """

    # Default location of the data file; pass ``data_path`` to read from elsewhere.
    DEFAULT_DATA_PATH = "/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0.dat"

    # Number of age groups produced by the AgeGroup binning in __init__.
    _N_AGE_GROUPS = 3

    @staticmethod
    def _select_group(frame, rear_cnd, country, species):
        """Return the rows of ``frame`` matching one rearing condition, country and species."""
        mask = ((frame['RearCnd']==rear_cnd)
                & (frame['Country']==country)
                & (frame['Species']==species))
        return frame[mask]

    @classmethod
    def _split_by_age_group(cls, group, size_inc, warn_label=None):
        """Shuffle each age group of ``group`` and split it into included / left-out rows.

        Returns two lists of DataFrames (one entry per age group, in age-group order):
        the first ``size_inc`` shuffled rows and the remaining rows. When ``warn_label``
        is given, a warning is printed for every age group holding fewer than
        ``size_inc`` rows.
        """
        included, left_out = [], []
        for age in range(cls._N_AGE_GROUPS):
            df_temp = group[group['AgeGroup']==age]
            order = np.arange(len(df_temp))
            if warn_label is not None and len(order) < size_inc:
                print('Warning {} VF group {} smaller than amount requested'.format(warn_label, age))
            # The global NumPy generator is deliberately re-seeded before every shuffle:
            # this keeps the selection identical from run to run (and leaves the global
            # generator in the same state as before for any code that runs afterwards).
            np.random.seed(42)
            np.random.shuffle(order)
            included.append(df_temp.iloc[order[:size_inc]])
            left_out.append(df_temp.iloc[order[size_inc:]])
        return included, left_out

    def __init__(self, valid_perc, data_path=None):
        if data_path is None:
            data_path = self.DEFAULT_DATA_PATH
        # `sep` is passed by keyword: newer pandas releases no longer accept it positionally.
        df = pd.read_csv(data_path, sep='\t')

        df['AgeGroup'] = np.where(df['Age']>10, 2, np.where(df['Age']>4, 1, 0))

        df_vf = df[(df['RearCnd']=='VF') & (df['Status']=='UN')]
        df_rest = df[(df['RearCnd']!='VF') & (df['Status']!='UN')]

        VF_size_t = len(df_vf[df_vf['Country']=='T'])
        VF_size_b = len(df_vf[df_vf['Country']=='B'])
        val_group_size_t = int(((((valid_perc*VF_size_t)/2)/3))) #/2 (t/b) /3 (age groups)
        val_group_size_b = int(((((valid_perc*VF_size_b)/2)/3)))

        # (RearCnd, Country, Species, rows requested per age group); the order fixes the
        # row order of the training set.
        reared_plan = [
            ('TL', 'T', 'AA', 400),
            ('TL', 'T', 'AG', 400),
            ('TL', 'B', 'AG', 400),
            ('TL', 'B', 'AC', 400),
            ('TF', 'T', 'AA', 300),
            ('TF', 'B', 'AG', 300),
            ('TF', 'B', 'AC', 300),
        ]
        # (Country, Species, rows requested per age group, label used in warnings).
        field_plan = [
            ('T', 'AA', val_group_size_t, 'Tanzania Arabiensis'),
            ('T', 'AG', val_group_size_t, 'Tanzania Gambie'),
            ('B', 'AG', val_group_size_b, 'Bobo Gambie'),
            ('B', 'AC', val_group_size_b, 'Bobo Colluzzi'),
        ]

        train_parts, test_parts = [], []
        for rear_cnd, country, species, size_inc in reared_plan:
            group = self._select_group(df_rest, rear_cnd, country, species)
            included, _ = self._split_by_age_group(group, size_inc)
            train_parts.extend(included)
        for country, species, size_inc, label in field_plan:
            group = self._select_group(df_vf, 'VF', country, species)
            included, left_out = self._split_by_age_group(group, size_inc, warn_label=label)
            train_parts.extend(included)
            # Field rows that were not moved into training form the test set.
            test_parts.extend(left_out)

        # Concatenate once instead of growing the frames inside the loops.
        df_train = pd.concat(train_parts)
        df_test = pd.concat(test_parts)

        print('Percentage of field mosquitoes inc {} - Num mosquitoes {} / {}'.format(valid_perc*100, len(df_train[df_train['RearCnd']=='VF']), len(df_vf)))
        print('Total number of mosquitoes in the Train set {}'.format(len(df_train)))

        # Features are the columns from position 6 up to, but excluding, the last column
        # (presumably the AgeGroup column added above -- verify against the file layout).
        self.X = np.asarray(df_train.iloc[:,6:-1])
        self.y_age_groups = np.asarray(df_train["AgeGroup"])
        self.y_species = np.asarray(df_train["Species"])

        self.X_vf = np.asarray(df_test.iloc[:,6:-1])
        self.y_age_groups_vf = np.asarray(df_test["AgeGroup"])
        self.y_species_vf = np.asarray(df_test["Species"])



In [8]:
def data_loader(valid_perc):
    """Create and return a ``data_loader_class`` instance built with ``valid_perc``."""
    loader = data_loader_class(valid_perc)
    return loader

In [9]:
# Build the data split for each `valid_perc` value in turn; the loader prints its
# summary lines and a separator is printed after every run.
field_fractions = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
for valid_inc_perc in field_fractions:
    data_loader(valid_inc_perc)
    print('------------------------------------------------------------')


Percentage of field mosquitoes inc 0 - Num mosquitoes 0 / 3277
Total number of mosquitoes in the Train set 7200
------------------------------------------------------------
Percentage of field mosquitoes inc 5.0 - Num mosquitoes 162 / 3277
Total number of mosquitoes in the Train set 7362
------------------------------------------------------------
Percentage of field mosquitoes inc 10.0 - Num mosquitoes 324 / 3277
Total number of mosquitoes in the Train set 7524
------------------------------------------------------------
Percentage of field mosquitoes inc 15.0 - Num mosquitoes 486 / 3277
Total number of mosquitoes in the Train set 7686
------------------------------------------------------------
Percentage of field mosquitoes inc 20.0 - Num mosquitoes 654 / 3277
Total number of mosquitoes in the Train set 7854
------------------------------------------------------------
Percentage of field mosquitoes inc 25.0 - Num mosquitoes 815 / 3277
Total number of mosquitoes in the Train set 8015

In [None]:
# Build the data split with valid_perc = 0.2 (the loader prints its own summary).
data_loader(0.2)

In [None]:
# Build the data split with valid_perc = 0.3 (the loader prints its own summary).
data_loader(0.3)

In [None]:
# Build the data split with valid_perc = 0.4 (the loader prints its own summary).
data_loader(0.4)

## Function used to create a new folder for the CNN outputs.
This guards against forgetting to name a new folder when trying out a new model variant, and thereby overwriting a day's training.

In [5]:
def build_folder(fold, to_build = False):
    """Create the output directory ``fold``, refusing to reuse an existing one.

    Parameters
    ----------
    fold : str
        Path of the directory.
    to_build : bool, optional
        When true the directory is created, including any missing parent
        directories. When false nothing is created; a message is printed if the
        directory is missing.

    Raises
    ------
    NameError
        If ``to_build`` is true and ``fold`` already exists, so that results already
        stored in that directory are not overwritten.
    """
    if os.path.isdir(fold):
        if to_build:
            raise NameError('Directory already exists, cannot be created!')
        return

    if to_build:
        # makedirs (rather than mkdir) so nested output paths can be created in one call.
        os.makedirs(fold)
    else:
        print('Directory does not exists, not creating directory!')

## Function for plotting confusion matrices
This normalizes the confusion matrix and ensures neat plotting for all outputs.

In [7]:
def plot_confusion_matrix(cm, classes, output, save_path, model_name, fold,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          printout=False):
    """Plot a confusion matrix and save it as a PNG file.

    Parameters
    ----------
    cm : numpy.ndarray
        Confusion matrix; rows are summed for normalisation.
    classes : sequence
        Tick labels for both axes.
    output : str
        Name of the model output; its first character is dropped when building the file name.
    save_path : str
        Prefix (directory including trailing separator) for the saved figure.
    model_name : str
        Model name used in the file name.
    fold : str
        Fold identifier used in the file name.
    normalize : bool, optional
        When true each row is divided by its sum before plotting.
    title : str, optional
        Currently unused (the title call is commented out).
    cmap : matplotlib colormap, optional
        Colormap passed to ``imshow``.
    printout : bool, optional
        When true the matrix is also printed.

    The figure is written to
    ``save_path + "Confusion_Matrix_" + model_name + "_" + fold + "_" + output[1:] + ".png"``
    and then closed. The global matplotlib font settings are changed as a side effect.
    """
    font = {'weight' : 'normal',
            'size'   : 18}

    matplotlib.rc('font', **font)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any sample are left at zero instead of dividing by zero,
        # which would fill the row (and the text-colour threshold) with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        if printout:
            print("Normalized confusion matrix")
    else:
        if printout:
            print('Confusion matrix, without normalization')

    if printout:
        print(cm)

    # Normalised values lie in [0, 1]; raw counts need a colour scale that reaches
    # the largest count, otherwise every non-zero cell saturates.
    colour_max = 1 if normalize else max(cm.max(), 1)

    plt.figure(figsize=(8,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap, vmin=0, vmax=colour_max)
#     plt.title([title+' - '+model_name])
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout(pad=2)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig((save_path+"Confusion_Matrix_"+model_name+"_"+fold+"_"+output[1:]+".png"))
    plt.close()

## Function used for visualizing outputs
This splits the output data into the four categories before plotting the confusion matrices.

In [8]:
## for visualizing losses and metrics once the neural network fold is trained
def visualize(histories, save_path, model_name, fold, classes, outputs, predicted, true, title='Confusion Matrix'):
    """Plot one confusion matrix per model output.

    ``predicted``, ``true``, ``classes`` and ``outputs`` are iterated in parallel; for each
    output the arg-max over the last axis of the predictions and of the true labels
    gives the class indices from which the confusion matrix is computed and plotted.
    ``histories`` is accepted but not used here.
    """
    per_output = zip(predicted, true, classes, outputs)
    for output_predictions, output_truth, class_names, output_name in per_output:
        # Convert per-class scores / one-hot rows into class indices.
        predicted_idx = np.argmax(output_predictions, axis=-1)
        true_idx = np.argmax(output_truth, axis=-1)
        matrix = confusion_matrix(true_idx, predicted_idx)
        plot_confusion_matrix(matrix, class_names, output_name, save_path, model_name, fold, title=title)

## Data logging

In [7]:
## for logging data associated with the model
def log_data(log, name, fold, save_path):
    """Write ``log`` as text to ``save_path + name + '_' + str(fold) + '_log.txt'``.

    Parameters
    ----------
    log : array-like
        Data handed to ``np.savetxt``.
    name : str
        Model name used in the file name.
    fold : any
        Fold identifier; converted with ``str``.
    save_path : str
        Prefix (directory including trailing separator) of the log file.
    """
    # The context manager closes the file even if np.savetxt raises.
    with open((save_path+name+'_'+str(fold)+'_log.txt'), 'w') as f:
        np.savetxt(f, log)

In [8]:
def save_obj(obj, name, savedir_main):
    """Pickle ``obj`` into ``savedir_main + name + '.pkl'`` with the highest pickle protocol."""
    target = savedir_main + name + '.pkl'
    handle = open(target, 'wb')
    try:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
    finally:
        handle.close()

In [9]:
def load_obj(name, savedir_main):
    """Return the object unpickled from ``savedir_main + name + '.pkl'``.

    Note: unpickling can execute arbitrary code, so only load files you trust.
    """
    source = savedir_main + name + '.pkl'
    handle = open(source, 'rb')
    try:
        restored = pickle.load(handle)
    finally:
        handle.close()
    return restored

## Function for graphing the training data
This function creates tidy graphs of loss and accuracy as the models are training.

In [12]:
def graph_history(history, model_name, model_ver_num, fold, save_path):
    """Save one line plot per training metric found in ``history.history``.

    Every key of ``history.history`` that does not start with ``"val"`` is plotted
    against the epoch index, together with its ``"val_"`` counterpart when one was
    recorded. Each figure is written to
    ``save_path + model_name + "_" + str(model_ver_num) + "_" + str(fold) + "_" + <key> + ".png"``
    and then closed. The global matplotlib font settings are changed as a side effect.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` dict that maps metric names to per-epoch values.
    model_name : str
        Model name used in the file names.
    model_ver_num : any
        Model version; converted with ``str``.
    fold : any
        Fold identifier; converted with ``str``.
    save_path : str
        Prefix (directory including trailing separator) of the saved figures.
    """
    font = {'weight' : 'normal',
            'size'   : 18}

    matplotlib.rc('font', **font)

    training_keys = [key for key in history.history.keys() if key[0:3] != "val"]
    for key in training_keys:
        plt.figure(figsize=(15,7))
        plt.plot(history.history[key], label=key)
        # A key may have no validation counterpart (e.g. training without
        # validation data); plot the training curve alone instead of raising KeyError.
        val_key = "val_"+key
        if val_key in history.history:
            plt.plot(history.history[val_key], label=val_key)
        plt.legend()
        plt.xlabel("epoch")
        plt.ylabel(key)
        plt.savefig(save_path +model_name+"_"+str(model_ver_num)+"_"+str(fold)+"_"+key+".png")
        plt.close()

## Function to create the CNN
This function takes as an input a list of dictionaries. Each element in the list is a new hidden layer in the model. For each layer the dictionary defines the layer to be used.

### Available options are:
Convolutional Layer:
* type = 'c'
* filter = optional number of filters
* kernel = optional size of the filters
* stride = optional size of stride to take between filters
* pooling = optional width of the max pooling
* {'type':'c', 'filter':16, 'kernel':5, 'stride':1, 'pooling':2}

dense layer:
* type = 'd'
* width = optional width of the layer
* {'type':'d', 'width':500}

In [13]:
def create_models(model_shape, input_layer):
    """Build and compile the multi-output network described by ``model_shape``.

    Parameters
    ----------
    model_shape : list of dict
        One dict per hidden layer, in order. ``{'type': 'c', 'filter', 'kernel',
        'stride', 'pooling'}`` adds Conv1D -> BatchNormalization -> MaxPooling1D;
        ``{'type': 'd', 'width'}`` adds a Dense block. Entries with any other
        ``'type'`` are ignored. The listed keys are read unconditionally.
    input_layer : int
        Length of the input vector; the model input has shape ``(input_layer, 1)``.

    Returns
    -------
    keras.models.Model
        Model with two softmax outputs, ``'age_group'`` (3 units) and ``'species'``
        (3 units), compiled with categorical cross-entropy, the ``'acc'`` metric and
        an SGD optimizer.
    """
    regConst = 0.02
    sgd = keras.optimizers.SGD(lr=0.003, decay=1e-5, momentum=0.9, nesterov=True, clipnorm=1.)
    cce = 'categorical_crossentropy'

    # Use the `input_layer` argument for the input length rather than a notebook-level global.
    input_vec = Input(name='input', shape=(input_layer,1))
    xd = input_vec

    for i, layer in enumerate(model_shape):
        layer_num = str(i+1)
        if layer['type'] == 'c':
            xd = Conv1D(name=('Conv'+layer_num), filters=layer['filter'],
             kernel_size = layer['kernel'], strides = layer['stride'],
             activation = 'relu',
             kernel_regularizer=l2(regConst),
             kernel_initializer='he_normal')(xd)
            xd = BatchNormalization(name=('batchnorm_'+layer_num))(xd)
            xd = MaxPooling1D(pool_size=(layer['pooling']))(xd)

        elif layer['type'] == 'd':
            if i == 0:
                # A dense first layer is applied directly to the input and is
                # followed (not preceded) by dropout.
                xd = Dense(name=('d'+layer_num), units=layer['width'], activation='relu',
                 kernel_regularizer=l2(regConst),
                 kernel_initializer='he_normal')(xd)
                xd = BatchNormalization(name=('batchnorm_'+layer_num))(xd)
                xd = Dropout(name=('dout'+layer_num), rate=0.5)(xd)
            else:
                # Flatten only at the transition from convolutional to dense layers.
                if model_shape[i-1]['type'] == 'c':
                    xd = Flatten()(xd)

                xd = Dropout(name=('dout'+layer_num), rate=0.5)(xd)
                xd = Dense(name=('d'+layer_num), units=layer['width'], activation='relu',
                 kernel_regularizer=l2(regConst),
                 kernel_initializer='he_normal')(xd)
                xd = BatchNormalization(name=('batchnorm_'+layer_num))(xd)

    # A 17-unit softmax 'age' head used to be attached here as well; it is disabled.
    xAgeGroup     = Dense(name = 'age_group', units = 3,
                     activation = 'softmax',
                     kernel_regularizer = l2(regConst),
                     kernel_initializer = 'he_normal')(xd)
    xSpecies = Dense(name ='species', units = 3,
                     activation = 'softmax',
                     kernel_regularizer = l2(regConst),
                     kernel_initializer = 'he_normal')(xd)

    outputs = [xAgeGroup, xSpecies]
    model = Model(inputs = input_vec, outputs = outputs)

    model.compile(loss=cce, metrics=['acc'],
                  optimizer=sgd)
#     model.summary()
    return model

## Function to train the model

This function splits the data into training and validation sets and calls the `create_models` function. It returns the model and the training history.

In [14]:
def train_models(model_to_test, save_path, SelectFreqs=False):
    """Train a single CNN on a 90/10 train/validation split of the given data.

    Parameters
    ----------
    model_to_test : dict
        Run configuration in which every value is wrapped in a one-element
        list. The keys read here are "model_shape", "model_name",
        "input_layer_dim", "model_ver_num", "labels" (a list with one label
        array per network output) and "features".
    save_path : str
        Path prefix, including the trailing separator, under which the CSV
        training log and the saved model file are written.
    SelectFreqs : bool, optional
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(out_model, out_history)``: the fitted model and the history object
        returned by ``fit``.
    """
    model_shape = model_to_test["model_shape"][0]
    model_name = model_to_test["model_name"][0]
    ## Read the input size from the run configuration so that the function
    ## does not depend on a notebook-level global of the same name.
    input_layer_dim = model_to_test["input_layer_dim"][0]
    model_ver_num = model_to_test["model_ver_num"][0]
    label = model_to_test["labels"][0]
    features = model_to_test["features"][0]

    ## Split into training / validation. train_test_split returns
    ## [X_train, X_val, y0_train, y0_val, y1_train, y1_val, ...].
    X_train, X_val, *label_splits = train_test_split(
        features, *label, test_size=0.1, shuffle=True, random_state=rand_seed)
    y_train = label_splits[::2]
    y_val = label_splits[1::2]

    out_model = create_models(model_shape, input_layer_dim)
    out_model.summary()

    ## Stop once val_loss has not improved for 400 epochs and log every epoch to CSV.
    training_callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_loss',
                                      patience=400, verbose=0, mode='auto'),
        CSVLogger(save_path+model_name+"_"+str(model_ver_num)+'.csv', append=True, separator=';'),
    ]
    out_history = out_model.fit(x = X_train,
                                y = y_train,
                                batch_size = 128*16,
                                verbose = 0,
                                epochs = 8000,
                                validation_data = (X_val, y_val),
                                callbacks = training_callbacks)
    ## The return value of this evaluation is not used further.
    out_model.evaluate(X_val, y_val)

    out_model.save((save_path+model_name+"_"+'Model.h5'))
    graph_history(out_history, model_name, 0, 0, save_path)

    return out_model, out_history

## Main section

Functionality:
* Organises the data into a format of lists of data, classes, labels.
* Define the CNN to be built.
* Define the KFold validation to be used.
* Build a folder to output data into.
* Standardize and organise data into training/testing.
* Call the model training.
* Organize outputs and call visualization for plotting and graphing.


In [15]:
## Validation-data inclusion study: for each inclusion fraction, load the data,
## train a two-output CNN (age group + species) and evaluate it on the held-out
## "_vf" set, collecting the evaluation metrics across all runs.

## Name a folder for the outputs to go into
outdir = "Results_Paper/"
build_folder(outdir, False)

savedir_main = (outdir+"Trian_Val_Inc/")
build_folder(savedir_main, True)

## One list per Keras metric name; each run appends one value.
val_results = {'loss':[], 'age_group_loss':[], 'species_loss':[], 'age_group_acc':[], 'species_acc':[]}
histories = []

## Declare the model architecture (identical for every run, so defined once)
model_size = [{'type':'c', 'filter':16, 'kernel':8, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':8, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':3, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':6, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':5, 'stride':1, 'pooling':2},
             {'type':'d', 'width':500}]

start_time = time()

for valid_inc_perc in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:

    savedir = (savedir_main+"valid_data_inc_"+str(valid_inc_perc)+"/")
    build_folder(savedir, True)

    data_extract = data_loader(valid_inc_perc)

    ## Input CNN Size
    input_layer_dim = len(data_extract.X[0])

    ## Transform Data
    ## Each binarizer is fitted once on the training labels and then reused for
    ## the held-out labels, so both one-hot matrices share the same columns in
    ## the same order even if a class is absent from one of the two sets.
    age_group_binarizer = MultiLabelBinarizer()
    species_binarizer = MultiLabelBinarizer()
    y_age_groups_list = [[age] for age in data_extract.y_age_groups]
    y_species_list = [[species] for species in data_extract.y_species]
    age_groups = age_group_binarizer.fit_transform(np.array(y_age_groups_list))
    age_group_classes = ["1-4", "5-10", "11-17"]
    species = species_binarizer.fit_transform(np.array(y_species_list))
    species_classes = list(np.unique(y_species_list))
    y_age_groups_list_vf = [[age] for age in data_extract.y_age_groups_vf]
    y_species_list_vf = [[species] for species in data_extract.y_species_vf]
    age_groups_vf = age_group_binarizer.transform(np.array(y_age_groups_list_vf))
    species_vf = species_binarizer.transform(np.array(y_species_list_vf))

    ## Labels
    labels_default, classes_default, outputs_default = [age_groups, species], [age_group_classes, species_classes], ['xAgeGroup', 'xSpecies']
    labels_default_vf, classes_default_vf, outputs_default_vf = [age_groups_vf, species_vf], [age_group_classes, species_classes], ['xAgeGroup', 'xSpecies']

    ## Name the model
    model_name = ('Valid_Inc_'+str(valid_inc_perc))

    ## Scale train, test
    ## NOTE(review): the scaler is fitted on the training and held-out features
    ## stacked together, so held-out statistics influence the scaling used for
    ## training. Confirm this is intended; fitting on data_extract.X alone
    ## would avoid it but would change the reported numbers.
    scl = StandardScaler()
    features_scl = scl.fit(X=np.vstack((data_extract.X, data_extract.X_vf)))
    X_train = features_scl.transform(X=data_extract.X)
    X_test = features_scl.transform(X=data_extract.X_vf)

    ## Label lists for training and for the held-out evaluation
    y_train, y_test = list(labels_default), list(labels_default_vf)

    ## Add a trailing channel axis: (samples, features) -> (samples, features, 1)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    model_to_test = {
        "model_shape" : [model_size], # defines the hidden layers of the model
        "model_name"  : [model_name],
        "input_layer_dim"  : [input_layer_dim], # size of input layer
        "model_ver_num"  : [0],
        "fold"  : [0], # kf.split number on
        "labels"   : [y_train],
        "features" : [X_train],
        "classes"  : [classes_default],
        "outputs"   : [outputs_default],
        "compile_loss": [{'age': 'categorical_crossentropy'}],
        "compile_metrics" :[{'age': 'accuracy'}]
    }

    ## Call function to train the model described by the dictionary
    model, history = train_models(model_to_test, savedir)
    histories.append(history)

    ## Evaluate on the held-out set and record every reported metric
    y_predicted = model.predict(X_test)
    temp_eval = model.evaluate(X_test, y_test)
    for metric, res in zip(model.metrics_names, temp_eval):
        val_results[metric].append(res)
    print(val_results)

    ## One nested list per network output, as plain Python lists
    predicted_labels = [output_pred.tolist() for output_pred in y_predicted]
    true_labels = [output_true.tolist() for output_true in y_test]

    ## Visualize the results
    visualize(histories, savedir, model_name, "0", classes_default, outputs_default, predicted_labels, true_labels)

    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()

    # Delete the Keras model with these hyper-parameters from memory.
    del model

end_time = time()
print('Run time : {} s'.format(end_time-start_time))
print('Run time : {} m'.format((end_time-start_time)/60))
print('Run time : {} h'.format((end_time-start_time)/3600))

save_obj(val_results, 'Validation_Results_Dict', savedir_main)

Percentage of field mosquitoes inc 0 - Num mosquitoes 0 / 3277
Total number of mosquitoes in the Train set 7200
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 1625, 1)      0                                            
__________________________________________________________________________________________________
Conv1 (Conv1D)                  (None, 1618, 16)     144         input[0][0]                      
__________________________________________________________________________________________________
batchnorm_1 (BatchNormalization (None, 1618, 16)     64          Conv1[0][0]                      
__________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D)  (None, 1618, 16)     0           batchnorm_1[0][0]              

{'loss': [4.942620219346187, 2.4815747823225363], 'age_group_loss': [1.9838111671610024, 1.2416391557832973], 'species_loss': [2.8875805031818556, 1.172873507935201], 'age_group_acc': [0.34330180042722, 0.5338683788121991], 'species_acc': [0.4876411351846201, 0.5852327448120087]}
Percentage of field mosquitoes inc 10.0 - Num mosquitoes 324 / 3277
Total number of mosquitoes in the Train set 7524
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 1625, 1)      0                                            
__________________________________________________________________________________________________
Conv1 (Conv1D)                  (None, 1618, 16)     144         input[0][0]                      
__________________________________________________________________________________________________
batchnor

{'loss': [4.942620219346187, 2.4815747823225363, 1.874446988065405, 1.484810124349953], 'age_group_loss': [1.9838111671610024, 1.2416391557832973, 0.9549200318758766, 0.7446512885569234], 'species_loss': [2.8875805031818556, 1.172873507935201, 0.8500014498976178, 0.6701478279969548], 'age_group_acc': [0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609], 'species_acc': [0.4876411351846201, 0.5852327448120087, 0.662038604849038, 0.7216051594624165]}
Percentage of field mosquitoes inc 20.0 - Num mosquitoes 654 / 3277
Total number of mosquitoes in the Train set 7854
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 1625, 1)      0                                            
__________________________________________________________________________________________________
Conv1 (Co

{'loss': [4.942620219346187, 2.4815747823225363, 1.874446988065405, 1.484810124349953, 1.2420932128599387, 1.0482862967964501], 'age_group_loss': [1.9838111671610024, 1.2416391557832973, 0.9549200318758766, 0.7446512885569234, 0.6229374602783558, 0.5180432014984391], 'species_loss': [2.8875805031818556, 1.172873507935201, 0.8500014498976178, 0.6701478279969548, 0.5495299646702605, 0.4602895513189023], 'age_group_acc': [0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813], 'species_acc': [0.4876411351846201, 0.5852327448120087, 0.662038604849038, 0.7216051594624165, 0.7651544032640417, 0.8050365556579213]}
Percentage of field mosquitoes inc 30.0 - Num mosquitoes 973 / 3277
Total number of mosquitoes in the Train set 8173
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (

{'loss': [4.942620219346187, 2.4815747823225363, 1.874446988065405, 1.484810124349953, 1.2420932128599387, 1.0482862967964501, 0.9535149013002714, 0.8382450232190277], 'age_group_loss': [1.9838111671610024, 1.2416391557832973, 0.9549200318758766, 0.7446512885569234, 0.6229374602783558, 0.5180432014984391, 0.4801018638536334, 0.42035708500754576], 'species_loss': [2.8875805031818556, 1.172873507935201, 0.8500014498976178, 0.6701478279969548, 0.5495299646702605, 0.4602895513189023, 0.4021792692753176, 0.3469646150334795], 'age_group_acc': [0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813, 0.8038194444444444, 0.8355079217148182], 'species_acc': [0.4876411351846201, 0.5852327448120087, 0.662038604849038, 0.7216051594624165, 0.7651544032640417, 0.8050365556579213, 0.8285590277777778, 0.863932898415657]}
Percentage of field mosquitoes inc 40.0 - Num mosquitoes 1294 / 3277
Total number of mosquitoes in the Train set 8494
____

{'loss': [4.942620219346187, 2.4815747823225363, 1.874446988065405, 1.484810124349953, 1.2420932128599387, 1.0482862967964501, 0.9535149013002714, 0.8382450232190277, 0.7638672286070661, 0.6683299169148484], 'age_group_loss': [1.9838111671610024, 1.2416391557832973, 0.9549200318758766, 0.7446512885569234, 0.6229374602783558, 0.5180432014984391, 0.4801018638536334, 0.42035708500754576, 0.3933762730340193, 0.3253837525273023], 'species_loss': [2.8875805031818556, 1.172873507935201, 0.8500014498976178, 0.6701478279969548, 0.5495299646702605, 0.4602895513189023, 0.4021792692753176, 0.3469646150334795, 0.2996454734544951, 0.2716564917890993], 'age_group_acc': [0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813, 0.8038194444444444, 0.8355079217148182, 0.8567826522761262, 0.8717808219178083], 'species_acc': [0.4876411351846201, 0.5852327448120087, 0.662038604849038, 0.7216051594624165, 0.7651544032640417, 0.8050365556579213, 0.

In [18]:
## Plot held-out accuracy against the percentage of time-course validation
## data that was included during training.
plt.rcParams.update({'font.size': 20})

outdir = "Results_Paper/"
savedir_main = outdir + "Trian_Val_Inc/"
for folder in (outdir, savedir_main):
    build_folder(folder, False)

val_results = load_obj('Validation_Results_Dict', savedir_main)
age_group_acc = val_results["age_group_acc"]
species_acc = val_results["species_acc"]
print(age_group_acc)

## Ten x positions; the last stored result (the 50 % run) is left out of the
## plot, hence the [:-1] slices below.
x = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]

## (accuracies, colour, legend label, vertical offset of the value label)
curves = [
    (age_group_acc[:-1], 'b', "Age Group Acc", -0.06),
    (species_acc[:-1], 'g', "Species Acc", 0.06),
]
arrow_style = dict(facecolor='black', width=1, headwidth=4, headlength=4, shrink=0.05)

fig, ax = plt.subplots(figsize=(15,7))
for accuracies, colour, legend_label, _ in curves:
    ax.plot(x, accuracies, lw=2, c=colour, label=legend_label)
ax.legend(loc='upper left')
ax.set_xlabel("Percentage of Time Course Validation Data Included During Training")
ax.set_ylabel("Test Accuracy")
ax.set_xticks([0,5,10,15,20,25,30,35,40,45])
ax.set_ylim([0.2,1])

## Label every point with its accuracy: age group below the curve, species above.
for accuracies, colour, _, text_offset in curves:
    for x_pos, acc in zip(x, accuracies):
        ax.annotate('{:1.2f}'.format(acc), xy=(x_pos, acc), xytext=(x_pos, acc + text_offset),
                    color=colour, arrowprops=dict(arrow_style))

## NOTE(review): the other two plotting cells of this study save to this same
## path, so whichever cell runs last determines the file on disk.
fig.savefig(savedir_main + "Validation_Study.png")
plt.close(fig)

[0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813, 0.8038194444444444, 0.8355079217148182, 0.8567826522761262, 0.8717808219178083, 0.8965517242424395]


In [17]:
## Plot held-out accuracy against the share of the total training set (x values
## given in percent) that the included data makes up.
plt.rcParams.update({'font.size': 20})

outdir = "Results_Paper/"
savedir_main = outdir + "Trian_Val_Inc/"
for folder in (outdir, savedir_main):
    build_folder(folder, False)

val_results = load_obj('Validation_Results_Dict', savedir_main)
age_group_acc = val_results["age_group_acc"]
species_acc = val_results["species_acc"]
print(age_group_acc)

## Ten x positions; the last stored result (x would be 17.9) is left out of the
## plot, hence the [:-1] slices below.
x = [0, 2.2, 4.3, 6.3, 8.3, 10.2, 11.9, 13.6, 15.2, 16.8]

## (accuracies, colour, legend label, vertical offset of the value label)
curves = [
    (age_group_acc[:-1], 'b', "Age Group Acc", -0.06),
    (species_acc[:-1], 'g', "Species Acc", 0.06),
]
arrow_style = dict(facecolor='black', width=1, headwidth=4, headlength=4, shrink=0.05)

fig, ax = plt.subplots(figsize=(15,7))
for accuracies, colour, legend_label, _ in curves:
    ax.plot(x, accuracies, lw=2, c=colour, label=legend_label)
ax.legend(loc='upper left')
ax.set_xlabel("Proportion of Semifeld Data in the Total Training Dataset")
ax.set_ylabel("Test Accuracy")
ax.set_ylim([0.2,1])

## Label every point with its accuracy: age group below the curve, species above.
for accuracies, colour, _, text_offset in curves:
    for x_pos, acc in zip(x, accuracies):
        ax.annotate('{:1.2f}'.format(acc), xy=(x_pos, acc), xytext=(x_pos, acc + text_offset),
                    color=colour, arrowprops=dict(arrow_style))

## NOTE(review): the other two plotting cells of this study save to this same
## path, so whichever cell runs last determines the file on disk.
fig.savefig(savedir_main + "Validation_Study.png")
plt.close(fig)

[0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813, 0.8038194444444444, 0.8355079217148182, 0.8567826522761262, 0.8717808219178083, 0.8965517242424395]


In [23]:
## Plot held-out accuracy against the absolute number of included datapoints
## that were added to the training set.
plt.rcParams.update({'font.size': 20})

outdir = "Results_Paper/"
savedir_main = outdir + "Trian_Val_Inc/"
for folder in (outdir, savedir_main):
    build_folder(folder, False)

val_results = load_obj('Validation_Results_Dict', savedir_main)
age_group_acc = val_results["age_group_acc"]
species_acc = val_results["species_acc"]
print(age_group_acc)

## Ten x positions; the last stored result (x would be 1566) is left out of the
## plot, hence the [:-1] slices below.
x = [0, 162, 324, 486, 654, 815, 973, 1131, 1294, 1452]

## (accuracies, colour, legend label, vertical offset of the value label)
curves = [
    (age_group_acc[:-1], 'b', "Age Group Acc", -0.06),
    (species_acc[:-1], 'g', "Species Acc", 0.06),
]
arrow_style = dict(facecolor='black', width=1, headwidth=4, headlength=4, shrink=0.05)

fig, ax = plt.subplots(figsize=(15,7))
for accuracies, colour, legend_label, _ in curves:
    ax.plot(x, accuracies, lw=2, c=colour, label=legend_label)
ax.legend(loc='upper left')
ax.set_xlabel("Number of Semifeld Datapoints in the Total Training Dataset (+ 7200 Non-Semifield Datapoints)")
ax.set_ylabel("Test Accuracy")
ax.set_ylim([0.2,1])

## Label every point with its accuracy: age group below the curve, species above.
for accuracies, colour, _, text_offset in curves:
    for x_pos, acc in zip(x, accuracies):
        ax.annotate('{:1.2f}'.format(acc), xy=(x_pos, acc), xytext=(x_pos, acc + text_offset),
                    color=colour, arrowprops=dict(arrow_style))

## NOTE(review): the other two plotting cells of this study save to this same
## path, so whichever cell runs last determines the file on disk.
fig.savefig(savedir_main + "Validation_Study.png")
plt.close(fig)

[0.34330180042722, 0.5338683788121991, 0.6352861496782932, 0.6864922966678609, 0.7483797177780325, 0.7806661254404813, 0.8038194444444444, 0.8355079217148182, 0.8567826522761262, 0.8717808219178083, 0.8965517242424395]


## Leave one out TimeCourseValidation
### Validation with Gambiae only + 2 African sites

## Function to create the CNN
This function takes as an input a list of dictionaries. Each element in the list is a new hidden layer in the model. For each layer the dictionary defines the layer to be used.

### Available options are:
Convolutional Layer:
* type = 'c'
* filter = optional number of filters
* kernel = optional size of the filters
* stride = optional size of stride to take between filters
* pooling = optional width of the max pooling
* {'type':'c', 'filter':16, 'kernel':5, 'stride':1, 'pooling':2}

dense layer:
* type = 'd'
* width = optional width of the layer
* {'type':'d', 'width':500}

In [26]:
def create_models(model_shape, input_layerTrian_no_TClab_TCfield_20vf_dim):
    """Build and compile a 1-D CNN with a single 3-class 'age_group' output.

    Parameters
    ----------
    model_shape : list of dict
        One dictionary per hidden layer, applied in order.
        Convolutional layer: ``{'type':'c', 'filter':…, 'kernel':…,
        'stride':…, 'pooling':…}`` builds Conv1D -> BatchNormalization ->
        MaxPooling1D.
        Dense layer: ``{'type':'d', 'width':…}``. As the first layer it
        builds Dense -> BatchNormalization -> Dropout; otherwise it builds
        Dropout -> Dense -> BatchNormalization, preceded by a Flatten when the
        previous layer is convolutional.
    input_layerTrian_no_TClab_TCfield_20vf_dim : int
        Number of input features; the network input has shape
        ``(input_dim, 1)``. The parameter name is kept unchanged so that
        existing calls keep working.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the 'acc' metric and
        SGD with Nesterov momentum.

    Raises
    ------
    ValueError
        If ``model_shape`` is empty or contains an unknown layer type.
    """
    # Use the argument that was passed in. Without this binding the body would
    # read a notebook-level global called input_layer_dim and ignore the argument.
    input_layer_dim = input_layerTrian_no_TClab_TCfield_20vf_dim

    if not model_shape:
        raise ValueError("model_shape must contain at least one layer definition")

    regConst = 0.02
    sgd = keras.optimizers.SGD(lr=0.003, decay=1e-5, momentum=0.9, nesterov=True, clipnorm=1.)
    cce = 'categorical_crossentropy'

    input_vec = Input(name='input', shape=(input_layer_dim,1))

    # xd always holds the output tensor of the most recently added layer.
    xd = input_vec
    for i, layer in enumerate(model_shape):
        layer_type = layer['type']

        if layer_type == 'c':
            xd = Conv1D(name=('Conv'+str(i+1)), filters=layer['filter'],
                        kernel_size=layer['kernel'], strides=layer['stride'],
                        activation='relu',
                        kernel_regularizer=l2(regConst),
                        kernel_initializer='he_normal')(xd)
            xd = BatchNormalization(name=('batchnorm_'+str(i+1)))(xd)
            xd = MaxPooling1D(pool_size=(layer['pooling']))(xd)

        elif layer_type == 'd':
            if i == 0:
                # First-layer dense block: dropout is applied after the
                # batch normalisation rather than before the dense layer.
                xd = Dense(name=('d'+str(i+1)), units=layer['width'], activation='relu',
                           kernel_regularizer=l2(regConst),
                           kernel_initializer='he_normal')(xd)
                xd = BatchNormalization(name=('batchnorm_'+str(i+1)))(xd)
                xd = Dropout(name=('dout'+str(i+1)), rate=0.5)(xd)
            else:
                # Collapse the (steps, filters) conv output before the dense stack.
                if model_shape[i-1]['type'] == 'c':
                    xd = Flatten()(xd)

                xd = Dropout(name=('dout'+str(i+1)), rate=0.5)(xd)
                xd = Dense(name=('d'+str(i+1)), units=layer['width'], activation='relu',
                           kernel_regularizer=l2(regConst),
                           kernel_initializer='he_normal')(xd)
                xd = BatchNormalization(name=('batchnorm_'+str(i+1)))(xd)

        else:
            raise ValueError("Unknown layer type {!r} at position {}; expected 'c' or 'd'"
                             .format(layer_type, i))

    # Single softmax head over the three age groups.
    xAgeGroup = Dense(name='age_group', units=3,
                      activation='softmax',
                      kernel_regularizer=l2(regConst),
                      kernel_initializer='he_normal')(xd)

    model = Model(inputs=input_vec, outputs=[xAgeGroup])

    model.compile(loss=cce, metrics=['acc'],
                  optimizer=sgd)
    return model

## Function to train the model

This function will split the data into training and validation sets and call the create models function. This function returns the model and training history.

In [44]:
def train_models(model_to_test, save_path):
    """Fit one CNN on a 60/40 train/validation split and save model and log.

    ``model_to_test`` is a dictionary whose values are one-element lists; the
    element of each list is the actual setting. ``save_path`` is the path
    prefix used for the CSV training log and the saved ``.h5`` model.

    Returns the fitted model together with the history object from ``fit``.
    """
    # Unwrap every setting in a fixed order. "classes", "outputs",
    # "compile_loss" and "compile_metrics" are read but not used below.
    setting_keys = ("model_shape", "model_name", "input_layer_dim", "model_ver_num",
                    "fold", "labels", "features", "classes", "outputs",
                    "compile_loss", "compile_metrics")
    (model_shape, model_name, input_layer_dim, model_ver_num, fold,
     all_labels, all_features, classes, outputs,
     compile_loss, compile_metrics) = (model_to_test[key][0] for key in setting_keys)

    ## Split into training / validation.
    ## Layout of the result: [X_train, X_val, y0_train, y0_val, y1_train, y1_val, ...]
    split = train_test_split(all_features, *all_labels,
                             test_size=0.4, shuffle=True, random_state=42)
    X_fit, X_val = split[0], split[1]
    y_fit = split[2::2]
    y_val = split[3::2]

    model = create_models(model_shape, input_layer_dim)

    ## Stop once val_loss has not improved for 400 epochs; log each epoch to CSV.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=400, verbose=0, mode='auto')
    csv_logger = CSVLogger(save_path+model_name+"_"+str(model_ver_num)+'.csv',
                           append=True, separator=';')

    history = model.fit(x=X_fit,
                        y=y_fit,
                        batch_size=128*16,
                        verbose=0,
                        epochs=8000,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping, csv_logger])

    model_file = save_path+model_name+"_"+str(model_ver_num)+"_"+str(fold)+"_"+'Model.h5'
    model.save(model_file)
    graph_history(history, model_name, model_ver_num, fold, save_path)

    return model, history

## Leave out Bobo

In [45]:
## Load the tab-separated mosquito table and cut it into the subsets used for
## sampling below.
## NOTE(review): absolute, machine-specific path; consider a configurable data directory.
## The separator is passed by keyword: newer pandas versions no longer accept it
## as a positional argument.
df = pd.read_csv("/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0.dat", sep='\t')

## Age group label: 0 for Age <= 4, 1 for 5-10, 2 for Age > 10
df['AgeGroup'] = np.where(df['Age']>10, 2, np.where(df['Age']>4, 1, 0))

## RearCnd 'VF' rows with Status 'UN' form df_vf; df keeps the rows that are
## neither 'VF' nor 'UN'.
df_vf = df[df['RearCnd']=='VF']
df_vf = df_vf[df_vf['Status']=='UN']
df = df[df['RearCnd']!='VF']
df = df[df['Status']!='UN']

## Subsets are named df_<RearCnd>_<Country>_<Species>:
##   l = 'TL', f = 'TF', vf = 'VF'; t = Country 'T', b = Country 'B',
##   g (as second part) = Country 'S'; a = 'AA', g = 'AG', c = 'AC'.
## RearCnd 'TL' subsets
df_l = df[df['RearCnd']=='TL']
df_l_g = df_l[df_l['Country']=='S']
df_l_g_a = df_l_g[df_l_g['Species']=='AA']
df_l_g_g = df_l_g[df_l_g['Species']=='AG']
df_l_g_c = df_l_g[df_l_g['Species']=='AC']
df_l_t = df_l[df_l['Country']=='T']
df_l_t_a = df_l_t[df_l_t['Species']=='AA']
df_l_t_g = df_l_t[df_l_t['Species']=='AG']
df_l_b = df_l[df_l['Country']=='B']
df_l_b_g = df_l_b[df_l_b['Species']=='AG']
df_l_b_c = df_l_b[df_l_b['Species']=='AC']

## RearCnd 'TF' subsets
df_f = df[df['RearCnd']=='TF']
df_f_t = df_f[df_f['Country']=='T']
df_f_t_a = df_f_t[df_f_t['Species']=='AA']
# df_f_t_g = df_f_t[df_f_t['Species']=='AG'] #There isn't any
df_f_b = df_f[df_f['Country']=='B']
df_f_b_g = df_f_b[df_f_b['Species']=='AG']
df_f_b_c = df_f_b[df_f_b['Species']=='AC']

## RearCnd 'VF' subsets
df_vf_t = df_vf[df_vf['Country']=='T']
df_vf_t_a = df_vf_t[df_vf_t['Species']=='AA']
df_vf_t_g = df_vf_t[df_vf_t['Species']=='AG']
df_vf_b = df_vf[df_vf['Country']=='B']
df_vf_b_g = df_vf_b[df_vf_b['Species']=='AG']
df_vf_b_c = df_vf_b[df_vf_b['Species']=='AC']

## Rows per age group of the last subset (kept for ad-hoc inspection)
age_counts = df_vf_b_c.groupby('AgeGroup').size()


def _split_age_groups(frame, n_include, warn_label=None):
    """Shuffle each age group of ``frame`` and split it into two parts.

    For AgeGroup 0, 1 and 2 in turn the rows are put in a random order
    (numpy's global generator, re-seeded with 42 for every group); the first
    ``n_include`` rows are the "included" part and the rest the "excluded" part.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an 'AgeGroup' column.
    n_include : int
        Number of rows to include per age group.
    warn_label : str or None
        When given, a warning naming this label is printed for every age
        group that has fewer than ``n_include`` rows.

    Returns
    -------
    tuple of lists
        ``(included, excluded)``, each holding one DataFrame per age group.
    """
    included, excluded = [], []
    for age in range(3):
        df_age = frame[frame['AgeGroup']==age]
        order = np.arange(len(df_age))
        if warn_label is not None and len(order) < n_include:
            print('Warning {} VF group {} smaller than amount requested'.format(warn_label, age))
        # Re-seeding the global generator before every shuffle keeps the
        # selection reproducible and independent of the other groups.
        np.random.seed(42)
        np.random.shuffle(order)
        included.append(df_age.iloc[order[:n_include]])
        excluded.append(df_age.iloc[order[n_include:]])
    return included, excluded


## Training set: (source subset, rows per age group, label for the size warning).
## Commented-out entries are subsets that are currently not used for training.
train_sources = [
    # (df_l_t_a, 400, None),
    (df_l_t_g, 400, None),
    (df_l_b_g, 400, None),
    # (df_l_b_c, 400, None),
    # (df_f_t_a, 300, None),
    (df_f_b_g, 300, None),
    # (df_f_b_c, 300, None),
    # (df_vf_t_a, 70, 'Tanzania Arabiensis'),
    (df_vf_t_g, 70, 'Tanzania Gambie'),
]

train_parts = []
for source_frame, n_include, warn_label in train_sources:
    included_parts, _ = _split_age_groups(source_frame, n_include, warn_label)
    train_parts.extend(included_parts)
## Concatenate once instead of growing the frame inside the loops.
df_train = pd.concat(train_parts)

## Test set: every row of df_vf_b_g (nothing is included for training, so the
## whole shuffled subset ends up in the excluded part).
## The currently unused df_vf_b_c subset could be appended in the same way.
_, test_parts = _split_age_groups(df_vf_b_g, 0, 'Bobo Gambie')
df_test = pd.concat(test_parts)

X = df_train.iloc[:,6:-1]
y_age = df_train["Age"]
y_age_groups = df_train["AgeGroup"]
y_species = df_train["Species"]
y_status = df_train["Status"]
X = np.asarray(X)
y_age = np.asarray(y_age)
y_age_groups = np.asarray(y_age_groups)
y_species = np.asarray(y_species)
y_status = np.asarray(y_status)

X_vf = df_test.iloc[:,6:-1]
y_age_vf = df_test["Age"]
y_age_groups_vf = df_test["AgeGroup"]
y_species_vf = df_test["Species"]
y_status_vf = df_test["Status"]
X_vf = np.asarray(X_vf)
y_age_vf = np.asarray(y_age_vf)
y_age_groups_vf = np.asarray(y_age_groups_vf)
y_species_vf = np.asarray(y_species_vf)
y_status_vf = np.asarray(y_status_vf)

In [46]:
# Sanity check: (n_samples, n_features) of the training matrix, then of the held-out matrix.
print(X.shape, X_vf.shape, sep='\n')

(3510, 1625)
(265, 1625)


In [47]:
## Train the age-group CNN on the current training split (X, y_age_groups) and
## evaluate it on the held-out split (X_vf, y_age_groups_vf). Results, plots and
## the training history are written under `savedir`.

## Name a folder for the outputs to go into
outdir = "output_data_update_19_02/"
build_folder(outdir, False)

savedir = (outdir+"Paper_Results/Trian_Val_No_Bobo/")
build_folder(savedir, True)

## Only used by the optional model.evaluate bookkeeping (commented out below)
val_results = {'loss':[], 'age_group_loss':[], 'species_loss':[], 'age_group_acc':[], 'species_acc':[]}
histories = []

start_time = time()

## Input CNN Size: number of feature columns per sample
input_layer_dim = len(X[0])

## Transform Data
## Each label is wrapped in its own list because MultiLabelBinarizer expects one
## collection of labels per sample.
y_age_groups_list = [[age] for age in y_age_groups]
y_species_list = [[species] for species in y_species]
y_age_groups_list_vf = [[age] for age in y_age_groups_vf]
y_species_list_vf = [[species] for species in y_species_vf]

## Fit each binarizer ONCE, on the training labels, and reuse it for the held-out
## labels. Fitting a second binarizer on the held-out labels would give one-hot
## columns that differ in number/meaning whenever the held-out set lacks a class.
age_binarizer = MultiLabelBinarizer().fit(np.array(y_age_groups_list))
age_groups = age_binarizer.transform(np.array(y_age_groups_list))
age_groups_vf = age_binarizer.transform(np.array(y_age_groups_list_vf))
age_group_classes = ["1-4", "5-10", "11-17"]

species_binarizer = MultiLabelBinarizer().fit(np.array(y_species_list))
species = species_binarizer.transform(np.array(y_species_list))
species_vf = species_binarizer.transform(np.array(y_species_list_vf))
species_classes = list(np.unique(y_species_list))

## Labels (single output: age group)
labels_default, classes_default, outputs_default = [age_groups], [age_group_classes], ['xAgeGroup']
labels_default_vf, classes_default_vf, outputs_default_vf = [age_groups_vf], [age_group_classes], ['xAgeGroup']

## Declare and train the model
## 'c' entries are convolutional layers, 'd' is a dense layer (interpreted by train_models)
model_size = [{'type':'c', 'filter':16, 'kernel':8, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':8, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':3, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':6, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':5, 'stride':1, 'pooling':2},
             {'type':'d', 'width':500}]

## Name the model
model_name = ('Valid_Inc_No_Bobo')

## Scale train, test
## NOTE(review): the scaler is fitted on the training AND held-out spectra stacked
## together, so held-out statistics influence the scaling. Left unchanged to keep
## results comparable -- confirm this is intended.
scl = StandardScaler()
features_scl = scl.fit(X=np.vstack((X, X_vf)))
X_train = features_scl.transform(X=X)
X_test = features_scl.transform(X=X_vf)

## Split data into test and train (one label array per model output)
y_train, y_test = list(labels_default), list(labels_default_vf)

## Add a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

model_to_test = {
    "model_shape" : [model_size], # defines the hidden layers of the model
    "model_name"  : [model_name],
    "input_layer_dim"  : [input_layer_dim], # size of input layer
    "model_ver_num"  : [0],
    "fold"  : [0], # kf.split number on
    "labels"   : [y_train],
    "features" : [X_train],
    "classes"  : [classes_default],
    "outputs"   : [outputs_default],
    "compile_loss": [{'age': 'categorical_crossentropy'}],
    "compile_metrics" :[{'age': 'accuracy'}]
}

## Call function to train all the models from the dictionary
model, history = train_models(model_to_test, savedir)
histories.append(history)

## The model has a single output, so predict() returns one array; wrap it in a
## list so predictions and labels are both indexed per output.
y_predicted = model.predict(X_test)
y_predicted = [y_predicted]
# temp_eval = model.evaluate(X_test, y_test)
# for metric, res in zip(model.metrics_names, temp_eval):
#     val_results[metric].append(res)
# print(val_results)

## One nested list per model output
predicted_labels = [output_pred.tolist() for output_pred in y_predicted]
true_labels = [output_true.tolist() for output_true in y_test]

## Visualize the results
visualize(histories, savedir, model_name, "0", classes_default, outputs_default, predicted_labels, true_labels, title='Prediction on Bobo Validation Field')

# Clear the Keras session, otherwise it will keep adding new
# models to the same TensorFlow graph each time we create
# a model with a different set of hyper-parameters.
K.clear_session()

# Delete the Keras model with these hyper-parameters from memory.
del model

end_time = time()
print('Run time : {} s'.format(end_time-start_time))
print('Run time : {} m'.format((end_time-start_time)/60))
print('Run time : {} h'.format((end_time-start_time)/3600))

save_obj(history.history, 'Validation_Results_History', savedir)

Run time : 1511.5541138648987 s
Run time : 25.192568564414977 m
Run time : 0.41987614274024965 h


## Leave out Ifakara

In [48]:
## Build the train / held-out split for the "leave out Ifakara" experiment.
## Produces: df_train, df_test and the numpy arrays X, y_* (train) and X_vf, y_*_vf (held-out).

def _split_by_age_group(df_group, size_inc, label=None, n_age_groups=3, seed=42):
    """Split one data subset into included / not-included rows, per age group.

    For every AgeGroup value in range(n_age_groups) the rows of that group are
    shuffled with a freshly re-seeded global NumPy RNG; the first `size_inc`
    shuffled rows are "included" and the remainder are "not included".

    Args:
        df_group: DataFrame with an 'AgeGroup' column.
        size_inc: maximum number of rows to include per age group.
        label: optional subset name; when given, a warning is printed for any
            age group that holds fewer than `size_inc` rows.
        n_age_groups: number of age groups (values 0 .. n_age_groups-1).
        seed: seed passed to np.random.seed before every shuffle.

    Returns:
        (df_included, df_not_included), each concatenated in age-group order.
    """
    included, not_included = [], []
    for age in range(n_age_groups):
        df_age = df_group[df_group['AgeGroup']==age]
        if label is not None and len(df_age) < size_inc:
            print('Warning {} group {} smaller than amount requested'.format(label, age))
        order = np.arange(len(df_age))
        # Re-seeding before each shuffle makes the selection reproducible and
        # independent of whatever consumed the global RNG earlier.
        np.random.seed(seed)
        np.random.shuffle(order)
        included.append(df_age.iloc[order[:size_inc]])
        not_included.append(df_age.iloc[order[size_inc:]])
    return pd.concat(included), pd.concat(not_included)


## NOTE(review): absolute, machine-specific path -- the file must exist at this location.
## `sep` is passed by keyword: a positional separator is rejected by recent pandas.
df = pd.read_csv("/home/josh/Documents/Mosquito_Project/New_Data/Data/MIMIdata_update_19_02/mosquitoes_country_LM_5_0.dat", sep='\t')

## Age groups: 0 -> Age <= 4, 1 -> 4 < Age <= 10, 2 -> Age > 10
df['AgeGroup'] = np.where(df['Age']>10, 2, np.where(df['Age']>4, 1, 0))

## 'VF' rows with Status 'UN' form the validation-field pool; everything that is
## neither 'VF' nor 'UN' stays in `df`.
df_vf = df[(df['RearCnd']=='VF') & (df['Status']=='UN')]
df = df[(df['RearCnd']!='VF') & (df['Status']!='UN')]

## Subsets by rearing condition / country / species.
## Species codes: AA = arabiensis, AG = gambiae, AC = coluzzii (naming follows the
## warning labels used in this notebook). Country 'T' = Tanzania, 'B' = Bobo;
## 'S' is presumably the third site -- TODO confirm.
df_l = df[df['RearCnd']=='TL']
df_l_g = df_l[df_l['Country']=='S']
df_l_g_a = df_l_g[df_l_g['Species']=='AA']
df_l_g_g = df_l_g[df_l_g['Species']=='AG']
df_l_g_c = df_l_g[df_l_g['Species']=='AC']
df_l_t = df_l[df_l['Country']=='T']
df_l_t_a = df_l_t[df_l_t['Species']=='AA']
df_l_t_g = df_l_t[df_l_t['Species']=='AG']
df_l_b = df_l[df_l['Country']=='B']
df_l_b_g = df_l_b[df_l_b['Species']=='AG']
df_l_b_c = df_l_b[df_l_b['Species']=='AC']
df_f = df[df['RearCnd']=='TF']
df_f_t = df_f[df_f['Country']=='T']
df_f_t_a = df_f_t[df_f_t['Species']=='AA']
# df_f_t_g = df_f_t[df_f_t['Species']=='AG'] #There isn't any
df_f_b = df_f[df_f['Country']=='B']
df_f_b_g = df_f_b[df_f_b['Species']=='AG']
df_f_b_c = df_f_b[df_f_b['Species']=='AC']
df_vf_t = df_vf[df_vf['Country']=='T']
df_vf_t_a = df_vf_t[df_vf_t['Species']=='AA']
df_vf_t_g = df_vf_t[df_vf_t['Species']=='AG']
df_vf_b = df_vf[df_vf['Country']=='B']
df_vf_b_g = df_vf_b[df_vf_b['Species']=='AG']
df_vf_b_c = df_vf_b[df_vf_b['Species']=='AC']

## Rows taken per age group from each source
size_inc_lab = 400       # 'TL' subsets
size_inc_field = 300     # 'TF' subsets
size_inc_vf_train = 70   # Bobo 'VF' rows added to training

## Only the gambiae (AG) subsets are used in this experiment.
## The call order is kept fixed so the global RNG ends in the same state.
train_parts = [
    _split_by_age_group(df_l_t_g, size_inc_lab)[0],
    _split_by_age_group(df_l_b_g, size_inc_lab)[0],
    _split_by_age_group(df_f_b_g, size_inc_field)[0],
]

## Held-out set: every Tanzania gambiae 'VF' row (nothing from it goes into training).
df_test = _split_by_age_group(df_vf_t_g, 0, label='Tanzania Gambie VF')[1]

## A small amount of Bobo 'VF' data is included in training.
train_parts.append(_split_by_age_group(df_vf_b_g, size_inc_vf_train, label='Bobo Gambie VF')[0])

df_train = pd.concat(train_parts)

## Feature columns are everything from column 6 up to (excluding) the last
## column, which is the 'AgeGroup' column appended above.
X = np.asarray(df_train.iloc[:,6:-1])
y_age = np.asarray(df_train["Age"])
y_age_groups = np.asarray(df_train["AgeGroup"])
y_species = np.asarray(df_train["Species"])
y_status = np.asarray(df_train["Status"])

X_vf = np.asarray(df_test.iloc[:,6:-1])
y_age_vf = np.asarray(df_test["Age"])
y_age_groups_vf = np.asarray(df_test["AgeGroup"])
y_species_vf = np.asarray(df_test["Species"])
y_status_vf = np.asarray(df_test["Status"])



In [49]:
## Train the age-group CNN on the current training split (X, y_age_groups) and
## evaluate it on the held-out split (X_vf, y_age_groups_vf). Results, plots and
## the training history are written under `savedir`.

## Name a folder for the outputs to go into
outdir = "output_data_update_19_02/"
build_folder(outdir, False)

savedir = (outdir+"Paper_Results/Trian_Val_No_Ifakara/")
build_folder(savedir, True)

## Only used by the optional model.evaluate bookkeeping (commented out below)
val_results = {'loss':[], 'age_group_loss':[], 'species_loss':[], 'age_group_acc':[], 'species_acc':[]}
histories = []

start_time = time()

## Input CNN Size: number of feature columns per sample
input_layer_dim = len(X[0])

## Transform Data
## Each label is wrapped in its own list because MultiLabelBinarizer expects one
## collection of labels per sample.
y_age_groups_list = [[age] for age in y_age_groups]
y_species_list = [[species] for species in y_species]
y_age_groups_list_vf = [[age] for age in y_age_groups_vf]
y_species_list_vf = [[species] for species in y_species_vf]

## Fit each binarizer ONCE, on the training labels, and reuse it for the held-out
## labels. Fitting a second binarizer on the held-out labels would give one-hot
## columns that differ in number/meaning whenever the held-out set lacks a class.
age_binarizer = MultiLabelBinarizer().fit(np.array(y_age_groups_list))
age_groups = age_binarizer.transform(np.array(y_age_groups_list))
age_groups_vf = age_binarizer.transform(np.array(y_age_groups_list_vf))
age_group_classes = ["1-4", "5-10", "11-17"]

species_binarizer = MultiLabelBinarizer().fit(np.array(y_species_list))
species = species_binarizer.transform(np.array(y_species_list))
species_vf = species_binarizer.transform(np.array(y_species_list_vf))
species_classes = list(np.unique(y_species_list))

## Labels (single output: age group)
labels_default, classes_default, outputs_default = [age_groups], [age_group_classes], ['xAgeGroup']
labels_default_vf, classes_default_vf, outputs_default_vf = [age_groups_vf], [age_group_classes], ['xAgeGroup']

## Declare and train the model
## 'c' entries are convolutional layers, 'd' is a dense layer (interpreted by train_models)
model_size = [{'type':'c', 'filter':16, 'kernel':8, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':8, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':3, 'stride':1, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':6, 'stride':2, 'pooling':1},
             {'type':'c', 'filter':16, 'kernel':5, 'stride':1, 'pooling':2},
             {'type':'d', 'width':500}]

## Name the model
model_name = ('Valid_Inc_No_Ifakara')

## Scale train, test
## NOTE(review): the scaler is fitted on the training AND held-out spectra stacked
## together, so held-out statistics influence the scaling. Left unchanged to keep
## results comparable -- confirm this is intended.
scl = StandardScaler()
features_scl = scl.fit(X=np.vstack((X, X_vf)))
X_train = features_scl.transform(X=X)
X_test = features_scl.transform(X=X_vf)

## Split data into test and train (one label array per model output)
y_train, y_test = list(labels_default), list(labels_default_vf)

## Add a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

model_to_test = {
    "model_shape" : [model_size], # defines the hidden layers of the model
    "model_name"  : [model_name],
    "input_layer_dim"  : [input_layer_dim], # size of input layer
    "model_ver_num"  : [0],
    "fold"  : [0], # kf.split number on
    "labels"   : [y_train],
    "features" : [X_train],
    "classes"  : [classes_default],
    "outputs"   : [outputs_default],
    "compile_loss": [{'age': 'categorical_crossentropy'}],
    "compile_metrics" :[{'age': 'accuracy'}]
}

## Call function to train all the models from the dictionary
model, history = train_models(model_to_test, savedir)
histories.append(history)

## The model has a single output, so predict() returns one array; wrap it in a
## list so predictions and labels are both indexed per output.
y_predicted = model.predict(X_test)
y_predicted = [y_predicted]
# temp_eval = model.evaluate(X_test, y_test)
# for metric, res in zip(model.metrics_names, temp_eval):
#     val_results[metric].append(res)
# print(val_results)

## One nested list per model output
predicted_labels = [output_pred.tolist() for output_pred in y_predicted]
true_labels = [output_true.tolist() for output_true in y_test]

## Visualize the results
visualize(histories, savedir, model_name, "0", classes_default, outputs_default, predicted_labels, true_labels, title='Prediction on Ifakara Validation Field')

# Clear the Keras session, otherwise it will keep adding new
# models to the same TensorFlow graph each time we create
# a model with a different set of hyper-parameters.
K.clear_session()

# Delete the Keras model with these hyper-parameters from memory.
del model

end_time = time()
print('Run time : {} s'.format(end_time-start_time))
print('Run time : {} m'.format((end_time-start_time)/60))
print('Run time : {} h'.format((end_time-start_time)/3600))

save_obj(history.history, 'Validation_Results_History', savedir)

Run time : 1353.243168592453 s
Run time : 22.554052809874218 m
Run time : 0.3759008801645703 h


## Preparing a CSV file with confusion matrices for power analysis

In [10]:
# cms = []
# folders = ['valid_data_inc_0', 'valid_data_inc_0.05', 'valid_data_inc_0.1', 'valid_data_inc_0.15', 'valid_data_inc_0.2', 'valid_data_inc_0.25', 'valid_data_inc_0.3', 'valid_data_inc_0.35', 'valid_data_inc_0.4', 'valid_data_inc_0.45', 'valid_data_inc_0.5']
# for folder in folders:
#     cm = np.load(('Results/Trian_Val_Inc_V2/'+folder+'/Confusion_Matrix_data.npy'))
#     cms.append(cm.reshape(-1))
# print(np.array(cms))
# np.savetxt('Results/Trian_Val_Inc_V2/Confusion_Matricies.csv', cms, delimiter=',')

[[0.98365527 0.         0.01634473 0.         0.0483871  0.9516129
  0.82590612 0.00713012 0.16696376]
 [0.72435395 0.         0.27564605 0.         0.11440678 0.88559322
  0.52309613 0.00873908 0.46816479]
 [0.77069536 0.         0.22930464 0.         0.125      0.875
  0.42800789 0.01314924 0.55884287]
 [0.81474978 0.         0.18525022 0.         0.24056604 0.75943396
  0.41527778 0.01875    0.56597222]
 [0.78317757 0.         0.21682243 0.         0.30456853 0.69543147
  0.33480826 0.01327434 0.6519174 ]
 [0.83316683 0.         0.16683317 0.         0.3655914  0.6344086
  0.33176471 0.01098039 0.6572549 ]
 [0.87982833 0.         0.12017167 0.         0.30898876 0.69101124
  0.30653266 0.00586265 0.68760469]
 [0.85863268 0.         0.14136732 0.         0.31176471 0.68823529
  0.24079066 0.00808625 0.75112309]
 [0.84130982 0.         0.15869018 0.         0.50625    0.49375
  0.20991254 0.00874636 0.78134111]
 [0.88965517 0.         0.11034483 0.         0.61184211 0.38815789
  0.19

In [13]:
## Hard-coded age-group confusion-matrix values, saved as CSV for the power analysis.
## 11 rows x 9 values; each row is presumably one flattened 3x3 confusion matrix
## (one per validation-inclusion setting) -- TODO confirm against the source results.
age_cms = np.array([
    [0.24, 0.13, 0.63, 0.26, 0.16, 0.58, 0.31, 0.09, 0.60],
    [0.40, 0.12, 0.48, 0.21, 0.33, 0.46, 0.20, 0.15, 0.65],
    [0.49, 0.09, 0.42, 0.11, 0.40, 0.50, 0.07, 0.14, 0.79],
    [0.56, 0.08, 0.36, 0.09, 0.53, 0.38, 0.07, 0.18, 0.75],
    [0.57, 0.12, 0.31, 0.09, 0.61, 0.30, 0.09, 0.18, 0.73],
    [0.55, 0.07, 0.38, 0.08, 0.62, 0.30, 0.08, 0.13, 0.78],
    [0.52, 0.08, 0.40, 0.07, 0.67, 0.26, 0.04, 0.15, 0.81],
    [0.51, 0.07, 0.41, 0.06, 0.65, 0.29, 0.04, 0.07, 0.89],
    [0.63, 0.08, 0.29, 0.05, 0.72, 0.23, 0.05, 0.10, 0.85],
    [0.59, 0.06, 0.34, 0.05, 0.77, 0.19, 0.03, 0.10, 0.86],
    [0.57, 0.04, 0.39, 0.05, 0.76, 0.19, 0.03, 0.08, 0.89],
])

age_cms_path = 'Results/Trian_Val_Inc_V2/Confusion_Matricies.csv'
## np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(age_cms_path), exist_ok=True)
np.savetxt(age_cms_path, age_cms, delimiter=',')