## Importar librerías

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import warnings
import cv2
warnings.filterwarnings(action='ignore')
from tensorflow.keras.applications import VGG16

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D 
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping 
from tensorflow.keras import optimizers

## Definir la arquitectura del modelo
Para este caso se aplicará transfer learning usando el modelo pre-entrenado VGG-16, agregando dos capas fully connected de 1024 neuronas cada una, con una capa de dropout de 0.5 entre ellas. Como se entrena la detección de si un campo del formulario está completo o no, se ha optado por una clasificación binaria; por eso la capa de salida tiene una sola neurona con función de activación sigmoid.

In [2]:
def training_model(x_train, y_train, epochs=50, learning_rate=0.001,
                   momentum=0.9, validation_split=0.1, verbose=True):
    """Fine-tune an ImageNet-pretrained VGG16 as a binary classifier.

    The VGG16 convolutional base (without its original top) is followed by
    Flatten -> Dense(1024, relu) -> Dropout(0.5) -> Dense(1024, relu) and a
    single sigmoid output unit. Every layer is left trainable.

    Args:
        x_train: 4-D array of images; axes 1-3 are passed to VGG16 as
            ``input_shape``.
        y_train: binary labels (0/1), one per image in ``x_train``.
        epochs: number of training epochs.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.
        validation_split: fraction of the training data held out by Keras
            for validation.
        verbose: verbosity flag forwarded to ``model.fit``.

    Returns:
        The compiled and fitted Keras ``Model``.
    """
    # Unpacking fails early if x_train is not 4-D.
    _, height, width, channels = x_train.shape

    # Load the VGG16 base model without its final classification layers.
    base_model = VGG16(weights="imagenet",
                       include_top=False,
                       input_shape=(height, width, channels))

    # Classification head: two 1024-unit dense layers with dropout in between.
    X = base_model.output
    X = Flatten()(X)
    X = Dense(1024, activation="relu")(X)
    X = Dropout(rate=0.5)(X)
    X = Dense(1024, activation="relu")(X)

    # One sigmoid unit: probability that the label is 1.
    output = Dense(1, activation='sigmoid')(X)

    model = Model(base_model.input, output)

    # Full fine-tuning: every layer, including the VGG16 base, is trainable.
    # Slice `model.layers` here to train only part of the network.
    for layer in model.layers:
        layer.trainable = True

    # Stochastic Gradient Descent. `learning_rate` replaces the deprecated
    # `lr` keyword, which newer Keras releases no longer accept.
    model.compile(optimizer=optimizers.SGD(learning_rate=learning_rate,
                                           momentum=momentum),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Per the Keras docs, validation_split takes the held-out samples from
    # the end of x/y, before shuffling.
    model.fit(x_train,
              y_train,
              epochs=epochs,
              validation_split=validation_split,
              verbose=verbose)
    return model

## Definir la dimensión de las imágenes
Tras hacer pruebas de eficiencia y velocidad de entrenamiento, se ha optado por las siguientes dimensiones.

In [3]:
# Target image size in pixels; load_dataset passes (image_width, image_heigth)
# to cv2.resize.
image_heigth, image_width = 56, 128

## Cargar la data

In [4]:
def load_dataset(ruta_data, size=None):
    """Load every JPG image found in a directory into one NumPy array.

    Args:
        ruta_data: directory containing the images (a trailing slash is
            optional).
        size: optional ``(width, height)`` to resize each image to. Defaults
            to the module-level ``(image_width, image_heigth)``.

    Returns:
        ``np.ndarray`` holding the resized images, stacked in the order
        returned by ``os.listdir``.

    Raises:
        ValueError: if a listed image cannot be read by OpenCV.
    """
    if size is None:
        size = (image_width, image_heigth)

    data = []
    for file_name in tqdm(os.listdir(ruta_data)):
        # Same substring filter and listing order as load_ytrain, so images
        # and labels stay aligned row by row.
        if 'jpg' not in file_name:
            continue
        ruta_img = os.path.join(ruta_data, file_name)
        image = cv2.imread(ruta_img)
        if image is None:
            # cv2.imread signals failure by returning None. Skipping the file
            # would misalign images and labels, so fail loudly instead.
            raise ValueError(f"Could not read image: {ruta_img}")
        data.append(cv2.resize(image, size))
    return np.array(data)

In [5]:
def load_ytrain(ruta_data_train, ruta_label_train, column):
    """Build the label frame for the images present in a directory.

    The ids are the part of each file name before the first dot, taken from
    the entries of ``ruta_data_train`` whose name contains ``'jpg'``. The
    label CSV is read as strings, indexed by its ``id`` column and reduced to
    the requested ``column``, converted to integers.

    Returns:
        A single-column DataFrame indexed by ``id``, with rows in directory
        listing order.
    """
    image_ids = []
    for file_name in os.listdir(ruta_data_train):
        if 'jpg' in file_name:
            image_ids.append(file_name.split('.')[0])

    labels = pd.read_csv(
        ruta_label_train,
        keep_default_na=False,
        encoding='utf-8',
        dtype='str',
    )
    selected = labels.set_index('id').loc[image_ids, [column]]
    return selected.astype(int)

In [6]:
# ruta_label_train: CSV with the training labels (passed to load_ytrain).
# ruta_submit: sample submission file (not referenced in the cells shown here).
ruta_label_train, ruta_submit = (
    '../data/output_train.csv',
    '../data/sampleSubmission.csv',
)

## Entrenamiento de Modelos

Se entrenarán cinco modelos para identificar la existencia de cada uno de los campos, es decir, identificar si existe o no sign_1, sign_2, date_day, date_month y date_year, donde 1 representa la existencia del campo y 0 el caso contrario.

In [21]:
# Definir la funcion para guardar los modelos
def save_model_custom(model, path):
    """Persist a model as two files sharing the same base path.

    Writes the architecture returned by ``model.to_json()`` to
    ``<path>.json`` and asks the model to store its weights in ``<path>.h5``.
    """
    architecture_file = f"{path}.json"
    weights_file = f"{path}.h5"

    architecture = model.to_json()
    with open(architecture_file, "w") as handle:
        handle.write(architecture)

    # Weights go to a separate HDF5 file.
    model.save_weights(weights_file)

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import re
# Maps each target column of the label CSV to the sub-directory holding the
# cropped images used to train that target's model.
dict_target = {'sign_1':'firma1','sign_2':'firma2','date_day':'fecha','date_month':'fecha','date_year':'fecha'}
list_y_test_column = []

# Normalised images per directory: the three date targets share the 'fecha'
# directory, so it is read and scaled once instead of three times.
images_by_directory = {}

for column in dict_target:
    print(f'{"#"*40} {column} {"#"*40}')
    ruta_data_train = f'../data/output/image_train_transform/{dict_target[column]}/'
    y_train = load_ytrain(ruta_data_train, ruta_label_train, column)

    if ruta_data_train not in images_by_directory:
        # Scale pixel values from [0, 255] to [0, 1].
        images_by_directory[ruta_data_train] = load_dataset(ruta_data_train) / 255.0
    data_train = images_by_directory[ruta_data_train]

    if dict_target[column] == 'fecha':
        # Date columns are binarised: any non-zero value counts as "present".
        y_train[column] = (y_train[column] != 0).astype(int)
    y_train = y_train.values

    # Images and labels are paired by position, so their counts must match.
    if len(data_train) != len(y_train):
        raise ValueError(
            f'Images ({len(data_train)}) and labels ({len(y_train)}) differ for {column}'
        )

    print(data_train.shape, y_train.shape)
    model = training_model(data_train, y_train)
    save_model_custom(model,f"../models/model_vgg_identificacion_{column}")

######################################## date_year ########################################


100%|██████████| 244/244 [00:00<00:00, 3235.70it/s]
100%|██████████| 108/108 [00:00<00:00, 3348.65it/s]

(243, 56, 128, 3) (243, 1)





Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
