# Proyecto ML

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2 as cv
import os
import shutil
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, MaxPool2D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.applications.xception import preprocess_input
import splitfolders
%matplotlib inline

In [2]:
print('GPU name: ', tf.config.experimental.list_physical_devices('GPU'))

GPU name:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
PATH = r'C:\Users\batch-pc\Documents\Data science ft\ds_ft_sep_22\3-Machine_Learning\Proyecto_ML\Data\book-covers'
filenames = os.listdir(PATH)
categories = []
for filename in filenames:
    category = filename
    categories.append(category)

In [4]:
categories

['Art-Photography',
 'Biography',
 'Business-Finance-Law',
 'Childrens-Books',
 'Computing',
 'Crafts-Hobbies',
 'Crime-Thriller',
 'Dictionaries-Languages',
 'Entertainment',
 'Food-Drink',
 'Graphic-Novels-Anime-Manga',
 'Health',
 'History-Archaeology',
 'Home-Garden',
 'Humour',
 'Medical',
 'Mind-Body-Spirit',
 'Natural-History',
 'Personal-Development',
 'Poetry-Drama',
 'Reference',
 'Religion',
 'Romance',
 'Science-Fiction-Fantasy-Horror',
 'Science-Geography',
 'Society-Social-Sciences',
 'Sport',
 'Stationery',
 'Teaching-Resources-Education',
 'Technology-Engineering',
 'Teen-Young-Adult',
 'Transport',
 'Travel-Holiday-Guides']

In [5]:
categories_names_label = {class_name:i for i ,class_name in enumerate(categories)}
categories_names_label

{'Art-Photography': 0,
 'Biography': 1,
 'Business-Finance-Law': 2,
 'Childrens-Books': 3,
 'Computing': 4,
 'Crafts-Hobbies': 5,
 'Crime-Thriller': 6,
 'Dictionaries-Languages': 7,
 'Entertainment': 8,
 'Food-Drink': 9,
 'Graphic-Novels-Anime-Manga': 10,
 'Health': 11,
 'History-Archaeology': 12,
 'Home-Garden': 13,
 'Humour': 14,
 'Medical': 15,
 'Mind-Body-Spirit': 16,
 'Natural-History': 17,
 'Personal-Development': 18,
 'Poetry-Drama': 19,
 'Reference': 20,
 'Religion': 21,
 'Romance': 22,
 'Science-Fiction-Fantasy-Horror': 23,
 'Science-Geography': 24,
 'Society-Social-Sciences': 25,
 'Sport': 26,
 'Stationery': 27,
 'Teaching-Resources-Education': 28,
 'Technology-Engineering': 29,
 'Teen-Young-Adult': 30,
 'Transport': 31,
 'Travel-Holiday-Guides': 32}

In [11]:
# Split with a ratio.
# To only split into training and validation set, set a tuple to `ratio`, i.e, `(.8, .2)`.
splitfolders.ratio(r".\Data\book-covers", output=r".\Data\train_test",
    seed=1337, ratio=(.8, .1, .1), group_prefix=None, move=False) # default values

Copying files: 32614 files [06:21, 85.49 files/s] 


In [2]:
import splitfolders

In [4]:
splitfolders.copy_files??

[1;31mSignature:[0m [0msplitfolders[0m[1;33m.[0m[0mcopy_files[0m[1;33m([0m[0mfiles_type[0m[1;33m,[0m [0mclass_dir[0m[1;33m,[0m [0moutput[0m[1;33m,[0m [0mprog_bar[0m[1;33m,[0m [0mmove[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0mcopy_files[0m[1;33m([0m[0mfiles_type[0m[1;33m,[0m [0mclass_dir[0m[1;33m,[0m [0moutput[0m[1;33m,[0m [0mprog_bar[0m[1;33m,[0m [0mmove[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    Copies the files from the input folder to the output folder
    """[0m[1;33m
[0m[1;33m
[0m    [0mcopy_fun[0m [1;33m=[0m [0mshutil[0m[1;33m.[0m[0mmove[0m [1;32mif[0m [0mmove[0m [1;32melse[0m [0mshutil[0m[1;33m.[0m[0mcopy2[0m[1;33m
[0m[1;33m
[0m    [1;31m# get the last part within the file[0m[1;33m
[0m    [0mclass_name[0m [1;33m=[0m [0mpath[0m[1;33m.[0m[0msplit[0m[1;33m([0m[0mclass_dir[0m[1;33m)[0m[1;33m[[0m[1;36m1[0m[1;33m][0m[1;33m
[0

In [6]:
image_size = 224
batch_size = 64
epochs = 20

train_datagen = ImageDataGenerator(
                                    rescale=1./255,
                                    shear_range=0.2,
                                    zoom_range=0.2,
                                    horizontal_flip=True
                                  )

train_generator = train_datagen.flow_from_directory('Data/train_test/train/', class_mode='categorical', 
                                                    batch_size = batch_size, target_size=(image_size,image_size), 
                                                    shuffle=True, seed=42)
validation_generator = train_datagen.flow_from_directory('Data/train_test/val/', class_mode='categorical',
                                                        batch_size = batch_size, target_size=(image_size,image_size),
                                                        shuffle=True, seed=42)
test_datagen = ImageDataGenerator(rescale=1./255)

Found 26050 images belonging to 33 classes.
Found 3249 images belonging to 33 classes.


In [27]:
xception_model = tf.keras.applications.xception.Xception(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=(224, 224, 3),
    pooling='avg',
    classes=33,
    classifier_activation='softmax'
    )
tf.random.set_seed(73)
model = Sequential()
model.add(xception_model)
model.add(Flatten()) 
model.add(Dense(units=2048, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(units=1024, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(33, activation='softmax'))
model.layers[0].trainable=False
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 xception (Functional)       (None, 2048)              20861480  
                                                                 
 flatten_4 (Flatten)         (None, 2048)              0         
                                                                 
 dense_16 (Dense)            (None, 2048)              4196352   
                                                                 
 dropout_12 (Dropout)        (None, 2048)              0         
                                                                 
 dense_17 (Dense)            (None, 1024)              2098176   
                                                                 
 dropout_13 (Dropout)        (None, 1024)              0         
                                                                 
 dense_18 (Dense)            (None, 512)              

In [28]:
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])

In [29]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
save_best = ModelCheckpoint(
filepath = 'best_model.hdf5',
verbose=1, save_best_only=True
)

In [30]:
history = model.fit_generator(train_generator, steps_per_epoch=train_generator.samples // batch_size, validation_data = validation_generator,\
                         validation_steps = validation_generator.samples // batch_size, epochs = epochs, callbacks=[save_best,early_stopping], verbose=2)

  history = model.fit_generator(train_generator, steps_per_epoch=train_generator.samples // batch_size, validation_data = validation_generator,\


Epoch 1/20

Epoch 1: val_loss improved from inf to 3.04221, saving model to best_model.hdf5
407/407 - 264s - loss: 3.2305 - accuracy: 0.1215 - val_loss: 3.0422 - val_accuracy: 0.1741 - 264s/epoch - 648ms/step
Epoch 2/20

Epoch 2: val_loss improved from 3.04221 to 2.96013, saving model to best_model.hdf5
407/407 - 262s - loss: 3.0020 - accuracy: 0.1799 - val_loss: 2.9601 - val_accuracy: 0.1822 - 262s/epoch - 644ms/step
Epoch 3/20

Epoch 3: val_loss improved from 2.96013 to 2.91665, saving model to best_model.hdf5
407/407 - 282s - loss: 2.9210 - accuracy: 0.1948 - val_loss: 2.9166 - val_accuracy: 0.2037 - 282s/epoch - 693ms/step
Epoch 4/20

Epoch 4: val_loss improved from 2.91665 to 2.91005, saving model to best_model.hdf5
407/407 - 191s - loss: 2.8551 - accuracy: 0.2087 - val_loss: 2.9101 - val_accuracy: 0.1981 - 191s/epoch - 470ms/step
Epoch 5/20

Epoch 5: val_loss improved from 2.91005 to 2.87361, saving model to best_model.hdf5
407/407 - 228s - loss: 2.7872 - accuracy: 0.2257 - val_l

In [None]:
#sacar el top 3 de categorias y comprobar la accuracy

In [None]:
#hacer un transformer con imagenes y titulos para determinar el genero