In [16]:
# Procesamiento de imágenes
import cv2 as cv
import numpy as np
import os

# Manipulación de datos
import pandas as pd
from sklearn.model_selection import train_test_split

# TensorFlow y Keras
from tensorflow.keras import layers, models, callbacks
import tensorflow.keras.backend as K
import tensorflow as tf

# Visualización
import matplotlib.pyplot as plt


In [17]:
# Definir RMSE (usando TensorFlow directamente)
def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))


In [18]:
# Rutas para los archivos CSV
ruta_train_csv = r'C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\train.csv'
ruta_test_csv = r'C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\test.csv'

# Rutas para las imágenes
ruta_train_img = r'C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\train'
ruta_test_img = r'C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\test'


In [19]:
def cleaning_img(img): 
    b, g, r = cv.split(img)
    blur = cv.GaussianBlur(b, (3,3), cv.BORDER_DEFAULT)
    _, thresh = cv.threshold(blur, 225, 255, cv.THRESH_BINARY)
    thresh = 255 - thresh
    canny = cv.Canny(thresh, 0, 25)
    dilated = cv.dilate(canny, (15,15), iterations=2)
    eroded = cv.erode(dilated, (7,7), iterations=1)
    
    # Eliminar líneas verticales
    vertical_kernel = cv.getStructuringElement(cv.MORPH_RECT, (1, 100))
    detected_lines_vertical = cv.morphologyEx(eroded, cv.MORPH_OPEN, vertical_kernel, iterations=1)
    cnts_vertical, _ = cv.findContours(detected_lines_vertical, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    
    mask_vertical = np.zeros_like(eroded)
    for cnt_vertical in cnts_vertical:
        x, y, w, h = cv.boundingRect(cnt_vertical)
        cv.rectangle(mask_vertical, (x, y), (x + w, y + h), 255, -1)
        
    mask_inverted_vertical = cv.bitwise_not(mask_vertical)
    clean_img = cv.bitwise_and(eroded, eroded, mask=mask_inverted_vertical)
    
    return clean_img, len(cnts_vertical)


In [20]:
def read_data(path):
    X = []
    num_clips_list = []
    for img in os.listdir(path):
        image = cv.imread(os.path.join(path, img))
        if image is not None:
            img_masked, num_clips = cleaning_img(image)
            if img_masked is not None:
                smallimage = cv.resize(img_masked, (96, 96))
                smallimage = smallimage / 255.0  # Normalizar
                X.append(smallimage)
                num_clips_list.append(int(num_clips))
    return np.array(X), np.array(num_clips_list)



In [21]:
# Cargar datasets
df_train = pd.read_csv(ruta_train_csv)
df_test = pd.read_csv(ruta_test_csv)

# Extraer etiquetas
y_train = df_train['clip_count'].astype("float32")

# Leer imágenes
X_train, y_train = read_data(ruta_train_img)
X_test, _ = read_data(ruta_test_img)

# Convertir etiquetas a float32 (¡Este es el cambio crítico!)
y_train = y_train.astype('float32')

In [22]:
# Leer imágenes
X_train, y_train = read_data(ruta_train_img)
X_test, _ = read_data(ruta_test_img)

# Dividir en Train/Validación (80% Train, 20% Val)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (12000, 96, 96), Val: (3000, 96, 96), Test: (5000, 96, 96)


In [25]:
# Usar directamente la métrica RMSE integrada
from keras.metrics import RootMeanSquaredError

# Modelo CNN
model = models.Sequential([
    layers.Flatten(input_shape=(96, 96)), 
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='linear')    
])

# Compilar modelo con RMSE integrada
model.compile(optimizer='adam',
              loss='mean_squared_error', 
              metrics=[RootMeanSquaredError(name='rmse')])



In [26]:
# Convertir etiquetas a float32 (entrenamiento y validación)
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

# Convertir imágenes a float32 si no lo están
X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
X_test = X_test.astype('float32')


In [27]:
# Callbacks (early stopping para evitar sobreajuste)
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Entrenamiento del modelo
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    callbacks=[early_stopping]
)



Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 3.6683 - rmse: 1.8856 - val_loss: 1.5197 - val_rmse: 1.2328
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 1.1799 - rmse: 1.0851 - val_loss: 0.9772 - val_rmse: 0.9885
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.5596 - rmse: 0.7480 - val_loss: 0.8611 - val_rmse: 0.9279
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.3394 - rmse: 0.5825 - val_loss: 0.8278 - val_rmse: 0.9098
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.2235 - rmse: 0.4727 - val_loss: 0.7932 - val_rmse: 0.8906
Epoch 6/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.1604 - rmse: 0.4005 - val_loss: 0.8049 - val_rmse: 0.8972
Epoch 7/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m

In [28]:
# Evaluar el modelo en el conjunto de validación
val_loss, val_rmse = model.evaluate(X_val, y_val)
print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation RMSE: {val_rmse:.4f}')



[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7731 - rmse: 0.8790
Validation Loss: 0.7932
Validation RMSE: 0.8906


In [30]:
# Realizar predicciones
y_pred = model.predict(X_test)
y_pred = y_pred.flatten().astype('float32')

# Redondear las predicciones al entero más cercano
y_pred = np.round(y_pred).astype('int32')


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [31]:
# Crear el archivo de submission con IDs y clip_count redondeados
df_submission = pd.DataFrame({
    'id': df_test['id'],
    'clip_count': y_pred
})

# Guardar archivo
ruta_submission = r'C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\submission.csv'
df_submission.to_csv(ruta_submission, index=False)

print(f"✅ Archivo de submission generado correctamente en: {ruta_submission}")


✅ Archivo de submission generado correctamente en: C:\Users\mikel\OneDrive\Documentos\TB-DS-BIO-23.09.24\REPOSITORIOS\Mikel\clip_count\submission.csv
