In [1]:
import os
import random
import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense

import cv2


In [29]:
# Definir la ruta del dataset
DATASET_PATH = r"C:\Users\javid\OneDrive\Escritorio\Javidev\Equipo_Manitas\src\data\asl_dataset"

In [3]:
# Obtener las clases disponibles (carpetas dentro del dataset)
clases = sorted(os.listdir(DATASET_PATH))  # ['0', '1', ..., '9', 'A', 'B', ..., 'Z']
num_clases = len(clases)

X = []
y = []

# Cargar imágenes y convertirlas a arrays
for label, clase in enumerate(clases):
    folder_path = os.path.join(DATASET_PATH, clase)
    for img_name in os.listdir(folder_path):
        img_path = os.path.join(folder_path, img_name)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Convertimos a escala de grises
        img = cv2.resize(img, (64, 64))  # Redimensionamos a 64x64 píxeles
        X.append(img.flatten())  # Convertimos la imagen en un vector
        y.append(label)  # Guardamos la etiqueta de la clase

# Convertir a arrays de numpy
X = np.array(X)
y = np.array(y)

# Mezclar los datos aleatoriamente
indices = np.arange(len(X))
np.random.shuffle(indices)

# Aplicar el desorden a X e y
X = X[indices]
y = y[indices]

# Dividir en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Guardar los datos para uso futuro
np.save("asl_features_train.npy", X_train)
np.save("asl_labels_train.npy", y_train)
np.save("asl_features_test.npy", X_test)
np.save("asl_labels_test.npy", y_test)

print(f"Datos cargados y mezclados: {X.shape[0]} imágenes, {num_clases} clases")
print(f"Train: {X_train.shape[0]} imágenes | Test: {X_test.shape[0]} imágenes")


Datos cargados y mezclados: 2515 imágenes, 36 clases
Train: 2012 imágenes | Test: 503 imágenes


In [4]:
X_train.shape

(2012, 4096)

In [5]:
X_test.shape

(503, 4096)

In [6]:
lr = LinearRegression()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [8]:
lasso = Lasso()


pipe = Pipeline([('Linear', LinearRegression())])

pipe2 = Pipeline([('Standard Scaler', StandardScaler())])

In [9]:


def ejecutar(pipe, X_train, X_test, y_train, y_test):
    pipe.fit(X_train, y_train) 
    pred = pipe.predict(X_test)  
    
    # Calcula y muestra el error cuadrático medio
    mse = mean_squared_error(y_test, pred)
    print(f"MSE = {mse}")

    # Validación cruzada usando el propio pipeline
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
    print(f"Cross-validation MSE: {-cv_scores.mean()}") 


In [10]:
def ejecutar2(pipe, X_train, X_test, y_train, y_test):
    pipe.fit(X_train, y_train)
    
    # Mostrar solo una parte de X_train para evitar impresión masiva
    if hasattr(X_train, "head"):  # Si es un DataFrame de pandas
        print(X_train.head())
    else:  # Si es un array de NumPy
        print(X_train[:5])


In [11]:
ejecutar(pipe, X_train, X_test, y_train, y_test)

MSE = 64.18377153093664
Cross-validation MSE: 64.19807772030184


In [12]:
ejecutar2(pipe2, X_train, X_test, y_train, y_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
pipe2.fit(X_train, y_train)

In [14]:
pipe

In [15]:
model = keras.Sequential([ 
    keras.layers.Dense(128, activation='relu'),  
    keras.layers.Dense(64, activation='relu'),   
    keras.layers.Dense(36, activation='softmax') # Capa de salida con 10 neuronas (10 clases)
])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [17]:
# pipeline = Pipeline([('model', model)])

In [18]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [19]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),  
    ("kmeans", KMeans(n_clusters=10, random_state=42)),  
    ('clf', RandomForestClassifier(random_state=42))])

param_grid = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10]
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [21]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])


In [22]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


model.fit(X_train, y_train, epochs=36, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.1190 - loss: 95.7215 - val_accuracy: 0.2008 - val_loss: 3.2124
Epoch 2/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.2736 - loss: 2.9942 - val_accuracy: 0.3459 - val_loss: 2.3948
Epoch 3/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.3717 - loss: 2.2778 - val_accuracy: 0.4155 - val_loss: 2.0003
Epoch 4/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4056 - loss: 2.0620 - val_accuracy: 0.4672 - val_loss: 1.9822
Epoch 5/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4867 - loss: 1.7420 - val_accuracy: 0.5249 - val_loss: 1.6339
Epoch 6/36
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5618 - loss: 1.4886 - val_accuracy: 0.6103 - val_loss: 1.4662
Epoch 7/36
[1m63/63[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x22449f26570>

In [23]:


# # Ejemplo de KMeans con 3 clusters
# kmeans = KMeans(n_clusters=3, random_state=42)
# kmeans.fit(X)
# labels = kmeans.predict(X)


In [24]:


# # OneHotEncoder con manejo de categorías nuevas en test
# encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# X_train_encoded = encoder.fit_transform(X_train)
# X_test_encoded = encoder.transform(X_test)


In [25]:



# # Definir modelo y parámetros
# model = RandomForestClassifier(random_state=42)
# param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}

# # GridSearchCV
# grid_search = GridSearchCV(model, param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# Mejor modelo
best_model = grid_search.best_estimator_
print("Mejores parámetros:", grid_search.best_params_)


Mejores parámetros: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 50}


In [26]:
save_dir = ("../models")

In [28]:
model_path = os.path.join(save_dir, "random_forest_model_v3.pkl")
joblib.dump(grid_search, model_path)
print(f"Modelo guardado en: {model_path}")

Modelo guardado en: ../models\random_forest_model_v3.pkl


In [None]:
# loaded_model = joblib.load(model_path)
# print("Modelo cargado correctamente")