In [12]:
import json

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# Training data: first names and their gender labels.
# The two lists are parallel: generos[i] is the label for nombres[i].
nombres = [
    "Ana", "Pedro", "María", "Juan", "Lucía", "Luis", "Carmen", "Miguel", "Laura", "Jorge",
    "Carlos", "Verónica", "Emmanuel", "Giovanni", "Karla", "Oswaldo", "Mónica", "Jonathan",
    "Teresa", "Mariana", "Christian", "Itzel", "Gilberto", "Estefania", "Pablo", "Montserrat", "Jesus"
]
generos = [
    "Femenino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino",
    "Masculino", "Femenino", "Masculino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino",
    "Femenino", "Femenino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino", "Femenino", "Masculino"
]

# Fail fast if the parallel lists drift out of sync when one of them is edited:
# a silent misalignment would train the model on wrong labels.
assert len(nombres) == len(generos), (
    f"nombres ({len(nombres)}) and generos ({len(generos)}) must have the same length"
)
# The model below is a binary classifier, so exactly these two labels are expected.
assert set(generos) == {"Femenino", "Masculino"}, f"unexpected labels: {set(generos)}"

In [14]:
# Normalise every name to lowercase so that casing does not create
# distinct characters for the tokenizer.
nombres = list(map(str.lower, nombres))

In [15]:
# Character-level tokenization and padding.
# `oov_token` reserves an index for characters never seen during fitting;
# without it, unknown characters in new names are silently dropped from the
# sequence at inference time. The multi-character token cannot collide with
# any single-character entry of the vocabulary.
tokenizer = Tokenizer(char_level=True, oov_token="<unk>")
tokenizer.fit_on_texts(nombres)

# Keep the ragged integer sequences under their own name instead of reusing
# `X` for two different types (list of lists, then a 2-D array).
secuencias = tokenizer.texts_to_sequences(nombres)

# Right-pad with 0 up to the longest name; index 0 is never assigned to a
# character, so it is free to act as the padding value.
X = pad_sequences(secuencias, padding='post')

# Padded length, reused later to pad new names to the same width.
maxlen = X.shape[1]

In [16]:
# Encode the string labels as integer class ids for the binary classifier.
# The fitted encoder is kept so predictions can be mapped back to labels.
encoder = LabelEncoder().fit(generos)
y = encoder.transform(generos)

In [17]:
# Train/validation split.
# With this few samples, a 10% hold-out is only a handful of names; `stratify=y`
# guarantees both classes appear in the validation set, so validation loss and
# accuracy are not computed on a single class by chance.
# NOTE(review): the validation set is still very small, so its metrics are
# noisy; consider k-fold cross-validation or more data before trusting them.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [18]:
# Improved model: character embedding -> bidirectional LSTM -> sigmoid output.

# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # input_dim is vocabulary size + 1 because index 0 is reserved for padding.
    # mask_zero=True makes downstream layers skip the padded (0) timesteps
    # instead of treating padding as real characters.
    # `input_length` is intentionally not passed: it is deprecated in Keras 3,
    # and the sequence length is inferred from the data at fit time.
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, mask_zero=True),
    Bidirectional(LSTM(32)),
    # Single sigmoid unit: probability of the class encoded as 1.
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [19]:
# Train with early stopping: halt once validation loss has not improved for
# 10 consecutive epochs and roll back to the best weights seen.
callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[callback],
    verbose=0,
)


In [20]:
# Score the trained model on the validation split.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
loss, acc = model.evaluate(
    x=X_val,
    y=y_val,
    verbose=0,
)
print(f"Accuracy validación: {acc:.2f}")

Accuracy validación: 1.00


In [21]:
# Overfitting check: flag the model if training accuracy exceeds validation
# accuracy by more than 10 percentage points.
#
# Both accuracies are measured on the model as it stands now rather than read
# from history.history[...][-1]. The last history entry describes the final
# epoch, but training uses EarlyStopping(restore_best_weights=True), so the
# weights kept in `model` can come from an earlier epoch. Evaluating directly
# compares the numbers that belong to the retained model.
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
train_acc = model.evaluate(X_train, y_train, verbose=0)[1]
val_acc = model.evaluate(X_val, y_val, verbose=0)[1]
print("¿Overfitting?:", "Sí" if train_acc - val_acc > 0.1 else "No")

¿Overfitting?: No


In [24]:
# Smoke test on names that are not in the training data.
nuevos = ["Francisco", "Andrea", "Roberto", "Valeria"]

# Apply the same preprocessing as for training: lowercase, character ids,
# right-padding to the training width.
# NOTE(review): pad_sequences truncates from the front by default, so a name
# longer than `maxlen` loses its leading characters — confirm this is intended.
nuevos_proc = tokenizer.texts_to_sequences([n.lower() for n in nuevos])
nuevos_proc = pad_sequences(nuevos_proc, maxlen=maxlen, padding='post')

# verbose=0 keeps the progress bar out of the output, like the fit/evaluate calls.
preds = model.predict(nuevos_proc, verbose=0)

# The model has a single sigmoid unit, so preds has one column; flatten it to
# one probability (of the class encoded as 1) per name.
probs = preds.ravel()

# Map the thresholded class ids back to label strings through the fitted
# encoder instead of hard-coding which label corresponds to which id.
etiquetas = encoder.inverse_transform((probs > 0.5).astype(int))

for nombre, etiqueta, prob in zip(nuevos, etiquetas, probs):
    print(f"{nombre} => {etiqueta} ({prob:.2f})")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 351ms/step
Francisco => Masculino (0.79)
Andrea => Femenino (0.00)
Roberto => Masculino (1.00)
Valeria => Femenino (0.00)


In [23]:
# Save the trained model.
model.save("modelo_nombre_genero_mejorado.keras")

# The network alone cannot be used for inference: whoever loads it also needs
# the character -> index mapping, the padded sequence length, and the order of
# the label classes. Persist them next to the model so the artefact is complete.
preprocesamiento = {
    "word_index": tokenizer.word_index,
    "maxlen": int(maxlen),
    "classes": encoder.classes_.tolist(),
}
with open("modelo_nombre_genero_mejorado.preproc.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps accented characters readable in the file.
    json.dump(preprocesamiento, f, ensure_ascii=False, indent=2)