In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import save_model
from joblib import dump  # To save the TF-IDF vectorizer

# 1. Read Data
# Assuming the dataset is in a CSV file named 'gender_data.csv'
data = pd.read_csv('names.csv')  # Update with your actual file path

# 2. Preprocess Data
# Map Gender to binary values: 'M' -> 1 (Male), 'F' -> 0 (Female)
data['Gender'] = data['Gender'].map({'M': 1, 'F': 0})

# Optional: If you want to use 'Probability' or 'Count' as weights, you could do so here
# For simplicity, we'll stick to just 'Name' and 'Gender'
# Example (commented out): weights = data['Probability'].values

# 3. Convert text data into numerical data using TF-IDF
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X = tfidf.fit_transform(data['Name']).toarray()  # Convert names into numerical features
y = data['Gender'].values  # Labels: 1 for Male, 0 for Female

# 4. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build the Neural Network Model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))  # Add dropout to prevent overfitting
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification

# 6. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 7. Train the model with epochs
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# 8. Save the model after training
model.save('gender_prediction_model.h5')

# 9. Save the TF-IDF vectorizer
dump(tfidf, 'tfidf_vectorizer.joblib')

# 10. Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary output
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
E0000 00:00:1742138377.419535  454309 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1742138377.440525  454309 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 25ms/step - accuracy: 0.7188 - loss: 0.5429 - val_accuracy: 0.7870 - val_loss: 0.4548
Epoch 2/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 26ms/step - accuracy: 0.8026 - loss: 0.4349 - val_accuracy: 0.7916 - val_loss: 0.4469
Epoch 3/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 26ms/step - accuracy: 0.8177 - loss: 0.4111 - val_accuracy: 0.7946 - val_loss: 0.4418
Epoch 4/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 25ms/step - accuracy: 0.8301 - loss: 0.3832 - val_accuracy: 0.7970 - val_loss: 0.4407
Epoch 5/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 26ms/step - accuracy: 0.8385 - loss: 0.3680 - val_accuracy: 0.7956 - val_loss: 0.4484
Epoch 6/50
[1m2946/2946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.8447 - loss: 0.3498 - val_accuracy: 0.7934 - val_loss: 0.4562
Epoc



[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step
Model Accuracy: 75.70%
