In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import sklearn.metrics as m
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model  # Import load_model directly from keras.models
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.utils import to_categorical

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load dataset.
# The CSV location can be overridden with the GENDER_DATASET_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used, so existing runs are unaffected.
DATASET_PATH = os.environ.get(
    "GENDER_DATASET_PATH",
    r"C:\Users\gowtham.veepujerla\Downloads\Text to audio\data\combined_dataset.csv",
)
dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Remove missing values, then duplicate names.
# Order matters: drop_duplicates keeps the first row per name, so if that row
# is incomplete and NaNs are removed afterwards, the name disappears entirely
# even though a complete duplicate row existed. Dropping NaNs first avoids that.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
dataset = dataset.dropna().drop_duplicates(subset='name')

In [None]:
# Peek at the first five cleaned rows
dataset.head(n=5)

In [None]:
import plotly.express as px
# Count rows per gender label and show the shares as a pie chart
gender = dataset['gender'].value_counts()
fig = px.pie(
    dataset,
    names=gender.index,
    values=gender.values,
    title='Distribution of Gender',
)
fig.show()

In [None]:
# Turn the gender column into integer class ids:
# fit learns the distinct labels, transform maps every row to its id
le = LabelEncoder()
le.fit(dataset['gender'])
labels = le.transform(dataset['gender'])

In [None]:
# Character-level encoding of the names: every character becomes an integer id,
# then each sequence is brought to a fixed length of 15 with padding at the end
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(dataset['name'])
sequences = tokenizer.texts_to_sequences(dataset['name'])
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=15,
)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable between runs
feature_train, feature_test, label_train, label_test = train_test_split(
    padded_sequences,
    labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Vocabulary size: one entry per character known to the tokenizer,
# plus one for the reserved index 0
vocab_size = 1 + len(tokenizer.word_index)

# Maximum sequence length: the width (last axis) of the padded sequence matrix
max_sequence_length = padded_sequences.shape[-1]


In [None]:
# Build the model (training happens in the next cell):
# character embedding -> two stacked LSTMs -> dropout -> 2-way classifier
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64))
model.add(LSTM(128, return_sequences=True))  # per-step outputs feed the second LSTM
model.add(LSTM(64))
model.add(Dropout(0.5))
# The targets are one-hot over two mutually exclusive classes, so the output
# should be a single probability distribution (softmax) rather than two
# independent sigmoids that need not sum to 1.
model.add(Dense(2, activation='softmax'))
# Build with the real padded width instead of repeating the literal 15,
# so this stays in sync with the preprocessing cell.
model.build(input_shape=(None, max_sequence_length))
model.summary()

In [None]:
# Compile, then train for 50 epochs on one-hot targets; the held-out split is
# passed as validation data so its loss/accuracy is reported after each epoch
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x=feature_train,
    y=to_categorical(label_train),
    batch_size=1000,
    epochs=50,
    validation_data=(feature_test, to_categorical(label_test)),
)

In [None]:
# Persist both artifacts needed at inference time:
# the trained network (HDF5 file) and the fitted tokenizer (pickle file)
model.save('model.h5')
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the tokenizer and the network from disk.
# pickle.load executes code embedded in the file, so only load files you
# produced yourself.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
model = load_model('model.h5')

In [None]:
# Define input and prediction functions
def preprocess_input(name, maxlen=15):
    """Encode a single name the same way the training data was encoded.

    Parameters
    ----------
    name : str
        The name to encode.
    maxlen : int, optional
        Length to pad the encoded sequence to. Defaults to 15, the value used
        when the training sequences were padded and the model was built; pass
        a different value only for a model trained with a different width.

    Returns
    -------
    The result of ``pad_sequences`` for a one-element batch containing the
    encoded name, padded at the end (``padding='post'``).
    """
    # texts_to_sequences expects a list of texts, hence the one-element list
    sequence = tokenizer.texts_to_sequences([name])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post')
    return padded_sequence

import re

def predict_gender(name):
    """Predict the gender label for a single name.

    The input is rejected with 'Enter a proper name' when, after trimming
    surrounding whitespace, it is not purely alphabetic or when the *whole*
    input is a common non-name word (weekday abbreviation, conjunction,
    preposition, verb, adverb, or a run of sequential-alphabet triples).
    Otherwise it is encoded with ``preprocess_input`` and classified by the
    module-level ``model``.

    Parameters
    ----------
    name : str
        Candidate name, e.g. as typed by a user.

    Returns
    -------
    str
        'Female', 'Male', or 'Enter a proper name'.
    """
    name = name.strip()

    # isalpha() is False for the empty string and for anything containing
    # digits, symbols or whitespace, so no extra digit/symbol patterns are
    # needed below.
    if not name.isalpha():
        return 'Enter a proper name'

    # Words that are clearly not human names
    non_human_patterns = [
        r'(?:abc|def|ghi|jkl|mno|pqr|stu|vwx|yz)+',  # Sequential alphabets
        r'(mon|tue|wed|thu|fri|sat|sun)',  # Days of the week
        r'(and|but|or|for|nor|so|yet)',  # Conjunctions
        r'(at|by|for|in|of|on|to|with)',  # Prepositions
        r'(am|is|are|was|were|be|being|been)',  # Verbs
        r'(very|quite|rather|pretty|fairly|somewhat)',  # Adverbs
    ]
    # fullmatch requires the entire input to be one of these words. re.match
    # only anchors at the start, which wrongly rejected real names that merely
    # begin with such a word ("Andrew", "Isabella", "Tom", "Sunita", ...).
    lowered = name.lower()
    if any(re.fullmatch(pattern, lowered) for pattern in non_human_patterns):
        return 'Enter a proper name'

    # Process input if it's a proper human name
    processed_input = preprocess_input(name)
    prediction = model.predict(processed_input)
    # NOTE(review): assumes class index 0 means female and 1 means male, i.e.
    # that the label encoder assigned ids in that order - confirm via le.classes_.
    predicted_label = np.argmax(prediction)
    return 'Female' if predicted_label == 0 else 'Male'



In [None]:
# Evaluate the model on the held-out split:
# take the highest-scoring class per sample and compare with the true labels
label_pred = model.predict(feature_test).argmax(axis=1)
accuracy = m.accuracy_score(label_test, label_pred)
classification_report = m.classification_report(label_test, label_pred)
confusion_matrix = m.confusion_matrix(label_test, label_pred)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:", classification_report, sep="\n")
print("Confusion Matrix:", confusion_matrix, sep="\n")

In [None]:
# Function to predict gender for multiple user inputs
def predict_multiple_names():
    """Interactively read names and print the predicted gender for each one.

    Keeps prompting until the user types 'exit' (any letter case; surrounding
    whitespace is ignored) or the input stream ends. Each entered name is
    passed to ``predict_gender`` and the result is printed.
    """
    while True:
        try:
            # Strip so that entries like "exit " or " Anna" behave as intended
            name = input("Enter a name (type 'exit' to stop): ").strip()
        except EOFError:
            # Input stream closed (e.g. piped input exhausted): stop cleanly
            break
        if name.lower() == 'exit':
            break
        print(f"The predicted gender for the name '{name}' is: {predict_gender(name)}")

# Predict gender for multiple inputs.
# Starts the interactive prompt loop defined above; the cell keeps running
# (waiting on input) until 'exit' is entered.
predict_multiple_names()