In [None]:
# Mount Google Drive inside the Colab runtime so the dataset and model folders
# under /content/drive are reachable; force_remount=True asks for a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
# Google Drive folder the training CSV is read from.
datapath = "/content/drive/MyDrive/Colab Notebooks/DATASETS/NAMES/"
# Google Drive folder the fitted tokenizer and trained model are saved to and loaded from.
modelpath = "/content/drive/MyDrive/Colab Notebooks/MODELS/GENDER/"

In [None]:
import nltk
import joblib

import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

import warnings

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation / FutureWarning messages raised by pandas and TensorFlow.
warnings.filterwarnings('ignore')



In [None]:
# Read the labelled names, print the number of rows, then display the
# per-column count of missing values.
training_csv = datapath + 'gender_training040723.csv'
df = pd.read_csv(training_csv)
print(len(df))
df.isna().sum()

508359


Name      0
Gender    0
dtype: int64

In [None]:
# Normalise every name to lowercase ASCII letters only.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, in which case '[^a-zA-Z]' would never
# match and digits, spaces and punctuation would be left in the names.
df['Name'] = df['Name'].str.lower()
df['Name'] = df['Name'].str.replace('[^a-zA-Z]', '', regex=True)

In [None]:
# Re-check for missing values after the name clean-up.
# NOTE(review): a name reduced to an empty string is not NaN, so it would not be
# counted here — TODO confirm no empty names remain.
df.isna().sum()

Name      0
Gender    0
dtype: int64

In [None]:
# Shuffle the rows. A fixed random_state keeps the row order reproducible across
# Restart & Run All, and with it the preview below and everything derived from df.
df = df.sample(frac = 1, random_state = 42)
df.head(6)

Unnamed: 0,Name,Gender
427986,hilaal,male
252681,pulen,male
68297,mini,female
433587,jaysh,male
89012,indira,female
504575,durgashankar,male


In [None]:
# Split every name into overlapping character n-grams (n = 2, i.e. bigrams),
# e.g. "mini" -> ["mi", "in", "ni"]. One list of n-gram strings per name.
n = 2
ngrams = [
    [''.join(chars) for chars in nltk.ngrams(person_name, n)]
    for person_name in df['Name']
]

In [None]:
# Learn the bigram vocabulary, encode each name as a list of integer ids, and
# pad all sequences to the length of the longest one. The padding arguments
# below are the pad_sequences defaults, spelled out for readability.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(ngrams)
sequences = tokenizer.texts_to_sequences(ngrams)
padded_sequences = pad_sequences(
    sequences,
    maxlen=None,
    padding='pre',
    truncating='pre',
)

In [None]:
# Encode the label as 0 (male) / 1 (female) for the single sigmoid output.
target = df['Gender'].map({'male': 0, 'female': 1})

# Series.map yields NaN for any label missing from the mapping, and NaN targets
# would silently turn the training loss into NaN. Fail loudly instead.
if target.isna().any():
    unexpected = sorted(df.loc[target.isna(), 'Gender'].astype(str).unique())
    raise ValueError(f"Unmapped Gender values: {unexpected}")

In [None]:
# Hold out 20% of the encoded names; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stop once val_loss has failed to improve for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when training stops early; without it the model keeps the weights of
# the last epoch run, which is `patience` epochs past the best one.
earlystop_callback = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# Character-bigram classifier: embedding -> two stacked bidirectional LSTMs ->
# dense head with one sigmoid unit (trained against the 0/1 gender target).
vocab_size = len(tokenizer.word_index) + 1  # one extra slot for the padding id 0
sequence_length = padded_sequences.shape[1]

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=sequence_length),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [None]:
# Train for up to 100 epochs with early stopping. The 20% hold-out split is
# passed as validation data, so it is what the early-stopping callback monitors.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=1000,
    callbacks=[earlystop_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


<keras.callbacks.History at 0x7f7f518baf20>

In [None]:
# Persist the fitted tokenizer and the trained network side by side in modelpath.
tokenizer_file = modelpath + "nameTokenizer040723.pkl"
model_file = modelpath + "genderModel040723.h5"

joblib.dump(tokenizer, tokenizer_file)
model.save(model_file)

In [None]:
from tensorflow.keras.models import load_model
# Cache for the artifacts loaded from disk, so repeated predictions do not
# re-read the model file on every call.
_GENDER_ARTIFACTS = {}


def _load_gender_artifacts():
    """Return ``(model, tokenizer)`` from ``modelpath``, re-reading only when the files change."""
    import os  # local import: only this helper needs it

    model_file = modelpath + 'genderModel040723.h5'
    tokenizer_file = modelpath + 'nameTokenizer040723.pkl'

    # Key the cache on modification times so a re-trained, re-saved model is
    # picked up without having to re-run this cell.
    stamp = (os.path.getmtime(model_file), os.path.getmtime(tokenizer_file))
    if _GENDER_ARTIFACTS.get('stamp') != stamp:
        _GENDER_ARTIFACTS['model'] = load_model(model_file)
        # joblib.load unpickles the file; only use it on artifacts this notebook wrote.
        _GENDER_ARTIFACTS['tokenizer'] = joblib.load(tokenizer_file)
        _GENDER_ARTIFACTS['stamp'] = stamp
    return _GENDER_ARTIFACTS['model'], _GENDER_ARTIFACTS['tokenizer']


def predict_gender(name):
    """Predict the gender associated with a name using the saved model.

    The name is cleaned the same way as the training data (lowercased, every
    character other than a-z removed), split into character bigrams, encoded
    with the saved tokenizer and padded to the model's input length.

    Args:
        name: The name to classify, as a string.

    Returns:
        A ``(gender, prob_score)`` tuple: ``gender`` is ``'female'`` or
        ``'male'`` and ``prob_score`` is the model's probability for that
        gender (so it is never below 0.5).

    Note:
        A name with fewer than two letters after cleaning yields no bigrams;
        the model then sees an all-padding input and the result is not
        meaningful.
    """
    model, saved_tokenizer = _load_gender_artifacts()
    n = 2

    # str.replace does literal replacement, so a regex-looking pattern passed to
    # it never matches. Filter characters directly to keep only a-z, mirroring
    # the training-time clean-up.
    cleaned = ''.join(ch for ch in name.lower() if 'a' <= ch <= 'z')

    bigrams = [''.join(gram) for gram in nltk.ngrams(cleaned, n)]
    sequence = saved_tokenizer.texts_to_sequences([bigrams])

    # Pad to the sequence length the loaded model was built with; fall back to
    # the previously hard-coded 31 if the model does not report a fixed length.
    expected_len = model.input_shape[1] or 31
    padded_sequence = pad_sequences(sequence, maxlen=expected_len)

    # The sigmoid output is the probability of the positive class (female = 1).
    prediction = model.predict(padded_sequence)[0][0]
    gender = 'female' if prediction > 0.5 else 'male'
    prob_score = prediction if gender == 'female' else 1 - prediction

    return gender, prob_score

In [None]:
# Quick sanity check of the predictor on a single name.
predict_gender('zia')





('female', 0.98432434)