In [None]:
#1
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, SimpleRNN, LSTM, GRU
from keras.utils import np_utils, to_categorical
from sklearn.model_selection import train_test_split
import string, random

data = pd.read_csv("/content/name_gender.csv")
data['name'] = data['name'].apply(lambda x: ''.join(filter(lambda y: y in string.printable, x)))
chars = sorted(list(set(''.join(data['name'].values))))

char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

def train_model(cell_type, data_size):
    data_sample = data.sample(frac=data_size, random_state=42)
    max_len = max([len(name) for name in data_sample['name']])
    data_X = np.zeros((len(data_sample), max_len, len(chars)), dtype=np.bool)
    data_Y = np.zeros((len(data_sample), 2), dtype=np.bool)
    for i, name in enumerate(data_sample['name']):
        for j, char in enumerate(name):
            data_X[i, j, char_to_int[char]] = 1
        if data_sample.iloc[i]['gender'] == 'M':
            data_Y[i, 0] = 1
        else:
            data_Y[i, 1] = 1

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=42)

    model = Sequential()
    if cell_type == 'SimpleRNN':
        model.add(SimpleRNN(128, input_shape=(max_len, len(chars))))
    elif cell_type == 'LSTM':
        model.add(LSTM(128, input_shape=(max_len, len(chars))))
    elif cell_type == 'GRU':
        model.add(GRU(128, input_shape=(max_len, len(chars))))
    else:
        print("Not an expected type")
        return

    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_test, y_test), verbose=0)
    scores = model.evaluate(X_test, y_test, verbose=0)

    print("Cell type:", cell_type)
    print("Data size:", data_size)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    male_indices = np.where(y_test_classes == 0)[0]
    female_indices = np.where(y_test_classes == 1)[0]
    male_acc = np.mean(y_pred_classes[male_indices] == y_test_classes[male_indices])
    female_acc = np.mean(y_pred_classes[female_indices] == y_test_classes[female_indices])
    print("Male accuracy: %.2f%%" % (male_acc*100))
    print("Female accuracy: %.2f%%" % (female_acc*100))
    print()

for cell_type in ['SimpleRNN', 'LSTM', 'GRU']:
        for data_size in [0.25, 0.5, 0.75, 1.0]:
            train_model(cell_type, data_size)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_X = np.zeros((len(data_sample), max_len, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_Y = np.zeros((len(data_sample), 2), dtype=np.bool)


Cell type: SimpleRNN
Data size: 0.25
Accuracy: 84.74%
Male accuracy: 77.56%
Female accuracy: 88.79%

Cell type: SimpleRNN
Data size: 0.5
Accuracy: 87.53%
Male accuracy: 83.47%
Female accuracy: 89.92%

Cell type: SimpleRNN
Data size: 0.75
Accuracy: 87.17%
Male accuracy: 82.99%
Female accuracy: 89.57%

Cell type: SimpleRNN
Data size: 1.0
Accuracy: 88.45%
Male accuracy: 85.39%
Female accuracy: 90.24%

Cell type: LSTM
Data size: 0.25
Accuracy: 86.03%
Male accuracy: 78.26%
Female accuracy: 90.40%

Cell type: LSTM
Data size: 0.5
Accuracy: 88.56%
Male accuracy: 83.67%
Female accuracy: 91.44%

Cell type: LSTM
Data size: 0.75
Accuracy: 89.27%
Male accuracy: 89.03%
Female accuracy: 89.40%

Cell type: LSTM
Data size: 1.0
Accuracy: 90.07%
Male accuracy: 87.05%
Female accuracy: 91.84%

Cell type: GRU
Data size: 0.25
Accuracy: 85.61%
Male accuracy: 86.85%
Female accuracy: 84.91%

Cell type: GRU
Data size: 0.5
Accuracy: 88.87%
Male accuracy: 88.87%
Female accuracy: 88.87%

Cell type: GRU
Data size: 0

In [None]:
#2
import pandas as pd
import random, csv

# load the CSV file into a pandas DataFrame
df = pd.read_csv('/content/name_gender.csv')

# filter out the rows where gender is unknown or probability is less than 1
df = df[(df['gender'] != 'U') & (df['probability'] == 1)]

# create a dictionary to store the frequency of each character transition
def create_transition_dict(names):
    transition_dict = {}
    for name in names:
        name = name.lower()
        for i in range(len(name)-1):
            current_char = name[i]
            next_char = name[i+1]
            if current_char not in transition_dict:
                transition_dict[current_char] = {}
            if next_char not in transition_dict[current_char]:
                transition_dict[current_char][next_char] = 0
            transition_dict[current_char][next_char] += 1
    return transition_dict

# generate a name based on the Markov Chain model
def generate_name(transition_dict, gender):
    vowels = 'aeiou'
    consonants = 'bcdfghjklmnpqrstvwxyz'
    if gender == 'M':
        first_letter = random.choice(['a', 'e', 'i', 'o', 'u'] + list(consonants))
    else:
        first_letter = random.choice(['a', 'e', 'i', 'o', 'u'] + list(vowels))
    name = first_letter
    current_letter = first_letter
    while len(name) < 10:
        if current_letter not in transition_dict:
            break
        next_letter = random.choices(list(transition_dict[current_letter].keys()), 
                                      list(transition_dict[current_letter].values()))[0]
        name += next_letter
        current_letter = next_letter
    return name.capitalize()

# create a list of male and female names using the Markov Chain model
male_names = []
female_names = []
transition_dict = create_transition_dict(df['name'].values)
for index, row in df.iterrows():
    if row['gender'] == 'M':
        male_names.append(generate_name(transition_dict, 'M'))
    else:
        female_names.append(generate_name(transition_dict, 'F'))

with open('/content/generated_names.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header row
    writer.writerow(['gender', 'name'])

    # Write the male names
    for name in male_names:
        writer.writerow(['M', name])

    # Write the female names
    for name in female_names:
        writer.writerow(['F', name])

# print the generated names
print("Generated Male Names:")
for name in male_names[:100]:
    print(name)
print("\nGenerated Female Names:")
for name in female_names[:100]:
    print(name)

data = pd.read_csv("/content/generated_names.csv")
data['name'] = data['name'].apply(lambda x: ''.join(filter(lambda y: y in string.printable, x)))
chars = sorted(list(set(''.join(data['name'].values))))

char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

male_count = 0
for name in male_names:
    if train_model("GRU", 1.0) == "M":
        male_count += 1
        male_accuracy = male_count / len(male_names)
        exit

female_count = 0
for name in female_names:
    if train_model("GRU", 1.0) == "F":
        female_count += 1
        female_accuracy = female_count / len(female_names)
        exit

print("Accuracy on generated male names: ", male_accuracy)
print("Accuracy on generated female names: ", female_accuracy)

Generated Male Names:
Nedeqeteri
Ponnaioddi
Cansiruman
Lyleiaphir
Trkovanior
Vonekannia
Brelyrmmai
Onneogrene
Qunienizey
Chavaleial
Vabrdeltre
Vihahanday
Eresobelon
Eanneahand
Nlarsidrah
Linararomo
Phebrecoha
Ligistinna
Carifayula
Anclilerau
Shatohahir
Llleanadit
Ntcharmari
Haiamiavid
Eraronoseu
Frnadoakej
Hargianava
Mionaniken
Zmandaberi
Ontatobhay
Uaneliando
Qunnninnnt
Beitrriond
Pazeadicai
Lejameraia
Waschabimi
Quarinewar
Eerruithau
Thayahnzow
Alkeilinyc
Ozelavolan
Quizalelol
Crishanttt
Sanjulosta
Shanilllbi
Qubrelllia
Jeyaynahia
Arandameel
Thiarayner
Ourasstham
Larerugrah
Isheayorrt
Kaynerliky
Wolladhaha
Zenwacager
Hrinosonnk
Fikishaylo
Jeryrianyu
Wemeartauh
Wnarianaic
Maraianabe
Vorithneys
Zallandaro
Alfrienell
Gralanenil
Wantayveso
Atonzelyne
Ghagelinio
Preyeianen
Larubeland
Zelarranna
Xllenyntya
Jadolynnak
Cawynnylip
Geylezaiyl
Ionnedaynn
Shynnnnitr
Ypondecopa
Pplamsarin
Wyariamame
Liberianti
Canerionse
Fesanialle
Unatrichol
Esharieeli
Matrirshya
Iemarilifa
Yammarettr
Vondeariha

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_X = np.zeros((len(data_sample), max_len, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_Y = np.zeros((len(data_sample), 2), dtype=np.bool)


Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.30%
Male accuracy: 80.50%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%

Cell type: GRU
Data size: 1.0
Accuracy: 93.31%
Male accuracy: 80.51%
Female accuracy: 100.00%



KeyboardInterrupt: ignored

In [4]:
#2a
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import np_utils
import nltk

nltk.download('names')

names = nltk.corpus.names.words('/content/name_gender.csv')
names = [name.lower() for name in names if name[0].lower() in ['a', 'm', 'z']]

chars = sorted(list(set(' '.join(names))))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

seq_length = 10
dataX = []
dataY = []
for name in names:
    for i in range(len(name)-seq_length):
        seq_in = name[i:i+seq_length]
        seq_out = name[i+seq_length]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

X = np.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(len(chars))
y = np_utils.to_categorical(dataY)

model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X, y, epochs=20, batch_size=128)

for i in range(50):
    start = np.random.randint(0, len(dataX)-1)
    pattern = dataX[start]
    name = [int_to_char[value] for value in pattern]

    for j in range(20):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(len(chars))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_char[index]
        name.append(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]
        if result == '.':
            break

    print(''.join(name).capitalize())

split_index = int(len(dataX)*0.9)
trainX, testX = dataX[:split_index], dataX[split_index:]
trainY, testY = dataY[:split_index], dataY[split_index:]

testX = np.reshape(testX, (len(testX), seq_length, 1))
testX = testX / float(len(chars))
testY = np_utils.to_categorical(testY)    

perplexities = []
for i in range(50):
    start = np.random.randint(0, len(testX)-1)
    pattern = testX[start]
    name = [int_to_char[value] for value in pattern.flatten()]

    perplexity = 1.0
    for j in range(20):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(len(chars))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_char[index]
        name.append(result)
        pattern = np.append(pattern, index)
        pattern = pattern[1:len(pattern)]
        if result == '.':
            break
        perplexity *= prediction[0][index]
    
    perplexity = pow(perplexity, -1/len(name))
    perplexities.append(perplexity)
    print(''.join(name).capitalize(), 'Perplexity:', perplexity)

avg_perplexity = sum(perplexities) / len(perplexities)
print('Average Perplexity:', avg_perplexity)    

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Ilarose,f,1.
Morissa,f,1.
333333333372727272727272727272
En,f,0.99766676766666666666666
523985239876666666666666666666
666666666666666666666666666666
507002801172727276627276627676
915492957762766662766662766662
Moniesha,f,1.
Erlin,m,0.99766666766666666666
Nastashia,f,1.
.98314606727727627627627627627
Ery,m,0.9976666676666666666666
Ddilynn,f,1.
032177444577666666666666666666
Oey,f,0.9976667676666666666666
Murlyn,m,0.
Artiana,f,1.
0.9991299271777276767276666276
0.8539923969717667727666627666
M,0.97099766266666276666276666
333333333372727272727272727272
Marleen,f,1.
Ustinjohn,m,1.
Zariyah,f,1.
Andra,f,0.99766676766666666666
Arrow,m,0.99766666766666666666
0.9982598172767276766276666276
L,f,0.854076777276666276666276
Nterrius,m,1.
Atiyanna,f,1.
042253521176727276