# Using n-grams to generate plausible names for NPCs
N-grams are a useful way of preserving the general structure of a language sample while allowing the generation of novel text (see https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0)

Here we shall use them to attempt to produce plausible NPC names. We shall take a sample names corpus and use it to generate plausible synthetic names

nltk provides an `ngrams` helper in its `nltk.util` module, which we shall make use of. https://www.nltk.org/api/nltk.html

In [None]:
from io import BytesIO
import math
import nltk
import pandas as pd
from pathlib import Path
import random
import urllib.request
from zipfile import ZipFile
import nltk
from nltk.util import ngrams
from nltk.corpus import words

In [None]:
# fetch nltk's 'words' corpus; it is needed by the nltk.corpus.words reader imported above
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# download the national baby-names archive (hosted on ssa.gov, the US Social Security Administration)
names_url = 'https://www.ssa.gov/oact/babynames/names.zip'
local_file = 'babynames_download.zip'
# the archive is opened from a file on disk further down, so keep a local copy of it
# and only go to the network when that copy does not exist yet
test_file = Path(local_file)
if not test_file.is_file():
  # read the complete response before writing anything to disk
  with urllib.request.urlopen(names_url) as remote:
    data = remote.read()
  # save the downloaded bytes as the local copy
  test_file.write_bytes(data)

In [None]:
# choose the year of births to load (between 1880 and 2020)
year = '1991'

# the archive holds one text file per year; pull out the bytes of the one we want
member_name = f'yob{year}.txt'
with ZipFile(local_file) as zip_file:
  file_bytes = zip_file.read(member_name)

# the yearly file has no header row, so supply the column names ourselves
names_df = pd.read_csv(
  BytesIO(file_bytes),
  header=None,
  names=['name', 'gender', 'count'],
)

# preprocessing
It seems likely that male and female names have different characteristics (for example, female names may be more likely to end in a vowel), so let's create a male list and a female list before we start processing for n-grams. We can also join these two lists to find names that are used for both genders, and use those as the corpus for producing gender-neutral names.

In [None]:
# split the names by recorded gender; take explicit copies so that columns added to
# these frames later are not written into a filtered slice of names_df
# (which pandas reports as SettingWithCopyWarning)
male_names_df = names_df[names_df['gender']=='M'].copy()
female_names_df = names_df[names_df['gender']=='F'].copy()

# lets also look for names which are used for both genders:
# the inner join keeps only names present in both frames, and the suffixes
# distinguish the male and female versions of the gender/count columns
shared_names_df = male_names_df.merge(
  female_names_df,
  how = 'inner',
  on = 'name',
  suffixes = ('_m', '_f'),
)
# calculate the female/male ratio
shared_names_df['ratio'] = shared_names_df['count_f'] / shared_names_df['count_m']
# keep only names where neither gender outnumbers the other by a factor of 10 or more
# (shared_names_df itself is left untouched, preserving the join for future analysis)
is_neutral = (shared_names_df['ratio'] > 0.1) & (shared_names_df['ratio'] < 10)
# reset_index returns a new frame, so the assignments below do not touch shared_names_df
neutral_names_df = shared_names_df[is_neutral].reset_index(drop=True)
# a neutral name's count is the total across both genders
neutral_names_df['count'] = neutral_names_df['count_m'] + neutral_names_df['count_f']
neutral_names_df['gender'] = 'N'
# drop the per-gender working columns, leaving name / count / gender
neutral_names_df = neutral_names_df.drop(
  columns=['gender_f', 'gender_m', 'count_m', 'count_f', 'ratio'],
)
neutral_names_df

Unnamed: 0,name,count,gender
0,Jordan,21576,N
1,Taylor,18223,N
2,Devin,5550,N
3,Casey,7177,N
4,Logan,4404,N
...,...,...,...
1148,Tranell,11,N
1149,Tristyn,17,N
1150,Vanny,11,N
1151,Weslie,11,N


In [41]:
# nltk's ngrams function works on sequences, so each name is first turned into a
# list of letters and the n-grams function is then applied to that list

# set an n value for our n-grams (3 means each n-gram is a run of three tokens)
n_value = 3

def tokenise(word: str, n=2):
  '''Split a word into lower case single-character tokens padded with
  boundary markers.

  Args:
      word (str): the word to be tokenised
      n (int): how many '>' start markers to prepend and how many '<' end
          markers to append

  Returns:
      A list made of n '>' markers, then the lower-cased characters of
      word, then n '<' markers
  '''
  # the structure of a name may depend on position, so mark where the name
  # starts (with '>') and where it stops (with '<')
  tokens = ['>' for _ in range(n)]
  tokens.extend(word.lower())
  tokens.extend('<' for _ in range(n))
  return tokens

def get_ngrams(sequence: list, n=2):
  '''Return every n-gram of a token sequence as a list.

    Thin wrapper around the nltk ngrams function, which yields its results
    lazily; here they are collected into a list.
    See https://www.nltk.org/api/nltk.html?highlight=ngram

  Args:
      sequence (list): the tokens to build n-grams from
      n (int): the order of n-grams to return

  Returns:
      A list of n-gram tuples
  '''
  # unpack the generator returned by nltk straight into a list
  return [*ngrams(sequence, n)]

def weight_by_count(count):
  '''Turn a count into a small integer weight: the natural log of the count,
  truncated to an integer, plus one.

  Args:
      count (int): a positive integer to turn into a weight

  Returns:
      An int to use as a weight
  '''
  log_count = math.log(count)
  # int() truncates the fractional part; the +1 means a count of 1 gets weight 1
  return 1 + int(log_count)


def ngram_dataframe(df: pd.DataFrame, n: int) -> pd.DataFrame:
  ''' Given a dataframe with columns called 'name' and 'count', returns a
  dataframe of n-gram weights for future use. Each name contributes
  weight_by_count(count) to every n-gram it contains, rather than its raw
  count, to avoid common names swamping rare ones while still biasing
  slightly towards common names.

  The input dataframe is not modified: all intermediate columns are built on
  a private copy.

  Args:
      df (pandas.DataFrame): the input dataframe of names and frequencies
      n (int): the order of n-grams to process

  Returns:
      A pandas.DataFrame with one row per distinct n-gram and the columns
      'n_grams' (a tuple of tokens) and 'weight' (the summed weights)
  '''
  # work on a copy of just the columns we need so the caller's frame (which may
  # itself be a filtered slice of another frame) is never written to
  work = df[['name', 'count']].copy()
  # we want to weight for common names but not to extremes
  work['weight'] = work['count'].apply(weight_by_count)
  # pad each name with n-1 start/stop markers so that every letter, including
  # the first, has a full n-1 tokens of context in front of it
  # note that keyword arguments given to apply are forwarded to the applied function
  # https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html
  tokens = work['name'].apply(tokenise, n=n - 1)
  work['n_grams'] = tokens.apply(get_ngrams, n=n)
  # explode the per-name lists so that each n-gram gets a row of its own
  # https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html#exploding-a-list-like-column
  exploded = work[['n_grams', 'weight']].explode('n_grams')
  # then sum the weights per n-gram to get our relative frequencies
  # https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
  return exploded.groupby('n_grams').agg({'weight': 'sum'}).reset_index()

# lets start with the neutral names: build the table of n-gram weights for them
df_n_grams = ngram_dataframe(df=neutral_names_df, n=n_value)
# display the resulting table
df_n_grams

Unnamed: 0,n_grams,weight
0,"(>, >, a)",440
1,"(>, >, b)",179
2,"(>, >, c)",477
3,"(>, >, d)",469
4,"(>, >, e)",91
...,...,...
1594,"(z, i, e)",18
1595,"(z, m, o)",3
1596,"(z, u, r)",5
1597,"(z, y, <)",4


# optimise data format
Now we need to convert our n-gram frequencies into a format that allows rapid retrieval when generating names. At each step we look at the last n-1 letters of the sequence generated so far and pick the next letter in proportion to how often each letter occurs as the final element of the n-grams that start with those n-1 letters.

In [42]:
def get_n_gram_dictionary(df_n_grams: pd.DataFrame) -> dict:
  '''Convert a dataframe of n-gram weights into a lookup dictionary for
  name generation.

  The input dataframe is only read; no columns are added to it.

  Args:
      df_n_grams (pandas.DataFrame): a dataframe with an 'n_grams' column of
          token tuples and a 'weight' column of non-negative integers

  Returns:
      A dict whose keys are the first n-1 tokens of each n-gram (as a tuple)
      and whose values are lists of possible next tokens, each token repeated
      'weight' times so that a uniform random pick from the list is
      weight-proportional
  '''
  ngram_dict = {}
  for n_gram, weight in zip(df_n_grams['n_grams'], df_n_grams['weight']):
    # split the n-gram into its leading context and the token that follows it
    start_seq, next_token = n_gram[:-1], n_gram[-1]
    # repeat the token once per unit of weight
    ngram_dict.setdefault(start_seq, []).extend([next_token] * int(weight))
  return ngram_dict

# build the lookup dictionary from the neutral-name n-gram table
ngram_dict = get_n_gram_dictionary(df_n_grams)

We now have a dictionary where each key is the tuple of preceding tokens and each value is a list of the possible next letters, with each letter appearing in the list multiple times according to its weight as the final element of the n-gram.

Therefore, to generate a name, all we need to do is start with the sequence of n-1 start markers (`[">", ">"]` for n = 3) and repeatedly pick a random element from the dictionary entry for the last n-1 tokens, until the end markers appear.

As a final flourish we shall check that we have not produced an English word (this happens quite often).

In [47]:
def generate_names(ngram_dict: dict, names_required = 10, n_value=3, max_tokens=12, excluded_words=None):
  '''Generate synthetic names by repeatedly sampling the next letter from an
  n-gram lookup dictionary.

  Args:
      ngram_dict (dict): maps a tuple of n_value-1 preceding tokens to a list
          of candidate next tokens (repeated according to their weight)
      names_required (int): how many names to return
      n_value (int): the order of the n-grams the dictionary was built from;
          must be at least 2
      max_tokens (int): generation of a name stops once the token sequence,
          including the n_value-1 leading '>' markers, reaches this length
      excluded_words: a collection of lower case strings that must not be
          returned as names. When None, the nltk 'words' corpus is used.

  Returns:
      A list of names_required capitalised names

  Raises:
      ValueError: if n_value is less than 2
      KeyError: if a context is reached that is not a key of ngram_dict
  '''
  if n_value < 2:
    # with no context tokens there are no start/end markers to work with
    raise ValueError('n_value must be at least 2')
  n_minus_one = n_value - 1
  if excluded_words is None:
    # load the corpus once and use a set so each membership test is cheap
    excluded_words = set(words.words())
  end_marker = ['<'] * n_minus_one
  new_names = []
  while len(new_names) < names_required:
    # every name starts from a fresh run of start markers
    generated_name = ['>'] * n_minus_one
    while len(generated_name) < max_tokens and generated_name[-n_minus_one:] != end_marker:
      preceding_tuple = tuple(generated_name[-n_minus_one:])
      generated_name.append(random.choice(ngram_dict[preceding_tuple]))
    # drop the start markers, then strip only the end markers that are really
    # there: a name cut short by max_tokens may have fewer than n-1 (or none),
    # and a fixed-width slice would remove real letters instead
    letters = generated_name[n_minus_one:]
    while letters and letters[-1] == '<':
      letters.pop()
    finished_name = "".join(letters)
    if finished_name in excluded_words:
      print ('ooops English word')
    else:
      new_names.append(finished_name.capitalize())
  return new_names


# now we can produce some gender neutral names
# pass the same n_value that ngram_dict was built with, so the context length
# used during generation always matches the dictionary keys
neutral_names = generate_names(ngram_dict, names_required = 10, n_value = n_value)
print(neutral_names)

ooops English word
ooops English word
ooops English word
ooops English word
ooops English word
['Skyerich', 'Shikking', 'Ji', 'Jenain', 'Marittan', 'Kary', 'Ossidala', 'Jacy', 'Colan', 'Jeanice']


In [48]:
# generate synthetic female names
# use 3-grams: each letter is chosen from the two tokens before it
n_value = 3
# weight table of n-grams found in the female names
df_female_grams = ngram_dataframe(df=female_names_df, n=n_value)
# lookup from preceding tokens to candidate next letters
female_dict = get_n_gram_dictionary(df_female_grams)
# n_value here must match the one the dictionary was built with
female_names = generate_names(female_dict, names_required = 10, n_value = n_value)
print(female_names)


ooops English word
ooops English word
['Luperann', 'Clotira', 'Toryana', 'Coustani', 'Ia', 'Adinden', 'Moniccia', 'Chountob', 'Nie', 'Kouricki']


In [51]:
# generate synthetic male names
# use 4-grams this time: each letter is chosen from the three tokens before it
n_value = 4
# weight table of n-grams found in the male names
df_male_grams = ngram_dataframe(df=male_names_df, n=n_value)
# lookup from preceding tokens to candidate next letters
male_dict = get_n_gram_dictionary(df_male_grams)
# n_value here must match the one the dictionary was built with
male_names = generate_names(male_dict, names_required = 10, n_value = n_value)
print(male_names)


ooops English word
ooops English word
ooops English word
ooops English word
['Jacote', 'Juan', 'Damal', 'Kenio', 'Kus', 'Cipria', 'Harlan', 'Que', 'Natha', 'Ravong']
