In [1]:
# In this notebook, you will learn:
#
# 1) How to load a CSV file into a pandas DataFrame.
# 2) How to filter the data to create a subset of names suitable for makemore.
#
# Resources:
# 1) https://www.kaggle.com/datasets/surajpratap/sixty-thousand-unique-indian-names-dataset?resource=download
#       -- Downloaded this dataset containing a bunch of Indian names from Kaggle.

In [14]:
import pandas as pd

In [15]:
# Load the dataset from a CSV file into a pandas DataFrame.
# NOTE(review): by default read_csv converts strings such as "NA" or "NULL" to missing values,
# which could silently blank out real names. The info() output further down reports no nulls, so
# this file appears unaffected; pass keep_default_na=False if that ever changes.
names_df = pd.read_csv('../Data/IndianNamesUnique.csv')
# Show the first five rows as a quick sanity check.
names_df.head()

Unnamed: 0,Name
0,AABAN
1,AABARANA
2,AABARNA
3,AABARNIGA
4,AABARSHANA


In [16]:
# Confirm that read_csv returned a pandas DataFrame.
type(names_df)

pandas.core.frame.DataFrame

In [17]:
# Find the number of rows in the dataset. `shape` is a (rows, columns) tuple.
names_df.shape

(60600, 1)

In [18]:
# Find the type of each column in the dataset, along with its non-null count and memory usage.
names_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60600 entries, 0 to 60599
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    60600 non-null  object
dtypes: object(1)
memory usage: 473.6+ KB


In [19]:
# Convert all the names to lowercase. For now, we are not interested in handling case sensitivity.
# Note that this overwrites the 'Name' column of names_df in place.
names_df['Name'] = names_df['Name'].str.lower()

In [20]:
# Let's keep only the names made up entirely of the 26 lowercase letters a-z. This removes names
# that contain spaces, digits, punctuation or any other character.
#
# `fullmatch` anchors the pattern to the whole string, so `^`/`$` are not needed and a trailing
# newline cannot slip through (with `$` it can). The names were lowercased in the previous cell,
# so no case-insensitive flag is needed. `na=False` treats missing values as non-matching, which
# keeps the mask strictly boolean.
filtered_names_df = names_df[names_df['Name'].str.fullmatch('[a-z]+', na=False)]
filtered_names_df.head()

Unnamed: 0,Name
0,aaban
1,aabarana
2,aabarna
3,aabarniga
4,aabarshana


In [21]:
# We have ~60,500 single-word names in the dataset after filtering.
filtered_names_df.shape

(60548, 1)

In [23]:
# Remove duplicate names as well: flag every repeat of a name (the first occurrence is not
# flagged) and keep only the rows that are not flagged.
is_repeated_name = filtered_names_df["Name"].duplicated(keep="first")
filtered_names_df = filtered_names_df[~is_repeated_name]
filtered_names_df.head()

Unnamed: 0,Name
0,aaban
1,aabarana
2,aabarna
3,aabarniga
4,aabarshana


In [24]:
# The row count is unchanged, so there are no duplicate names in the dataset at all.
filtered_names_df.shape

(60548, 1)

In [25]:
# Check the longest and shortest name lengths in the dataset. We don't want names that are too
# long or single character names. The per-name lengths are computed once and reused.
name_lengths = filtered_names_df['Name'].str.len()
max_name_length, min_name_length = name_lengths.max(), name_lengths.min()
print(f"max_name_length: {max_name_length}, min_name_length: {min_name_length}")

max_name_length: 24, min_name_length: 3


In [26]:
# I want the names to be in a random order. Currently, the names are sorted alphabetically.
# `sample(frac=1)` returns every row exactly once, in shuffled order. Fixing `random_state` makes
# the shuffle reproducible, so "Restart Kernel -> Run All" always produces the same ordering.
RANDOM_SEED = 42
new_names_df = filtered_names_df.sample(frac=1, random_state=RANDOM_SEED)
# If you look at the index, you will see that it no longer starts from 0. This is because each
# row keeps its original index label when the data is shuffled. The index is reset to start
# from 0 in the cells below.
new_names_df.head()

Unnamed: 0,Name
2941,albonsha
7820,beenapreethi
52994,thushniha
232,aakaksha
13423,dumeethran


In [27]:
# Reset the index so that it starts from 0 again. By default, reset_index() keeps the old index
# as a new column named 'index' (visible in the output below). We don't need that column, so it
# is dropped in the next cell. (reset_index(drop=True) would do both steps at once.)
# NOTE: re-running this cell on its own adds another index column, so run the cells in order.
new_names_df = new_names_df.reset_index()
new_names_df.head()

Unnamed: 0,index,Name
0,2941,albonsha
1,7820,beenapreethi
2,52994,thushniha
3,232,aakaksha
4,13423,dumeethran


In [28]:
# The old index values now live in the 'index' column; remove that column from the dataframe.
new_names_df = new_names_df.drop(labels='index', axis='columns')
new_names_df.head()

Unnamed: 0,Name
0,albonsha
1,beenapreethi
2,thushniha
3,aakaksha
4,dumeethran


In [29]:
# Path of the text file that the cleaned names are written to (one name per line) for makemore.
CLEANED_DATASET_PATH = "../Data/names.txt"

In [30]:
# Write the cleaned names to a text file for makemore, one name per line.
with open(CLEANED_DATASET_PATH, "w") as names_file:
    names_file.writelines(name + "\n" for name in new_names_df['Name'])

In [31]:
# Read the saved file back to make sure that the data was written correctly.
with open(CLEANED_DATASET_PATH, "r") as names_file:
    # Iterating over the file yields one line at a time, each still ending with its newline
    # character, so strip the surrounding whitespace from every line.
    names = [line.strip() for line in names_file]

In [32]:
# Data was saved correctly: the first 10 names read back from the file start with the same names
# as the head of new_names_df shown earlier.
names[:10]

['albonsha',
 'beenapreethi',
 'thushniha',
 'aakaksha',
 'dumeethran',
 'luhit',
 'valam',
 'harinyai',
 'sakthikaa',
 'kaveetha']