## Gender Identification Model

### Problem Statement
#### Given the name, can we identify the gender of the person?

In [None]:
## import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import re
import pickle

import time

In [None]:
# Start the timer
start_time = time.time()

In [None]:
## load the initial Train dataset
dt = pd.read_csv("../../Njambanene/Tasks/Name_Identification/Feb  Members Spend Analysis  - Active 1 Members.csv")

In [None]:
#### Sample the dataset
dt.head()

In [None]:
## rename column
dt = dt.rename(columns={'Member Name': 'Name'})

In [None]:
## Check data types and attributes
print(dt.columns)

print(dt.dtypes)

In [None]:
def count_one_word_names(name):
  """Count how many entries in a collection of names consist of exactly one word.

  Args:
    name: An iterable of names (e.g. a list or a pandas Series). Despite the
      singular parameter name, this is the whole collection, not a single
      name. Entries that are not strings (for example ``None`` or ``NaN``
      standing in for a missing name) are skipped instead of raising.

  Returns:
    The number of string entries that split into exactly one
    whitespace-separated word. Empty or whitespace-only strings are not
    counted.
  """
  # Only real strings can be one-word names; missing values are never counted.
  return sum(
      1
      for entry in name
      if isinstance(entry, str) and len(entry.split()) == 1
  )


In [None]:
## proportion of one-word names
one_word_names = round(count_one_word_names(dt['Name'])/len(dt)*100,2)

print(f"The proportion of one-word names is: {one_word_names}%")

In [None]:
# names = dt['Name'].apply(lambda x: x.lower())

In [None]:
## replace col values
# dt['Gender'] = dt['Gender'].replace({0:"M",1:"F"})

In [None]:
## shape of data
print(dt.shape)

In [None]:
## Function for Cleaning Names
def clean_name(value):
    """Normalise a raw customer name to letters separated by single spaces.

    Steps, in order:
      1. Missing values (``None`` / ``NaN``) give ``''``; any other non-string
         is converted with ``str``.
      2. Only the first three whitespace-separated words are kept.
      3. Hyphens, underscores and brackets are removed, as is any single
         non-letter character that has a word boundary on both sides.
      4. The first upper-case instruction keyword (``CUSTOMER``, ``LOCK``,
         ``DO NOT``, ...) is removed together with everything after it. The
         match is case-sensitive.
      5. Every remaining character that is not a letter or whitespace is
         removed, and whitespace is collapsed to single spaces.

    Args:
        value: The raw name; normally a string, possibly ``None``/``NaN``.

    Returns:
        The cleaned name, without leading, trailing or repeated spaces.
    """
    # A missing name must not turn into the literal name "nan".
    if value is None or (isinstance(value, float) and value != value):
        return ''
    if not isinstance(value, str):
        value = str(value)

    # Truncate to the first three words (split/join also normalises spacing)
    truncated_name = ' '.join(value.split()[:3])

    # Drop separators/brackets and isolated non-letter characters
    pattern = r"\b(?:[^A-Za-z\s]|(?!^)\d)\b|[-_()\[\]]"
    names_only = re.sub(pattern, '', truncated_name)

    # Remove the instruction keyword and everything after it
    pattern = r"\b(?:DO NOT|DONT|SAYS|REQUEST|CUSTOMER|LOCK|NOT|REQUESTED|ACCOUNT|TRANSFER|CUSTOMERS|DONOT|REPORTED|LOST|STOLEN|COOKER|CANISTER|KINDLY|GIVE)\b.*"
    drop_nonnames = re.sub(pattern, '', names_only)

    # Remove remaining special characters (excluding spaces) from the name
    pattern = r'[^A-Za-z\s]'
    cleaned_name = re.sub(pattern, '', drop_nonnames)

    # Collapse whitespace last: the removals above can leave gaps such as
    # "John  Doe" once a token between two words has been deleted.
    return ' '.join(cleaned_name.split())


In [None]:
# Apply the clean function to the 'Name' column
dt['Cleaned Name'] = dt['Name'].apply(clean_name)#apply(convert_float_and_remove_numbers).apply(truncate_drop_and_clean)
dt.head()

In [None]:
## unique names 
print(len(dt['Cleaned Name'].unique()))

In [None]:
## plot for male and female names in the data
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the proportions for each category
total = len(dt)
counts = dt['Gender'].value_counts()
proportions = counts / total

# Fix the bar order explicitly so that bars, annotations and tick labels all
# refer to the same category (value_counts is ordered by frequency, which is
# not necessarily the order in which the bars are drawn).
gender_order = sorted(counts.index)

# Create the countplot (drawn once, in the explicit order)
ax = sns.countplot(x='Gender', data=dt, order=gender_order)

# Add the proportions as annotations, each bar paired with its own category
for category, p in zip(gender_order, ax.patches):
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    label = f"{proportions.loc[category]:.2%}"
    ax.annotate(label, (x, y), ha='center', va='bottom')

plt.title('No of male and female names in the dataset')
# NOTE(review): assumes the sorted categories are female first, then male - confirm
plt.xticks([0,1],('Female','Male'))
plt.show()

In [None]:
## analyze starting letters of names
# Capital letters 'A' .. 'Z'
alphabets = [chr(code) for code in range(ord('A'), ord('Z') + 1)]

# For every capital letter, the number of rows whose raw 'Name' starts with it
startletter_count = {
    letter: len(dt[dt['Name'].str.startswith(letter)])
    for letter in alphabets
}

print(startletter_count)

In [None]:
## visualize starting letters
plt.figure(figsize = (16,8))

plt.bar(startletter_count.keys(),startletter_count.values())

plt.xlabel('Starting alphabet')

plt.ylabel('No. of names')

plt.title('Number of names starting with each letter')

In [None]:
## check most common starting alphabets
print('The 5 most common starting letters are : ', *sorted (startletter_count.items(),key=lambda item: item[1])[-5:][::-1])

In [None]:
## analyze ending letters of names
# All 26 lower-case letters (the original list skipped 'w', so names ending
# in 'w' were never counted).
small_alphabets = ['a','b','c','d','e','f','g','h',
                   'i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

endletter_count ={}

# NOTE(review): the match is case-sensitive, so names ending in an upper-case
# letter are not counted here - confirm that is intended.
for i in small_alphabets:
    endletter_count[i] = len(dt[dt['Name'].str.endswith(i)])

print(endletter_count)

In [None]:
plt.figure(figsize = (16,8))

plt.bar(endletter_count.keys(),endletter_count.values())

plt.xlabel('Ending alphabet')

plt.ylabel('No. of names')

plt.title('Number of names ending with each letter')

In [None]:
## most common name ending letters
print("The 5 most name ending letters are : ",*sorted(endletter_count.items(),
                                                      key=lambda item: item[1])[-5:][::-1])

In [None]:
# building a word cloud

text =  " ".join(i for i in dt.Name)

word_cloud = WordCloud(

        width=3000,

        height=2000,

        random_state=1,

        background_color="white",

        colormap="BuPu",

        collocations=False,
        stopwords= STOPWORDS,

        ).generate(text)

plt.imshow(word_cloud)

plt.axis("off")    

plt.show()

In [None]:
## build model
X = list(dt['Cleaned Name'])
Y = list(dt['Gender'])

In [None]:
## encode the labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

In [None]:
## count vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer = 'char')
X = cv.fit_transform(X).toarray()



In [None]:
## split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.33, random_state= 42)

In [None]:
## logistic regression
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression()
LR_model.fit(x_train, y_train)
LR_y_pred = LR_model.predict(x_test)

In [None]:
## Naive Bayes
from sklearn.naive_bayes import MultinomialNB
NB_model = MultinomialNB()
NB_model.fit(x_train, y_train)
NB_y_pred = NB_model.predict(x_test)

In [None]:
## K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Create an instance of the KNN classifier
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the KNN model
knn_model.fit(x_train, y_train)
knn_y_pred = knn_model.predict(x_test)


In [None]:
## preview prediction output
# train_test_split shuffles the rows, so the test predictions belong to the
# shuffled test rows and not to the first len(x_test) rows of dt. Splitting
# dt's index with the same test_size / random_state as the feature split
# recovers exactly those rows.
_train_index, test_index = train_test_split(dt.index, test_size=0.33, random_state=42)
assert len(test_index) == len(knn_y_pred)

df_predictions2 = pd.DataFrame({
    'Name': dt.loc[test_index, 'Name'].to_numpy(),
    'Predicted_Gender': encoder.inverse_transform(knn_y_pred),
})
df_predictions2.head()

In [None]:
## XGBoost
from xgboost import XGBClassifier


# Define the seed
seed = 42

XGB_model = XGBClassifier(random_state= seed) #use_label_encoder = False
XGB_model.fit(x_train,y_train)
XGB_y_pred = XGB_model.predict(x_test)


In [None]:
## preview prediction output

# np.asarray(...).tolist() keeps this cell re-runnable (a list has no .tolist())
XGB_y_pred = np.asarray(XGB_y_pred).tolist()

# train_test_split shuffles the rows, so the predictions must be paired with
# the names of the actual test rows, not with the first len(x_test) rows of dt.
# Splitting dt's index with the same test_size / random_state as the feature
# split recovers those rows.
_train_index, test_index = train_test_split(dt.index, test_size=0.33, random_state=42)
assert len(test_index) == len(XGB_y_pred)

df_predictions = pd.DataFrame({
    'Cleaned Name': dt.loc[test_index, 'Cleaned Name'].to_numpy(),
    'Predicted_Gender': encoder.inverse_transform(XGB_y_pred),
})
df_predictions.head()

In [None]:
## Comparison of performance
### function for confusion matrix
from sklearn.metrics import confusion_matrix
def cmatrix(model):
    """Print and plot the confusion matrix of ``model`` on the current test split.

    Reads the notebook-level ``x_test`` and ``y_test``. The matrix is printed
    and drawn as an annotated heatmap; nothing is returned.

    Args:
        model: A fitted estimator exposing ``predict``.
    """
    predicted = model.predict(x_test)
    matrix = confusion_matrix(y_test, predicted)
    print(matrix)

    # Integer-formatted cell annotations on a BuPu colour map
    sns.heatmap(matrix, annot=True, fmt='d', cmap='BuPu')
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Confusion Matrix')

In [None]:
## output
import sklearn.metrics as metrics

# for logistic regression
# accuracy_score expects (y_true, y_pred), like classification_report below
accuracy = metrics.accuracy_score(y_test, LR_y_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

print(metrics.classification_report(y_test,LR_y_pred))

# cmatrix prints and plots the matrix itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" to the output.
cmatrix(LR_model)

In [None]:
# for naive bayes

# accuracy_score expects (y_true, y_pred), like classification_report below
accuracy = metrics.accuracy_score(y_test, NB_y_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

print(metrics.classification_report(y_test, NB_y_pred))

# cmatrix prints and plots the matrix itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" to the output.
cmatrix(NB_model)

In [None]:
# for knn
# accuracy_score expects (y_true, y_pred), like classification_report below
accuracy = metrics.accuracy_score(y_test, knn_y_pred)

print("Accuracy: %.2f%%" % (accuracy*100))

print(metrics.classification_report(y_test,knn_y_pred))

# cmatrix prints and plots the matrix itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" to the output.
cmatrix(knn_model)

In [None]:
# for XGBoost
# accuracy_score expects (y_true, y_pred), like classification_report below.
# The accuracy is reported once, as a percentage.
accuracy = metrics.accuracy_score(y_test, XGB_y_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

print(metrics.classification_report(y_test, XGB_y_pred))

# cmatrix prints and plots the matrix itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" to the output.
cmatrix(XGB_model)

### Chosen Model

In [None]:
## XGBoost
# Re-trains the XGBoost classifier on the current train split and pickles it.
from xgboost import XGBClassifier


# Define the seed
seed = 42

XGB_model = XGBClassifier(random_state = seed) #use_label_encoder = False
XGB_model.fit(x_train,y_train)

# Save the model and the seed
# Only the classifier and the seed go into the pickle; the fitted vectorizer
# (`cv`) and label encoder (`encoder`) are not stored with it.
# NOTE(review): presumably anyone loading this file elsewhere needs the same
# fitted `cv` / `encoder` to build inputs and decode outputs - confirm.
model_filename = "name_classification_xgb.pkl"
model_data = {
    "model": XGB_model,
    "seed": seed
}

with open(model_filename, "wb") as file:
    pickle.dump(model_data, file)


# Predictions on the held-out split of the current train/test split
XGB_y_pred = XGB_model.predict(x_test)

In [None]:
## preview prediction output

# np.asarray(...).tolist() keeps this cell re-runnable (a list has no .tolist())
XGB_y_pred = np.asarray(XGB_y_pred).tolist()

# train_test_split shuffles the rows, so each prediction must be paired with
# the name of its own test row rather than with the first len(x_test) rows of
# dt. This frame is later reused as training data, so a mismatch here would
# attach labels to the wrong names. Splitting dt's index with the same
# test_size / random_state as the feature split recovers the test rows.
_train_index, test_index = train_test_split(dt.index, test_size=0.33, random_state=42)
assert len(test_index) == len(XGB_y_pred)

# The raw 'Name' is stored under 'Cleaned Name', as in the original cell
dt_predictions = pd.DataFrame({
    'Cleaned Name': dt.loc[test_index, 'Name'].to_numpy(),
    'Predicted_Gender': encoder.inverse_transform(XGB_y_pred),
})
dt_predictions.head()

In [None]:
dt_predictions.shape

### Run predictions on cust_prof_gend data --validation of data w Gender

In [None]:
# import os

# pydomo_client_id = os.getenv('PYDOMO_CLIENTID')
# pydomo_secret = os.getenv('PYDOMO_SECRET')



In [None]:
## create domo python connection
from pydomo import Domo
from dotenv import dotenv_values


env_values = dotenv_values('.env')

domo = Domo(env_values['PYDOMO_CLIENTID'], env_values['PYDOMO_SECRET'])

In [None]:
# Download a data set from Domo
cust_prof_gender = domo.ds_get('9e6aecb0-9669-4787-894d-43c496b4c928')
cust_prof_gender.head()

In [None]:
## drop unnecessary columns
cust_prof_gender = cust_prof_gender.drop(['num_canisters','is_club_customer'],axis=1)

In [None]:
# Map values from 1 to "female" and 2 to "male" in the 'Gender' column
cust_prof_gender['gender'] = cust_prof_gender['gender'].map({1: 'Female', 2: 'Male'})
cust_prof_gender['gender'].value_counts(normalize=True)

In [None]:
## plot for male and female names in the loaded data

# Calculate the proportions for each category
total = len(cust_prof_gender)
counts = cust_prof_gender['gender'].value_counts()
proportions = counts / total

# Fix the bar order explicitly: value_counts is ordered by frequency, which is
# not necessarily the order in which the bars are drawn.
gender_order = sorted(counts.index)

# Create the countplot
ax = sns.countplot(x='gender', data=cust_prof_gender, order=gender_order)

# Add the proportions as annotations, each bar paired with its own category
for category, p in zip(gender_order, ax.patches):
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    label = f"{proportions.loc[category]:.2%}"
    ax.annotate(label, (x, y), ha='center', va='bottom')

# Set the title and label the ticks with the categories actually plotted
plt.title('No of male and female names in the dataset')
plt.xticks(ticks=range(len(gender_order)), labels=gender_order)

# Display the plot
plt.show()

In [None]:
  ## proportion of one-word names
one_word_names2 = round(count_one_word_names(cust_prof_gender['name'])/len(cust_prof_gender)*100,2)

print(f"The proportion of one-word-names is: {one_word_names2}%")

In [None]:
## update dataset to predict
# df = pd.read_csv("../../Njambanene/Tasks/Name_Identification/Fuel Customers_test.csv")
df = cust_prof_gender
## rename col
df = df.rename(columns= {'name': 'Name','gender':'Gender'})
df.head()

In [None]:
## check shape
df.shape


In [None]:
# Apply the clean function to the 'Name' column
df['Cleaned Name'] = df['Name'].apply(clean_name)#apply(convert_float_and_remove_numbers).apply(truncate_drop_and_clean)
df.head()

In [None]:
# df = add_name_length_column(df['Cleaned Name']).sort_values(by ='Name Length' ,ascending=False)
# df.head(30)

In [None]:
## preprocessing transforms
df_test = list(df['Cleaned Name'])

In [None]:
## vectorization
# Reuse the vocabulary learned on the training names. Refitting here
# (fit_transform) would rebuild the character vocabulary from the new data, so
# the feature columns would no longer line up with the ones the model was
# trained on.
df_test = cv.transform(df_test).toarray()

In [None]:
## truncate to train data's number of attributes
# num_attributes = X.shape[1]

# truncated_array = cv.fit_transform(df_test).toarray()[:, :num_attributes]


In [None]:
## check shape of training vs test data
# df_test = truncated_array
print("Shape of df_test:", df_test.shape)
# print("Shape of truncated_array:", truncated_array.shape)

print("Shape of df_train:", X.shape)

In [None]:
## load and run saved model
### Run the model on new data
# NOTE(review): the `xgb` alias is not used anywhere in this cell
import xgboost as xgb

# Load the saved model and seed
# The file name is the one written by the earlier training cell. pickle.load
# executes code embedded in the file, so only load pickles from a trusted source.
model_filename = "name_classification_xgb.pkl"

with open(model_filename, "rb") as file:
    model_data = pickle.load(file)

# The pickle holds a dict with the classifier and the seed it was built with
XGB_model = model_data["model"]
seed = model_data["seed"]


# df_test is the vectorized 'Cleaned Name' array built in the cells above
predictions = XGB_model.predict(df_test)


In [None]:
## preview prediction output

predictions = predictions.tolist()
cust_prof_gender_predictions = pd.DataFrame({'customer_id': df['customer_id'],'Cleaned Name': df.loc[range(len(df_test)), 'Cleaned Name'], 'Predicted_Gender': encoder.inverse_transform(predictions)})
cust_prof_gender_predictions.head()

In [None]:
cust_prof_gender_predictions.shape


In [None]:
## join prediction with original dataset for validations
df2 = df.merge(cust_prof_gender_predictions,left_index=True,right_index=True)
df2.head()

### Compare Predictions with Gender Data Already Captured

In [None]:
## filter accurate matches
matched_mask = df2['Gender'] == df2['Predicted_Gender']
matched_rows = df2[matched_mask]

matched_rows.shape

In [None]:
## proportion of matches to customer data with Gender
match_percentage = round(len(matched_rows)/len(cust_prof_gender)*100,2)

print(f"The match percentage is: {match_percentage}%")

In [None]:
matched_rows.head()

In [None]:
matched_rows = matched_rows.drop(columns=['Name','Gender','Cleaned Name_x','customer_id_x'], axis=1)
matched_rows = matched_rows.rename(columns={'Cleaned Name_y':'Cleaned Name','customer_id_y':'customer_id' })#, 'Predicted_Gender':'Gender'
matched_rows.head()

In [None]:
matched_rows.shape

In [None]:
## duplicate data for storage
df_predictions = matched_rows



In [None]:
### Append Predicted-Matched data with former Test data
# Concatenate datasets based on matching columns
merged_train_dt = pd.concat([dt_predictions, df_predictions], axis=0, ignore_index=True)
merged_train_dt.shape

In [None]:
merged_train_dt.head()

In [None]:
  ## proportion of one-word names
one_word_names2a = round(count_one_word_names(merged_train_dt['Cleaned Name'])/len(merged_train_dt)*100,2)

print(f"The proportion of one-word-names is: {one_word_names2a}%")

In [None]:
merged_train_dt['Predicted_Gender'].value_counts()

In [None]:
total = len(merged_train_dt)
counts = merged_train_dt['Predicted_Gender'].value_counts()
proportions = counts / total
proportions

In [None]:
## plot for male and female names in the merged training data

# Calculate the counts and proportions
counts = merged_train_dt['Predicted_Gender'].value_counts()
proportions = counts / len(merged_train_dt)

# Sort the counts and proportions in descending order
sorted_counts = counts.sort_values(ascending=False)
sorted_proportions = proportions.loc[sorted_counts.index]
gender_order = list(sorted_counts.index)

# Create the countplot with sorted data
ax = sns.countplot(x='Predicted_Gender', data=merged_train_dt, order=gender_order)

# Add the proportions as annotations. Bars and proportions share the same
# order, so they are paired directly instead of indexing the label-indexed
# Series with an integer position.
for proportion, p in zip(sorted_proportions, ax.patches):
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    label = f"{proportion:.2%}"
    ax.annotate(label, (x, y), ha='center', va='bottom')

# Set the title; label the ticks with the categories in the plotted order
# (hard-coded labels would be wrong whenever the larger class is not female)
plt.title('No of male and female names in the dataset')
plt.xticks(ticks=range(len(gender_order)), labels=[str(c) for c in gender_order])

# Display the plot
plt.show()



### Train another model on merged data

In [None]:
## run clean function on names
merged_train_dt['Cleaned Name'] = merged_train_dt['Cleaned Name'].apply(clean_name)#apply(convert_float_and_remove_numbers).apply(truncate_drop_and_clean)
merged_train_dt.head()

In [None]:
## build model
X = list(merged_train_dt['Cleaned Name'])
Y = list(merged_train_dt['Predicted_Gender'])


In [None]:
## encode the labels
from sklearn.preprocessing import LabelEncoder



encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

In [None]:
## count vectorization
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer = 'char')
X = cv.fit_transform(X).toarray()

In [None]:
## split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.33, random_state= 43)

In [None]:
### train another model on merged data
# Define the seed
seed = 43

XGB_model2 = XGBClassifier(random_state = seed) #use_label_encoder = False
XGB_model2.fit(x_train,y_train)

# Save the model and the seed
model_filename = "name_classification_xgb2.pkl"
model_data = {
    "model": XGB_model2,
    "seed": seed
}

with open(model_filename, "wb") as file:
    pickle.dump(model_data, file)


XGB_y_pred2 = XGB_model2.predict(x_test)

In [None]:
# for XGBoost
# print(metrics.accuracy_score(XGB_y_pred2,y_test))

# accuracy_score expects (y_true, y_pred), like classification_report below
accuracy = metrics.accuracy_score(y_test, XGB_y_pred2)
print("Accuracy: %.2f%%" % (accuracy*100))

print(metrics.classification_report(y_test, XGB_y_pred2))

# cmatrix prints and plots the matrix itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" to the output.
cmatrix(XGB_model2)

### Running Prediction on Entire Customer Profile Data

In [None]:
# Download  entire cust dataset from Domo
cust_prof = domo.ds_get('6ab44284-0840-4af4-a316-09e993e0065a')
cust_prof.head()

In [None]:
## check shape
cust_prof.shape

In [None]:
# Map values from 1 to "female" and 2 to "male" in the 'Gender' column
cust_prof['gender'] = cust_prof['gender'].map({1: 'Female', 2: 'Male'})

In [None]:
## select key cols

cols_of_interest = ['customer_id','name']#,'gender'
cust_prof = cust_prof[cols_of_interest]
 

In [None]:

## filter data already in train data
cust_prof = cust_prof.reset_index(drop=True)
matched_rows = matched_rows.reset_index(drop=True)

# Keep only the customers whose id does not appear among the matched rows
already_matched = cust_prof['customer_id'].isin(matched_rows['customer_id'])
df4 = cust_prof[~already_matched]
## check shape after
df4.shape

In [None]:
## drop nas
df4 = df4.dropna(how = 'all')
## rename cols
df4 = df4.rename(columns= {'name': 'Name'})#,'gender':'Gender'
df4.head()

In [None]:
  ## proportion of one-word names
one_word_names3 = round(count_one_word_names(df4['Name'])/len(df4)*100,2)

print(f"The proportion of one-word-names is: {one_word_names3}%")

### Pre-Processing Final Prediction Data

In [None]:
def add_name_length_column(names):
  """Build a DataFrame pairing every name with its number of characters.

  Args:
    names: A sequence of names; every entry must support ``len``.

  Returns:
    A DataFrame with a 'Name' column holding the names and a 'Name Length'
    column holding the character count of each name.
  """
  # Measure first, then attach the lengths to the frame in one step
  lengths = [len(entry) for entry in names]
  return pd.DataFrame({'Name': names}).assign(**{'Name Length': lengths})

In [None]:
## check the name with most characters

# df4 = add_name_length_column(df4['Name']).sort_values(by ='Name Length' ,ascending=False)
# df4.head()

In [None]:

# Apply the truncate_names function to the 'Name' column
df4['Cleaned Name'] = df4['Name'].apply(clean_name)
df4.head()



In [None]:
## preprocessing transforms
dm_test = list(df4['Cleaned Name'])

In [None]:
# Replace NaN values with empty strings
# df_test2 = ['' if pd.isna(value) else value for value in df_test2]

## truncate to train data's number of attributes
# num_attributes = X.shape[1]

# truncated_array = cv.fit_transform(df_test2).toarray()[:, :num_attributes]

In [None]:
## vectorization

# Reuse the vocabulary learned on the training names. Refitting here
# (fit_transform) would rebuild the character vocabulary from the new data, so
# the feature columns would no longer line up with the ones the model was
# trained on.
dm_test = cv.transform(dm_test).toarray()

In [None]:
## check shape of training vs test data
# df_test2 = truncated_array
print("Shape of dm_test:", dm_test.shape)
# print("Shape of truncated_array:", truncated_array.shape)

print("Shape of df_train:", X.shape)

In [None]:

## load and run saved model
### Run the model on new data


# Load the saved model and seed
model_filename = "name_classification_xgb2.pkl"

with open(model_filename, "rb") as file:
    model_data = pickle.load(file)

XGB_model2 = model_data["model"]
seed = model_data["seed"]


predictions2 = XGB_model2.predict(dm_test)

In [None]:
predictions2 = predictions2.tolist()
# customer_f_dt = customer_f_dt.reset_index(drop=True)

cust_profile_predictions = pd.DataFrame({'customer_id': df4['customer_id'],'Cleaned Name': df4['Cleaned Name'],'Predicted_Gender': encoder.inverse_transform(predictions2)
})
cust_profile_predictions.head()


In [None]:
cust_profile_predictions.shape

### Append Predictions to Train data

In [None]:
cust_prof_gender_predictions.head()

In [None]:
## append matched rows with dt
all_cust_prof_data = pd.concat([cust_prof_gender_predictions,cust_profile_predictions],axis=0, ignore_index=True)
all_cust_prof_data.shape

In [None]:
all_cust_prof_data.head()

In [None]:
## Drop Duplicated records
all_cust_prof_data.drop_duplicates(subset='customer_id', inplace=True)
all_cust_prof_data.shape

### Visualize Gender Proportions

In [None]:
  ## proportion of one-word names
one_word_names4 = round(count_one_word_names(all_cust_prof_data['Cleaned Name'])/len(all_cust_prof_data)*100,2)

print(f"The proportion of one-word-names is: {one_word_names4}%")

In [None]:
# %%
all_cust_prof_data['Predicted_Gender'].value_counts()

In [None]:
## plot for male and female names in the data

# Calculate the proportions for each category
total = len(all_cust_prof_data)
counts = all_cust_prof_data['Predicted_Gender'].value_counts()
proportions = counts / total

# Sort the counts and proportions in descending order
sorted_counts = counts.sort_values(ascending=False)
sorted_proportions = proportions.loc[sorted_counts.index]
gender_order = list(sorted_counts.index)

# Create the countplot in the same (descending) order as the proportions, so
# that every annotation lands on the bar of its own category
ax = sns.countplot(x='Predicted_Gender', data=all_cust_prof_data, order=gender_order)

# Add the proportions as annotations
for proportion, p in zip(sorted_proportions, ax.patches):
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    label = f"{proportion:.2%}"
    ax.annotate(label, (x, y), ha='center', va='bottom')

# Set the title; label the ticks with the categories in the plotted order
# (hard-coded labels would be wrong whenever the larger class is not female)
plt.title('No of male and female names in the dataset')
plt.xticks(ticks=range(len(gender_order)), labels=[str(c) for c in gender_order])

# Display the plot
plt.show()


In [None]:
## output predicted genders
all_cust_prof_data.to_csv('../../Njambanene/Tasks/Name_Identification/KOKO Users -Name based Gender Classification.csv',index=False)

In [None]:
# Create a new data set in Domo with the result, the return value is the data set id of the new data set.
# all_cust_prof_data = domo.ds_create(all_cust_prof_data,'Customer Gender Dataset','Python')
## previously created dataset
all_cust_prof_data_prev = domo.ds_get('378b0d4e-1f77-4e7d-b4f0-cb62d974e550')
all_cust_prof_data_prev.shape


In [None]:
## update the previously created dataset
# The result of ds_create replaces the predictions DataFrame held in
# `all_cust_prof_data`; from this line on the name no longer refers to the frame.
# NOTE(review): a new Domo dataset appears to be created on every run - confirm
# that is intended.
all_cust_prof_data = domo.ds_create(all_cust_prof_data,'Customer Gender Dataset_Update','Python')
# NOTE(review): ds_update receives the value returned by ds_create plus the
# previously downloaded frame (`all_cust_prof_data_prev`). Presumably this
# writes the OLD data into the NEW dataset instead of writing the new
# predictions into the existing dataset - verify against the pydomo
# ds_update documentation before relying on it.
all_cust_prof_data_update = domo.ds_update(all_cust_prof_data,all_cust_prof_data_prev)


In [None]:
all_cust_prof_data = pd.read_csv('../../Njambanene/Tasks/Name_Identification/KOKO Users -Name based Gender Classification.csv')
## Check for additional rows
current_rows = all_cust_prof_data.shape[0]
previous_rows = all_cust_prof_data_prev.shape[0]



In [None]:
# Compare the number of rows
if current_rows > previous_rows:
    additional_rows = current_rows - previous_rows
    print(f"There are {additional_rows} additional rows in the current dataset.")
else:
    print("No additional rows have been added to the dataset.")

In [None]:
# End the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the elapsed time
print("Elapsed time:", elapsed_time, "minutes")

## Note:
Although we were able to predict the gender of the majority of the names with ~96% accuracy, below are some caveats that influenced the accuracy of the model:
1. One-Word Names: There was a significant number of one-word names in both the training set (~63%) and the test data (~50%), which reduces the ability to predict gender accurately.
2. Data Bias and Representation: There is a potential bias in the training data used to develop the model, as female customers made up ~60% of the data. As such, the model's predictions may reflect the biases present in the data, such as underrepresentation or overrepresentation.
3. Name complexity: Due to the limitation of the model processing only three names for a customer (at most), there is a possibility of incorrect predictions if two of the names could potentially belong to both male and female genders individually.
4. Third Gender Representation: It's important to note that the model does not consider gender-fluid or non-binary individuals. Instead, it primarily categorizes names into male or female genders and predicts the gender that is most commonly associated with a given name. This means that the model may not accurately represent or predict the gender for individuals who identify as gender-fluid or non-binary.
5. Name Ambiguity and Variability: Some names contained numbers, names of places (e.g. shops), special characters, as well as aliases, all of which were dropped during name clean-up. Moreover, some names can be gender-neutral or may have varying associations across different cultures, hence the model may be inaccurate in such instances. Names that are used for both males and females can introduce ambiguity in the model's predictions.
 

In [None]:
## create a function based on ner (pre-trained named entity recognition (NER) model) for name-checking 
# import spacy

# def is_name(word):
#     # Load the pre-trained English NER model from spaCy
#     nlp = spacy.load('en_core_web_sm')
    
#     # Process the word with the NER model
#     doc = nlp(word)
    
#     # Check if any entity in the word is classified as a person (PERSON)
#     return any(ent.label_ == 'PERSON' for ent in doc.ents)

