In [27]:
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import os
import re

In [28]:
male_data = pd.read_csv('/content/Indian-Male-Names.csv')
female_data = pd.read_csv('/content/Indian-Female-Names.csv')


In [29]:
male_data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [30]:
female_data.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [31]:
repl_list = ['s/o','d/o','w/o','/','&',',','-']

def clean_data(name):
	name = str(name).lower()
	name = (''.join(i for i in name if ord(i)<128)).strip()
	for repl in repl_list:
		name = name.replace(repl," ")
	if '@' in name:
		pos = name.find('@')
		name = name[:pos].strip()
	name = name.split(" ")
	name = " ".join([each.strip() for each in name])
	return name

def remove_records(merged_data):
	merged_data['delete'] = 0
	merged_data.loc[merged_data['name'].str.find('with') != -1,'delete'] = 1
	merged_data.loc[merged_data['count_words']>=5,'delete']=1
	merged_data.loc[merged_data['count_words']==0,'delete']=1
	merged_data.loc[merged_data['name'].str.contains(r'\d') == True,'delete']=1
	cleaned_data = merged_data[merged_data.delete==0]
	return cleaned_data

merged_data = pd.concat((male_data,female_data),axis=0)

merged_data['name'] = merged_data['name'].apply(clean_data)
merged_data['count_words'] = merged_data['name'].str.split().apply(len)

cleaned_data = remove_records(merged_data)

indian_cleaned_data = cleaned_data[['name','count_words']].drop_duplicates(subset='name',keep='first')
indian_cleaned_data['label'] = 'indian'

len(indian_cleaned_data)

13754

In [32]:
indian_cleaned_data.head()


Unnamed: 0,name,count_words,label
0,barjraj,1,indian
1,ramdin verma,2,indian
2,sharat chandran,2,indian
3,birender mandal,2,indian
4,amit,1,indian


In [33]:
from faker import Faker
import random
req = 15000
non_indian_names = []

langs = ['ar_EG','bs_BA','de_DE','dk_DK','en_AU','en_CA','en_GB',
'en_IN','en_NZ','en_US','it_IT','no_NO','ro_RO']

for i in range(0,req):
	lng_indx = random.randint(0,len(langs)-1)
	fake = Faker(langs[lng_indx])
	non_indian_names.append(fake.name().lower())

non_indian_names_orig = list(set(non_indian_names))


In [34]:
len(non_indian_names_orig)

14567

In [35]:
non_indian_data = pd.DataFrame({'name':non_indian_names_orig})
non_indian_data['count_words'] = non_indian_data['name'].str.split().apply(len)
non_indian_data.head()

Unnamed: 0,name,count_words
0,veda sachdev,2
1,robert park,2
2,ing. almut thanel b.eng.,4
3,mr. christopher fitzgerald,3
4,charlotte reeves,2


In [36]:
indian_cleaned_data['count_words'].value_counts()


Unnamed: 0_level_0,count
count_words,Unnamed: 1_level_1
2,7954
1,4322
3,1344
4,134


In [37]:
non_indian_data['count_words'].value_counts()

Unnamed: 0_level_0,count
count_words,Unnamed: 1_level_1
2,13322
3,1087
4,154
5,3
6,1


In [38]:
two_word_names = non_indian_data[non_indian_data['count_words']==2]['name']
one_word_req = 5000
names_one_two_words = [each.split()[0] for each in two_word_names[:one_word_req]] + list(two_word_names[one_word_req:])
count_words = [1] * one_word_req + [2] * len(two_word_names[one_word_req:])
not_two_words_pd  = non_indian_data[non_indian_data['count_words']!=2]
one_two_words_pd = pd.DataFrame({'name':names_one_two_words,'count_words':count_words})
non_indian_data = pd.concat((not_two_words_pd,one_two_words_pd),axis=0)
non_indian_data['count_words'].value_counts()
non_indian_data['label'] = 'non_indian'
non_indian_data = non_indian_data[non_indian_data['count_words']<5]
non_indian_data['count_words'].value_counts()

Unnamed: 0_level_0,count
count_words,Unnamed: 1_level_1
2,8322
1,5000
3,1087
4,154


In [39]:
full_data = pd.concat((non_indian_data[['name','label']],indian_cleaned_data[['name','label']]),axis=0)
full_data = full_data.sample(frac=1)

full_data.to_csv("name_data.csv",index=False)

from google.colab import files
files.download('name_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
name_data = pd.read_csv('/content/name_data.csv')

In [43]:
name_data.head()

Unnamed: 0,name,label
0,kishor negi,indian
1,gregory,non_indian
2,vijay bansal,indian
3,lisette textor b.sc.,non_indian
4,manoj kaushik,indian


In [44]:
name_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
non_indian,14563
indian,13754


In [45]:
from sklearn.model_selection import train_test_split
X = name_data['name'].astype(str)
Y = name_data['label']
train_names,test_names,train_labels,test_labels = train_test_split(X,Y,test_size=0.2,random_state =42,stratify=Y)


In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Example data
train_names = pd.Series(['Alice Johnson', 'Bob Smith', 'Charlie Brown'])

# Initialize and fit CountVectorizer
vectorizer = CountVectorizer()
X_ = vectorizer.fit_transform(train_names.values.astype('U'))

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Output the number of features and feature names
print(f"Number of features: {len(feature_names)}")
print(f"Feature names: {feature_names}")


Number of features: 6
Feature names: ['alice' 'bob' 'brown' 'charlie' 'johnson' 'smith']


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Sample data
train_names = pd.Series(['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Alice Cooper', 'Bob Marley'])
labels = pd.Series([1, 0, 0, 1, 0])  # Example binary labels

# Initialize and fit CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_names.values.astype('U'))

# Convert to array to check the number of features
num_features = len(vectorizer.get_feature_names_out())
print(f"Number of features: {num_features}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Initialize and fit the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Number of features: 8
Confusion Matrix:
[[1]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [52]:
check_new_names = ['lalitha','tyson','shailaja','shyamala','vishwanathan','ramanujam','conan','kryslovsky',
'ratnani','diego','kakoli','shreyas','brayden','shanon']

X_new = vectorizer.transform(check_new_names)
predictions_nb_cv = model.predict(X_new)
test = pd.DataFrame({'names':check_new_names,'predictions_nb_cv':predictions_nb_cv})
test

Unnamed: 0,names,predictions_nb_cv
0,lalitha,0
1,tyson,0
2,shailaja,0
3,shyamala,0
4,vishwanathan,0
5,ramanujam,0
6,conan,0
7,kryslovsky,0
8,ratnani,0
9,diego,0


In [53]:
!pip3 install tokenizers
from tokenizers import ByteLevelBPETokenizer,CharBPETokenizer,SentencePieceBPETokenizer,BertWordPieceTokenizer


f = open("train_names.txt","w")
for each in list(train_names):
	f.write(str(each))
	f.write("\n")

f.close()

tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["./train_names.txt"],vocab_size=2000,min_frequency=2)

encoded_tokens = [tokenizer.encode(str(each)).tokens for each in train_names]
encoded_tokens_test = [tokenizer.encode(str(each)).tokens for each in test_names]

encoded_tokens = [" ".join(each)  for each in encoded_tokens]
encoded_tokens_test = [" ".join(each)  for each in encoded_tokens_test]

encoded_tokens[:10]



['▁Alice ▁ J o h n s o n',
 '▁Bob ▁ S m i t h',
 '▁C h ar li e ▁B r o w n',
 '▁Alice ▁C o o p e r',
 '▁Bob ▁ M ar l e y']

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Example data
encoded_tokens = ['sample text data', 'more text data', 'even more data']
train_labels = [1, 0, 1]

encoded_tokens_test = ['test text data', 'more test data']
test_labels = [1, 0]

# Initialize and fit TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer()
X_train = tfidf_vect.fit_transform(encoded_tokens)

# Check the number of features
num_features = len(tfidf_vect.get_feature_names_out())
print(f"Number of features: {num_features}")

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, train_labels)

# Transform test data and make predictions
X_test = tfidf_vect.transform(encoded_tokens_test)
test_predicted = model.predict(X_test)

# Print classification report
print(classification_report(test_labels, test_predicted))


Number of features: 5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
encoded_tokens_check = [tokenizer.encode(str(each).lower()).tokens for each in check_new_names]
encoded_tokens_check = [" ".join(each)  for each in encoded_tokens_check]

X_new = tfidf_vect.transform(encoded_tokens_check)
predictions_nb_enc_tf = model.predict(X_new)
test = pd.DataFrame({'names':check_new_names,'predictions_nb_enc_tf':predictions_nb_enc_tf})
test

Unnamed: 0,names,predictions_nb_enc_tf
0,lalitha,1
1,tyson,1
2,shailaja,1
3,shyamala,1
4,vishwanathan,1
5,ramanujam,1
6,conan,1
7,kryslovsky,1
8,ratnani,1
9,diego,1


In [71]:
import numpy as np
from tokenizers import SentencePieceBPETokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import Callback

# Define parameters
vocab_size = 200
max_len = 20  # Adjust based on your requirements

# Initialize and train the tokenizer
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(files=["./train_names.txt"], vocab_size=vocab_size, min_frequency=2)

# Function to encode text data
def sent_piece_encoded_representation(data, tokenizer, max_len, vocab_size):
    encoded_tokens = [tokenizer.encode(str(each)).ids for each in data]
    sequences = [to_categorical(x, num_classes=vocab_size) for x in encoded_tokens]
    X = sequence.pad_sequences(sequences, maxlen=max_len)
    return X

# Define the model-building function
def build_model(hidden_units, max_len, vocab_size):
    model = Sequential()
    model.add(Input(shape=(max_len, vocab_size)))  # Define the input shape
    model.add(LSTM(hidden_units, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

# Custom Callback (optional)
class MyCallback(Callback):
    def __init__(self, X_test, y_test):
        super(MyCallback, self).__init__()
        self.X_test = X_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        val_loss, val_accuracy = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        print(f"Epoch {epoch + 1}: val_loss = {val_loss:.4f}, val_accuracy = {val_accuracy:.4f}")

# Example data
train_names = ["example name", "another name", "more names"]
test_names = ["new name", "additional name"]
train_labels = ["label1", "label2", "label1"]
test_labels = ["label2", "label1"]

# Compute max_len based on training data
max_len = max([len(tokenizer.encode(str(each)).ids) for each in train_names])

# Initialize LabelEncoder and transform


In [73]:
# Encode new data
X_predict = sent_piece_encoded_representation(check_new_names, tokenizer, max_len, vocab_size)

# Make predictions
predictions_prob = model.predict(X_predict)

# Convert probabilities to binary predictions
predictions = (predictions_prob > 0.5).astype(int).flatten()  # Convert to binary predictions and flatten

# Decode predictions to original labels
predictions_lstm_sent_enc = le.inverse_transform(predictions)

# Create a DataFrame with the results
test = pd.DataFrame({
    'names': check_new_names,
    'predictions_lstm_sent_enc': predictions_lstm_sent_enc
})

print(test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
           names  predictions_lstm_sent_enc
0        lalitha                          1
1          tyson                          1
2       shailaja                          0
3       shyamala                          1
4   vishwanathan                          1
5      ramanujam                          0
6          conan                          1
7     kryslovsky                          0
8        ratnani                          1
9          diego                          0
10        kakoli                          1
11       shreyas                          0
12       brayden                          0
13        shanon                          1
