In [8]:
import pandas as pd

In [9]:
# Load the names dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='NationalNames.csv')

In [10]:
# Drop the columns the rest of the notebook does not use.
# A list (ordered) replaces the original set literal, and the result is
# re-assigned instead of mutating in place. errors='ignore' keeps the cell
# re-runnable: on a second execution the columns are already gone and
# drop() would otherwise raise KeyError.
df = df.drop(columns=['Id', 'Year'], errors='ignore')

In [11]:
# Total count per (Name, Gender) pair; the keys stay ordinary columns, not an index
data = df.groupby(['Name', 'Gender'], as_index=False)['Count'].sum()
data.head()

Unnamed: 0,Name,Gender,Count
0,Aaban,M,72
1,Aabha,F,21
2,Aabid,M,5
3,Aabriella,F,10
4,Aadam,M,196


In [12]:
# Reshape to one row per name with one column per gender ('F', 'M') holding the count.
# DataFrame.pivot has to be called with keyword arguments: passing
# index/columns/values positionally was deprecated in pandas 1.x and raises
# TypeError in pandas >= 2.0.
new_data = (
    data.pivot(index='Name', columns='Gender', values='Count')
    .reset_index()
    .rename_axis(None, axis=1)  # drop the leftover 'Gender' label on the column axis
)
new_data.head()

Unnamed: 0,Name,F,M
0,Aaban,,72.0
1,Aabha,21.0,
2,Aabid,,5.0
3,Aabriella,10.0,
4,Aadam,,196.0


In [13]:
# A name never recorded for one gender comes out of the pivot as NaN; treat it as a count of 0
new_data = new_data.fillna(0)
# Normalise the names to lower case
new_data['Name'] = new_data['Name'].str.lower()
# Preview last: a bare expression is only rendered when it ends the cell, so the
# original mid-cell head() call was silently discarded.
new_data.head()

In [14]:
# Label each name by majority usage: 'Male' when the male count exceeds the
# female count, otherwise 'Female' (so an exact tie is labelled 'Female').
new_data['Diff'] = new_data['M'].sub(new_data['F'])
new_data['Gender'] = new_data['Diff'].gt(0).map({True: 'Male', False: 'Female'})

In [15]:
# Remove the per-gender counts and the helper difference column in place
new_data.drop(['F', 'M', 'Diff'], axis='columns', inplace=True)

In [16]:
# Features are the name strings; the target is the derived gender label
X, y = new_data['Name'], new_data['Gender']

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the names for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [18]:
# Count-based text encoder configured for character 2-grams
# ('char_wb' analyzer, n-gram range fixed at 2..2)
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='char_wb',
)

In [19]:
# Learn the n-gram vocabulary from the training names only, then encode both
# splits with that same vocabulary.
# NOTE: X_train / X_test are rebound here from name strings to count matrices,
# so the train/test split cell should be re-run before executing this cell again.
X_train = vector.fit_transform(raw_documents=X_train)
X_test = vector.transform(raw_documents=X_test)

# Multinomial Naive Bayes classifier with default settings
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Train on the vectorised training names, then predict labels for the held-out names
y_pred = model.fit(X_train, y_train).predict(X_test)

# Accuracy on the test split, printed as a percentage
print('Accuracy : ' ,accuracy_score(y_test,y_pred)*100)

# Confusion matrix of true labels (y_test) against predicted labels (y_pred)
cm = confusion_matrix(y_test,y_pred)
print(cm)

Accuracy :  82.43334398409486
[[15580  2316]
 [ 2632  7639]]


## Testing the model

In [291]:
# Classify one unseen name: encode it with the fitted vectorizer, then predict
new_text = pd.Series(['alex'])
new_text_transform = vector.transform(raw_documents=new_text)
print(" The name is mostly " ,model.predict(new_text_transform))

 The name is mostly  ['Male']


In [292]:
# Classify another unseen name with the same fitted vectorizer and model
new_text = pd.Series(['lasya'])
new_text_transform = vector.transform(raw_documents=new_text)
print(" The name is mostly " ,model.predict(new_text_transform))

 The name is mostly  ['Female']
