In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from pandas import read_csv
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# data loading 
filename='names_dataset.csv'
data=read_csv(filename)
data

Unnamed: 0,name,sex
0,Mary,F
1,Anna,F
2,Emma,F
3,Elizabeth,F
4,Minnie,F
...,...,...
125078,shivani,f
125079,nayna,f
125080,ujwal,M
125081,Prajwal,M


In [3]:
# handling missing values 
data.shape
# data.isnull().sum()
# as no null values so no need to drop any row or handle anything

(125083, 2)

In [4]:
# Data Coversion so that we get data in integer type
data_names=data
data.sex.replace({'F':0,'M':1},inplace=True)
data.sex.replace({'f':0,'m':1},inplace=True)
data.tail()
data_names # changes are reflected in both data_names and data
data

Unnamed: 0,name,sex
0,Mary,0
1,Anna,0
2,Emma,0
3,Elizabeth,0
4,Minnie,0
...,...,...
125078,shivani,0
125079,nayna,0
125080,ujwal,1
125081,Prajwal,1


In [5]:
data_names.sex.unique()

array([0, 1], dtype=int64)

In [6]:
data.loc[33]

name    Lillie
sex          0
Name: 33, dtype: object

In [7]:
# Feature Selection as deciding x and y 
data_x=data_names.name
data_y=data_names.sex
data_x
data_y


0         0
1         0
2         0
3         0
4         0
         ..
125078    0
125079    0
125080    1
125081    1
125082    0
Name: sex, Length: 125083, dtype: int64

In [8]:
# feature Extraction  
# MOst Important work done by CountVectorizer (NLP)
corpus=data_x
vectorizer=CountVectorizer()
X=vectorizer.fit_transform(corpus)

In [9]:
# Train Test split
X_train,X_test,Y_train,Y_test=train_test_split(X,data_y,test_size=0.3,random_state=45)# we are passing transformed x by countVectorizer 

In [10]:
# fit the model
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB () # model call
clf.fit(X_train,Y_train) 
clf.score(X_test,Y_test)

0.6985742838107928

In [11]:
# checking accuracy 
# Cross Validation

from sklearn.model_selection import cross_val_score
v=cross_val_score(clf,X_train,Y_train,cv=10) # here we are passing our model and traing data to perform cross validation
for i in range(10):
    print ("Accuracy of naive bayes is :{0:.2%}".format(v[i,]))
print("")
print("mean accuracy of naive bayes is ",v.mean())


Accuracy of naive bayes is :69.42%
Accuracy of naive bayes is :69.61%
Accuracy of naive bayes is :69.45%
Accuracy of naive bayes is :69.78%
Accuracy of naive bayes is :69.95%
Accuracy of naive bayes is :70.00%
Accuracy of naive bayes is :69.69%
Accuracy of naive bayes is :69.51%
Accuracy of naive bayes is :70.30%
Accuracy of naive bayes is :69.89%

mean accuracy of naive bayes is  0.6975948143187252


In [12]:
# accuracy by using different metrics  
""" 
RECALL  =TP/(TP+FN)
PRECISON = TP/(TP+FP)
F1 SCORE 
"""
from sklearn.metrics import classification_report
fit=clf.fit(X_train,Y_train)
predict=fit.predict(X_test) # as we are predicting so need to pass x_test , and predicting on fit data of training 
print(classification_report(Y_test,predict,labels=[1,2,3])) # here we are passing y_test and predicted x_test

              precision    recall  f1-score   support

           1       0.94      0.25      0.39     14752
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0

   micro avg       0.94      0.25      0.39     14752
   macro avg       0.31      0.08      0.13     14752
weighted avg       0.94      0.25      0.39     14752



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# developing a model and dumping
import joblib
import pickle
joblib.dump(clf,'model\gender_model.pkl')
print("Model is Dumped")

Model is Dumped


In [14]:
df = pd.read_csv("names_dataset.csv")
# Features and Labels
df_X = df.name
df_Y = df.sex

# Vectorization
corpus = df_X
cv = CountVectorizer()
X = cv.fit_transform(corpus)


In [15]:
naivebayes_model = open("model/gender_model.pkl", "rb")
clf = joblib.load(naivebayes_model)


In [16]:
data = ["Gauri"]
vect = cv.transform(data).toarray()
print(clf.predict(vect))


[0]
