<a href="https://colab.research.google.com/github/TheQuanEnthusiast/MachineLearning/blob/main/Gender_Classification_With_Names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preparing

In [1]:
import numpy as np
import pandas as pd

In [6]:
d1 = pd.read_csv('/content/female.csv', header=None)

In [7]:
d2 = pd.read_csv('/content/male.csv', header=None)

In [8]:
d1.head()

Unnamed: 0,0
0,Abagael
1,Abagail
2,Abbe
3,Abbey
4,Abbi


In [9]:
d2.head()

Unnamed: 0,0
0,Aamir
1,Aaron
2,Abbey
3,Abbie
4,Abbot


In [11]:
#Let's assign column name 'name' to our DataFrame
d1.columns = ['name']
d2.columns = ['name']

In [12]:
d1.head()

Unnamed: 0,name
0,Abagael
1,Abagail
2,Abbe
3,Abbey
4,Abbi


In [13]:
d2.head()

Unnamed: 0,name
0,Aamir
1,Aaron
2,Abbey
3,Abbie
4,Abbot


In [14]:
d1.shape

(5001, 1)

In [15]:
d2.shape

(2943, 1)

In [16]:
#Here we will need second column 'sex' to classify our names whether they are male or female.
#Let's assign 0 to female and 1 to male. We can do create this with np.zeros and np.ones method
n1 = np.zeros(shape=(5001, 1)).astype(int)
n2 = np.ones(shape=(2943, 1)).astype(int)

In [23]:
#As our n1 and n2 are numpy arrays we must convert them into pandas DataFrame
n1_converted = pd.DataFrame(n1, columns = ['sex'])
n2_converted = pd.DataFrame(n2, columns=['sex'])

In [47]:
data1 = np.vstack((d1, d2))

In [49]:
data2 = np.vstack((n1_converted, n2_converted))

In [52]:
x = np.hstack((data1, data2))

In [53]:
x1 = pd.DataFrame(x, columns = ['name', 'sex'])

In [64]:
df = x1.sample(frac=1)

In [65]:
df

Unnamed: 0,name,sex
3376,Marni,0
4112,Rhodia,0
1069,Constantia,0
3737,Nerti,0
6189,Hirsch,1
...,...,...
3831,Odetta,0
5558,Damian,1
6784,Muhammad,1
4114,Rhody,0


In [67]:
#Importing ML algorithms
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [68]:
df.head()

Unnamed: 0,name,sex
3376,Marni,0
4112,Rhodia,0
1069,Constantia,0
3737,Nerti,0
6189,Hirsch,1


In [140]:
df['sex'] = pd.to_numeric(df['sex'])

In [141]:
df.size

15888

In [142]:
df.shape

(7944, 2)

In [143]:
df.columns

Index(['name', 'sex'], dtype='object')

In [144]:
df.dtypes

name    object
sex      int64
dtype: object

In [145]:
#Checking for null values
df.isnull().sum()

name    0
sex     0
dtype: int64

In [146]:
X_features = df['name']

In [147]:
cv = CountVectorizer()
X = cv.fit_transform(X_features)

In [148]:
cv.get_feature_names()



['aamir',
 'aaron',
 'abagael',
 'abagail',
 'abbe',
 'abbey',
 'abbi',
 'abbie',
 'abbot',
 'abbott',
 'abby',
 'abdel',
 'abdul',
 'abdulkarim',
 'abdullah',
 'abe',
 'abel',
 'abelard',
 'abigael',
 'abigail',
 'abigale',
 'abner',
 'abra',
 'abraham',
 'abram',
 'acacia',
 'ace',
 'ada',
 'adah',
 'adair',
 'adaline',
 'adam',
 'adams',
 'adara',
 'addie',
 'addis',
 'adel',
 'adela',
 'adelaide',
 'adele',
 'adelice',
 'adelina',
 'adelind',
 'adeline',
 'adella',
 'adelle',
 'adena',
 'adey',
 'adger',
 'adi',
 'adiana',
 'adina',
 'aditya',
 'adlai',
 'adnan',
 'adolf',
 'adolfo',
 'adolph',
 'adolphe',
 'adolpho',
 'adolphus',
 'adora',
 'adore',
 'adoree',
 'adorne',
 'adrea',
 'adria',
 'adriaens',
 'adrian',
 'adriana',
 'adriane',
 'adrianna',
 'adrianne',
 'adrick',
 'adrien',
 'adriena',
 'adrienne',
 'aeriel',
 'aeriela',
 'aeriell',
 'ag',
 'agace',
 'agamemnon',
 'agata',
 'agatha',
 'agathe',
 'aggi',
 'aggie',
 'aggy',
 'agna',
 'agnella',
 'agnes',
 'agnese',
 'agne

In [149]:
from sklearn.model_selection import train_test_split

In [150]:
X  #features
y = df['sex'] #Labels

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [152]:
#Naive Bayes Classification
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

0.5939597315436241

In [158]:
#The accuracy of NB model is about 59.4%. It's not a good result. Need to do some tuning and choose better model.
print('The accuracy of model: %.3f ' %classifier.score(X_test, y_test))



The accuracy of model: 0.594 


In [159]:
#Let's write a function to predict the gender
def predictgender(x):
  nametesting = [x]
  vector = cv.transform(nametesting).toarray()
  if classifier.predict(vector) == 0:
    print('female')
  else:
    print('male')

In [160]:
predictgender('Marni')

female


In [162]:
predictgender('Rhodia')

female


In [169]:
predictgender('Hirsch')

female


In [170]:
predictgender('Damian')

male


In [171]:
predictgender('Odetta')

female


In [173]:
predictgender('Muhammad')

male


In [174]:
#Let's do some feature analysis. As we see from dataset most of the female names ends in A or E or they have A sound in their pronunciation
def features(name):
  name = name.lower()
  return {
      'first_letter': name[0],
      'first2letters': name[0:2],
      'first3letters': name[0:3],
      'lastletter': name[-1],
      'last2letters': name[-2:],
      'last3letters': name[-3:]
  }

In [175]:
#Vectorize features
features = np.vectorize(features)

In [176]:
#Determine features and labels
df_X = features(df['name'])
df_y = df['sex']

In [177]:
from sklearn.feature_extraction import DictVectorizer


In [178]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=1)

In [179]:
df_X_train

array([{'first_letter': 'a', 'first2letters': 'al', 'first3letters': 'ale', 'lastletter': 'i', 'last2letters': 'xi', 'last3letters': 'exi'},
       {'first_letter': 's', 'first2letters': 'st', 'first3letters': 'ste', 'lastletter': 'd', 'last2letters': 'rd', 'last3letters': 'ard'},
       {'first_letter': 'r', 'first2letters': 'ro', 'first3letters': 'rox', 'lastletter': 'i', 'last2letters': 'xi', 'last3letters': 'oxi'},
       ...,
       {'first_letter': 'l', 'first2letters': 'lu', 'first3letters': 'lud', 'lastletter': 'g', 'last2letters': 'ig', 'last3letters': 'wig'},
       {'first_letter': 'h', 'first2letters': 'ha', 'first3letters': 'han', 'lastletter': 'l', 'last2letters': 'el', 'last3letters': 'sel'},
       {'first_letter': 'b', 'first2letters': 'be', 'first3letters': 'ben', 'lastletter': 'i', 'last2letters': 'ni', 'last3letters': 'nni'}],
      dtype=object)

In [180]:
dictvectorizer = DictVectorizer()
dictvectorizer.fit_transform(df_X_train)

<5560x2923 sparse matrix of type '<class 'numpy.float64'>'
	with 33360 stored elements in Compressed Sparse Row format>

In [182]:
#Now let's build a ml model using DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dtcl = DecisionTreeClassifier()
new_features = dictvectorizer.transform(df_X_train)
dtcl.fit(new_features, df_y_train)

DecisionTreeClassifier()

In [183]:
name = ['Hirsch']
transformed_dictvectorizer = dictvectorizer.transform(features(name))

In [184]:
vector = transformed_dictvectorizer.toarray()

In [185]:
dtcl.predict(vector)

array([1])

In [186]:
#As we see, in our first NB classifier Hirsch has been classified as female, but in DT model it has been classified correctly(as a male)


In [187]:
if dtcl.predict(vector) == 0:
  print('female')
else:
  print('male')

male


In [188]:
#Now let's print accuracy of our model
print(dtcl.score(dictvectorizer.transform(df_X_train), df_y_train))

0.9652877697841726


In [189]:
print(dtcl.score(dictvectorizer.transform(df_X_test), df_y_test))

0.7885906040268457


In [190]:
#As we see DecisionTree classifier better on our dataset than NaiveBayes classifier