UnSupervised

In [31]:
#imports
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import random 
import numpy as np


In [2]:
random.seed(42)

In [3]:
#read dataset
data = pd.read_csv('./name_gender_dataset.csv')

In [4]:
#viewing data
data.head()

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567


In [5]:
#get unique values in Gender
data.Gender.unique()

array(['M', 'F'], dtype=object)

In [20]:
#count of each unique values
data.Gender.value_counts()

F    89749
M    57520
Name: Gender, dtype: int64

In [7]:
#check any empty value in dataset
data.isnull().sum()

Name           0
Gender         0
Count          0
Probability    0
dtype: int64

In [25]:
#print names which occur more than 1 times
tmp = data[['Name','Gender']].value_counts()

In [36]:
tmp.shape

(147269,)

In [26]:
tmp.head()

Name     Gender
A        F         1
Nashira  F         1
Nashiem  M         1
Nashika  F         1
Nashim   M         1
dtype: int64

In [35]:
np.where(tmp > 1)

(array([], dtype=int64),)

Data Preprocessing

In [10]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(data['Name'])

In [11]:
# k = 2 because we know there are two y variables
model = KMeans(2)
model.fit(x)

KMeans(n_clusters=2)

In [12]:
#get labels for each dataum
model.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [13]:
#get encoded values
pd.get_dummies(data['Gender'],drop_first=True)

Unnamed: 0,M
0,1
1,1
2,1
3,1
4,1
...,...
147264,1
147265,1
147266,1
147267,1


In [14]:
actual = pd.get_dummies(data['Gender'],drop_first=True)

In [15]:
actual[:10]

Unnamed: 0,M
0,1
1,1
2,1
3,1
4,1
5,0
6,1
7,1
8,1
9,1


In [16]:
pred = model.labels_

In [17]:
pred.shape

(147269,)

In [18]:
actual['M'].shape

(147269,)

In [19]:
res = pred == actual['M']

In [20]:
res.value_counts()

True     88606
False    58663
Name: M, dtype: int64

In [21]:
(res == True).sum()

88606

In [22]:
(res == False).sum()

58663

In [23]:
len(pred)

147269

In [24]:
#check accuracy by manual
(res == True).sum() / len(pred)

0.6016609062328121

In [25]:
#check accuracy inbuilt method
accuracy_score(pred,actual['M'])

0.6016609062328121

In [26]:
#using DBSCAN
# model = Birch(n_clusters=2)
# model.fit(x)

In [27]:
# model.labels_

In [54]:
#Hyper paramete tuning
param = {
    'init':['k-means++','random'],
    'n_init':[20,50,100,150],
    'max_iter':[50,100,150,200,250,300]
}

grd = GridSearchCV(KMeans(2),param)
grd.fit(x)

GridSearchCV(estimator=KMeans(n_clusters=2),
             param_grid={'init': ['k-means++', 'random'],
                         'max_iter': [50, 100, 150, 200, 250, 300],
                         'n_init': [20, 50, 100, 150]})

In [40]:
grd.best_params_

{'init': 'k-means++', 'max_iter': 100, 'n_init': 100}

In [55]:
model = KMeans(2,max_iter=50,n_init=50)
model.fit(x)
pred = model.labels_
accuracy_score(actual['M'],pred)

0.6061221302514447

In [56]:
model = KMeans(2,max_iter=50,n_init=100)
model.fit(x)
pred = model.labels_
accuracy_score(actual['M'],pred)

0.6061221302514447

In [57]:
model = KMeans(2,max_iter=100,n_init=100)
model.fit(x)
pred = model.labels_
accuracy_score(actual['M'],pred)

0.6016609062328121