In [19]:
from __future__ import division
from collections import defaultdict
from math import log

def train(samples):
    classes, freq = defaultdict(lambda:0), defaultdict(lambda:0)
#     print(classes)
#     print(freq)
    for feats, label in samples:
        classes[label] += 1                 # count classes frequencies
        for feat in feats:
            freq[label, feat] += 1          # count features frequencies

    print(freq)
            
    for label, feat in freq:                # normalize features frequencies
        freq[label, feat] /= classes[label]
    for c in classes:                       # normalize classes frequencies
        classes[c] /= len(samples)

    return classes, freq                    # return P(C) and P(O|C)

def classify(classifier, feats):
    classes, prob = classifier
    returned_value = min(classes.keys(),              # calculate argmin(-log(C|O))
        key = lambda cl: -log(classes[cl]) + \
            sum(-log(prob.get((cl,feat), 10**(-7))) for feat in feats))
#     print(classes.keys())# ['M', 'F']
#     print(classes['F'])# aprior variety
#     print(returned_value)# M or F
#     print(prob) # ('M', 'll: e'): 0.097
    return returned_value

In [20]:
features = [(get_features(row[0]), row[1]) for _index,row in sub_samples.iterrows()]
classifier = train(features)

defaultdict(<function train.<locals>.<lambda> at 0x10b1bca60>, {('M', 'll: e'): 23, ('F', 'll: i'): 47, ('M', 'll: l'): 28, ('F', 'll: a'): 227, ('M', 'll: i'): 31, ('M', 'll: a'): 19, ('M', 'll: o'): 18, ('F', 'll: g'): 2, ('F', 'll: e'): 80, ('F', 'll: o'): 9, ('F', 'll: á'): 1, ('F', 'll: h'): 71, ('F', 'll: n'): 33, ('M', 'll: n'): 99, ('M', 'll: m'): 43, ('M', 'll: u'): 7, ('M', 'll: d'): 22, ('M', 'll: h'): 35, ('M', 'll: v'): 10, ('M', 'll: k'): 13, ('M', 'll: t'): 12, ('F', 'll: t'): 4, ('M', 'll: r'): 32, ('M', 'll: p'): 2, ('F', 'll: s'): 5, ('M', 'll: s'): 16, ('F', 'll: r'): 5, ('F', 'll: l'): 44, ('M', 'll: b'): 11, ('M', 'll: y'): 16, ('M', 'll: z'): 7, ('M', 'll: c'): 3, ('M', 'll: f'): 3, ('F', 'll: y'): 11, ('M', 'll: q'): 4, ('M', 'll: w'): 1, ('M', 'll: j'): 1, ('F', 'll: k'): 1, ('F', 'll: u'): 1, ('F', 'll: d'): 2, ('F', 'll: z'): 1})


In [21]:
print('gender: ', classify(classifier, get_features(u'peter')))

gender:  M


In [22]:
import pandas
import numpy as np

In [4]:
samples = pandas.read_csv('names.csv', index_col=0)

In [5]:
samples.groupby('sex')['name'].nunique()

sex
F    61179
M    35178
Name: name, dtype: int64

In [6]:
sub_samples = samples[0:1000]

In [7]:
samples.head(10)

Unnamed: 0,name,sex
0,Abebe,M
1,Abebi,F
2,Abel,M
3,Abena,F
4,Abeni,F
5,Abidemi,F
6,Abidemi,M
7,Abimbola,M
8,Abioye,M
9,Abrafo,M


In [16]:
def get_features(sample): return (
        'll: %s' % sample[-1],          # get last letter
#         'fl: %s' % sample[0],           # get first letter
#         'sl: %s' % sample[1],           # get second letter
        )

In [17]:
# samples = (line.decode('utf-8').split() for line in open('names.txt'))
features = [(get_features(row[0]), row[1]) for _index,row in sub_samples.iterrows()]
classifier = train(features)

print('gender: ', classify(classifier, get_features(u'ivan')))

gender:  M


In [18]:
for feats, label in features:
    print(feats)
    print(label)
    print('_____________')

('ll: e',)
M
_____________
('ll: i',)
F
_____________
('ll: l',)
M
_____________
('ll: a',)
F
_____________
('ll: i',)
F
_____________
('ll: i',)
F
_____________
('ll: i',)
M
_____________
('ll: a',)
M
_____________
('ll: e',)
M
_____________
('ll: o',)
M
_____________
('ll: g',)
F
_____________
('ll: e',)
F
_____________
('ll: a',)
F
_____________
('ll: e',)
F
_____________
('ll: e',)
M
_____________
('ll: a',)
M
_____________
('ll: o',)
M
_____________
('ll: i',)
M
_____________
('ll: o',)
F
_____________
('ll: a',)
F
_____________
('ll: a',)
M
_____________
('ll: a',)
F
_____________
('ll: o',)
F
_____________
('ll: o',)
M
_____________
('ll: a',)
F
_____________
('ll: a',)
F
_____________
('ll: a',)
F
_____________
('ll: i',)
M
_____________
('ll: a',)
F
_____________
('ll: á',)
F
_____________
('ll: o',)
F
_____________
('ll: i',)
F
_____________
('ll: o',)
F
_____________
('ll: a',)
F
_____________
('ll: a',)
M
_____________
('ll: a',)
F
_____________
('ll: i',)
M
_____________
(

('ll: l',)
M
_____________
('ll: a',)
F
_____________
('ll: e',)
F
_____________
('ll: a',)
F
_____________
('ll: e',)
F
_____________
('ll: n',)
M
_____________
('ll: e',)
F
_____________
('ll: l',)
F
_____________
('ll: t',)
F
_____________
('ll: l',)
F
_____________
('ll: e',)
F
_____________
('ll: a',)
F
_____________
('ll: n',)
M
_____________
('ll: a',)
F
_____________
('ll: a',)
F
_____________
('ll: h',)
F
_____________
('ll: a',)
F
_____________
('ll: h',)
F
_____________
('ll: a',)
F
_____________
('ll: m',)
M
_____________
('ll: n',)
M
_____________
('ll: m',)
M
_____________
('ll: y',)
F
_____________
('ll: a',)
F
_____________
('ll: a',)
F
_____________
('ll: a',)
F
_____________
('ll: e',)
F
_____________
('ll: l',)
F
_____________
('ll: t',)
F
_____________
('ll: m',)
M
_____________
('ll: n',)
M
_____________
('ll: r',)
M
_____________
('ll: y',)
F
_____________
('ll: n',)
M
_____________
('ll: u',)
M
_____________
('ll: r',)
M
_____________
('ll: r',)
M
_____________
(

In [10]:
print('gender: ', classify(classifier, get_features(u'sandra')))

gender:  F
