# Baseline Classifier Evaluation

** uses nltk Max Entropy classifier **

original algorithm here: https://github.com/kitofans/ethnicityguesser

Evaluation method as prepared below made by our team.:)

In [19]:
import pandas as pd
import csv
import pickle
import numpy as np
from NLTKMaxentEthnicityClassifier import NLTKMaxentEthnicityClassifier as mxec


In [30]:
## Import lists

from os import walk

f = []
for (dirpath, dirnames, filenames) in walk("pickled_names"):
    f.extend(filenames)
    break

# clean .txt
ethnicities = []
for each in f:
    ethnicities.append(each.partition('.')[0])

ethnicities

eth_dict = {}

for ethnicity in ethnicities:
    with open('pickled_names/'+ethnicity+'.pkl', 'rb') as filename:
        names = pickle.load(filename)
    eth_dict[ethnicity] = names




In [33]:
ethnicities

['indian',
 'spanish',
 'ukranian',
 'french',
 'african',
 'korean',
 'czech',
 'irish',
 'vietnamese',
 'swedish',
 'greek',
 'muslim',
 'chinese',
 'italian',
 'slavic',
 'danish',
 'japanese',
 'jewish',
 'swiss',
 'arabic',
 'portugese',
 'german',
 'russian']

In [44]:
## make a super list of names and true ethnicities

super_list_names = []
super_list_ethnicities = []

for ethnicity in ethnicities:
    name_list = eth_dict[ethnicity][0]
    eth_list = []
    for name in name_list:
        eth_list.append(ethnicity)
    super_list_names = super_list_names + name_list
    super_list_ethnicities = super_list_ethnicities + eth_list
    
df = pd.DataFrame(
            {'Name': super_list_names,
             'True Ethnicity': super_list_ethnicities
            })
    

In [89]:
df.head()

Unnamed: 0,Name,True Ethnicity
0,Acharya,indian
1,Agarwal,indian
2,Agate,indian
3,Aggarwal,indian
4,Agrawal,indian


In [49]:
## Split into Training and Test
msk = np.random.rand(len(df)) < 0.3
train_df = df[msk]
test_df = df[~msk]

print len(test_df), len(train_df)

14046 6199


In [56]:
## Package DF into token
train_tokens = []
for ethnicity in ethnicities:
    new_tokens = (list(train_df[train_df['True Ethnicity'] == ethnicity]['Name']), ethnicity)
    train_tokens.append(new_tokens)

## tokens must be a list of ([list of names], 'ethnicity') pairs. Ethnicities can be repeated.

In [59]:
## Train Classifier (beware, this takes time)

classifier = mxec(train_tokens)
classifier.train()

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.13549        0.006
             2          -1.35965        0.708
             3          -0.97083        0.862
             4          -0.76069        0.914
             5          -0.63048        0.937
             6          -0.54141        0.949
             7          -0.47632        0.955
             8          -0.42650        0.959
             9          -0.38707        0.962
            10          -0.35502        0.964
            11          -0.32844        0.965
            12          -0.30602        0.966
            13          -0.28683        0.966
            14          -0.27022        0.967
            15          -0.25570        0.967
            16          -0.24289        0.967
            17          -0.23150        0.968
            18          -0.22130        0.968
            19          -0.21212        0.968
 

In [95]:
print classifier.classify('alexander')
print classifier.classify('robert')
print classifier.classify('li')
print classifier.classify('sajkfldsafh')

swedish
french
chinese
jewish


In [76]:
## Predict!!!!

test_names = list(test_df['Name'])
test_eth = list(test_df['True Ethnicity'])

test_preds = []

for name in test_names:
    pred = classifier.classify(name)
    test_preds.append(pred)

df_preds = pd.DataFrame({
    'Name': test_names,
    'True Ethnicity': test_eth,
    'Prediction': test_preds
})

df_preds.head()

Unnamed: 0,Name,Prediction,True Ethnicity
0,Acharya,indian,indian
1,Agarwal,indian,indian
2,Agate,french,indian
3,Aggarwal,indian,indian
4,Agrawal,jewish,indian


In [97]:
# Add True if you got it right
df_preds['Accuracy'] = (df_preds['Prediction']==df_preds['True Ethnicity'])
df_preds.head()

Unnamed: 0,Name,Prediction,True Ethnicity,Accuracy
0,Acharya,indian,indian,True
1,Agarwal,indian,indian,True
2,Agate,french,indian,False
3,Aggarwal,indian,indian,True
4,Agrawal,jewish,indian,False


In [79]:
## Tool to Calculate TPR

def calcAccuracy(df):
    length = len(df)
    length_true = len(df[df['Accuracy']==True])
    return float(length_true)/float(length)

In [86]:
accuracies = []
ethnicity_list = []

# ethnic accuracies
for ethnicity in ethnicities:
    accuracy = calcAccuracy(df_preds[df_preds['True Ethnicity']==ethnicity])
    accuracies.append(accuracy)
    ethnicity_list.append(ethnicity)

# Aggregate accuracy
accuracies.append(calcAccuracy(df_preds))
ethnicity_list.append('OVERALL')

# put into df
df_acc = pd.DataFrame({
    'ethnicity': ethnicity_list,
    'True Positive Rate': accuracies
})

df_acc.set_index('ethnicity', inplace=True)

In [87]:
df_acc

Unnamed: 0_level_0,True Positive Rate
ethnicity,Unnamed: 1_level_1
indian,0.256858
spanish,0.607336
ukranian,0.188119
french,0.793724
african,0.221698
korean,0.04878
czech,0.546201
irish,0.348624
vietnamese,0.065217
swedish,0.593556
