Idea for tuning:

We need to specify which metrics are to be computed, which one of them is to be minimized, and the number of splits k.

* build k splits into train and test data
* for each train-test-split:
    * build a parameter grid using cls.tuning_params. (For each class we have to find out what a good grid would be.)
    * for each combination in the grid: 
        * translate api-response into f,m,u and store in the column gender_infered (GOT THIS FAR!!!)
        * compute all provided metrics on the resulting test_data DataFrame, restricted to the rows of the training split
        * store the mapping grid point -> metrics
    * select the grid point which minimizes the specified metric on the training split and compute all provided metrics for it on the test split
    * store these test-split metrics
* compute, for every metric, the average over all test splits

In [1]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from evaluators import *
from itertools import product

In [2]:
# Pick the evaluator to tune and the data source it is created with.
# NOTE(review): 'all' presumably selects every available data set -- confirm in evaluators.
data_source = 'all'
# Despite its name, service_name is bound to the evaluator itself (it is called below), not to a string.
service_name = GenderizeIoEvaluator
evaluator = service_name(data_source)

In [3]:
# NOTE(review): evaluated=True presumably loads data that already contains the
# API responses (the api_* columns previewed in the next cell) -- confirm.
evaluator.load_data(evaluated=True)

In [4]:
# Preview the first rows of the loaded data.
evaluator.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,teppei,,ariyoshi,teppei ariyoshi,u,zbmath,,,teppei,,u


In [5]:
# Rows whose label in column 'gender' is 'u' have to be ignored,
# so they are removed before tuning.
evaluator.remove_rows_with_unknown_gender()

In [6]:
# Names of the parameters this evaluator is tuned over; the parameter grid is built from them.
evaluator.tuning_params

('api_count', 'api_probability')

In [7]:
# Preview the grid: one list of candidate values per tuning parameter, given in the
# order of evaluator.tuning_params; the result contains every combination of them.
evaluator.build_parameter_grid([1,10, 50, 100], [0.5, 0.7, 0.8, 0.9])

[{'api_count': 1, 'api_probability': 0.5},
 {'api_count': 1, 'api_probability': 0.7},
 {'api_count': 1, 'api_probability': 0.8},
 {'api_count': 1, 'api_probability': 0.9},
 {'api_count': 10, 'api_probability': 0.5},
 {'api_count': 10, 'api_probability': 0.7},
 {'api_count': 10, 'api_probability': 0.8},
 {'api_count': 10, 'api_probability': 0.9},
 {'api_count': 50, 'api_probability': 0.5},
 {'api_count': 50, 'api_probability': 0.7},
 {'api_count': 50, 'api_probability': 0.8},
 {'api_count': 50, 'api_probability': 0.9},
 {'api_count': 100, 'api_probability': 0.5},
 {'api_count': 100, 'api_probability': 0.7},
 {'api_count': 100, 'api_probability': 0.8},
 {'api_count': 100, 'api_probability': 0.9}]

In [8]:
# Keep the grid (same candidate values as in the preview cell) for the loop over grid points.
grid = evaluator.build_parameter_grid([1,10, 50, 100], [0.5, 0.7, 0.8, 0.9])

In [9]:
# TODO: integrate into code for this specific evaluator
# Missing API values are coded as empty strings; map them to None first so
# that every tuning column can be cast to float.
for col in evaluator.tuning_params:
    evaluator.test_data[col] = (
        evaluator.test_data[col].replace({'': None}).astype(float)
    )

In [10]:
# Apply every grid point in turn and preview the outcome.
# Judging from the previews printed by this cell, _translate_api_response rewrites
# the 'gender_infered' column of evaluator.test_data in place: rows whose api_count
# lies below the given threshold turn into 'u'.
# NOTE(review): this calls a private method from outside the class -- consider a public wrapper.
for item in grid:
    print(item)
    evaluator._translate_api_response(**item)
    display(evaluator.test_data.head())

{'api_count': 1, 'api_probability': 0.5}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 1, 'api_probability': 0.7}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 1, 'api_probability': 0.8}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 1, 'api_probability': 0.9}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 10, 'api_probability': 0.5}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 10, 'api_probability': 0.7}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 10, 'api_probability': 0.8}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 10, 'api_probability': 0.9}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 50, 'api_probability': 0.5}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 50, 'api_probability': 0.7}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 50, 'api_probability': 0.8}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 50, 'api_probability': 0.9}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 100, 'api_probability': 0.5}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,u
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 100, 'api_probability': 0.7}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,u
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 100, 'api_probability': 0.8}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,u
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


{'api_count': 100, 'api_probability': 0.9}


Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,u
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,u
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


In [67]:
# Work on the first 50 rows only: X holds the name and the API response
# columns, y the corresponding labels from column 'gender'.
X = evaluator.test_data.head(50)[['full_name', 'api_count', 'api_probability', 'api_gender']]
y = evaluator.test_data['gender'].head(50)

In [75]:
# Two 3-fold splitters, both shuffling with a fixed seed so the splits are reproducible.
# The stratified one is additionally given the labels y when splitting (see skf.split(X, y)).
kf = KFold(n_splits=3, random_state=1, shuffle=True)
skf = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)

In [76]:
# KFold.split yields *positional* row indices, so rows are selected with .iloc.
# Label-based selection (.loc / y[...]) only gives the same rows while the index
# is a clean 0..n-1 range and would pick wrong rows (or raise) otherwise.
# Note: X_train/X_test/y_train/y_test are overwritten on every iteration, so
# after the loop they hold the last split only.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TRAIN: [ 0  1  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 20 22 23 24 25 26 28 30
 33 34 37 41 43 44 45 47] TEST: [ 2  3 19 21 27 29 31 32 35 36 38 39 40 42 46 48 49]
TRAIN: [ 0  1  2  3  5  6  7  8  9 11 12 15 16 19 20 21 25 27 29 31 32 35 36 37 38
 39 40 42 43 46 47 48 49] TEST: [ 4 10 13 14 17 18 22 23 24 26 28 30 33 34 41 44 45]
TRAIN: [ 2  3  4 10 13 14 17 18 19 21 22 23 24 26 27 28 29 30 31 32 33 34 35 36 38
 39 40 41 42 44 45 46 48 49] TEST: [ 0  1  5  6  7  8  9 11 12 15 16 20 25 37 43 47]


In [77]:
# X_train was reassigned in every loop iteration, so this previews the
# training part of the last split only.
X_train.head()

Unnamed: 0,full_name,api_count,api_probability,api_gender
2,adriano moura,166.0,0.99,male
3,ralf kieser,86.0,1.0,male
4,guillermo leon-de-la-barra,850.0,1.0,male
10,gregory gelles,721.0,1.0,male
13,martin slawski,3568.0,1.0,male


In [78]:
# StratifiedKFold.split needs the labels y in addition to X and, like KFold.split,
# yields *positional* row indices, so rows are selected with .iloc.
# Label-based selection (.loc / y[...]) only gives the same rows while the index
# is a clean 0..n-1 range and would pick wrong rows (or raise) otherwise.
# Note: X_train/X_test/y_train/y_test are overwritten on every iteration, so
# after the loop they hold the last split only.
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TRAIN: [ 0  1  4  5  7  8  9 10 11 12 13 14 15 17 18 19 21 23 27 28 29 30 31 32 33
 37 41 42 43 44 45 47 48] TEST: [ 2  3  6 16 20 22 24 25 26 34 35 36 38 39 40 46 49]
TRAIN: [ 0  1  2  3  6  7  9 10 11 13 14 15 16 18 19 20 22 24 25 26 31 32 34 35 36
 38 39 40 42 46 47 48 49] TEST: [ 4  5  8 12 17 21 23 27 28 29 30 33 37 41 43 44 45]
TRAIN: [ 2  3  4  5  6  8 12 16 17 20 21 22 23 24 25 26 27 28 29 30 33 34 35 36 37
 38 39 40 41 43 44 45 46 49] TEST: [ 0  1  7  9 10 11 13 14 15 18 19 31 32 42 47 48]
