Idea for tuning:

need to specify which metrics need to be computed, which one needs to be minimized, and how large k is

* build k splits into train and test data
* for each train-test-split:
    * build a parameter grid using cls.tuning_params. (For each class we have to find out what a good grid would be.)
    * for each combination in the grid: 
        * translate api-response into f,m,u and store in the column gender_infered 
        * compute all provided metrics on the resulting test_data DF restricted to training data
        * store relation grid-point:metrics
    * select grid_point which minimizes specified metric and compute all provided metrics on test set
    * store the metrics on test set
* compute average of metrics on all test sets for all metrics

In [1]:
from evaluators import *

## `genderize_io`

In [2]:
data_source = 'all'
service_name = GenderizeIoEvaluator
evaluator = service_name(data_source)

evaluator.load_data(evaluated=True)
evaluator.preprocess_data_for_parameter_tuning()
evaluator.remove_rows_with_unknown_gender()

In [3]:
evaluator.test_data.head()

Unnamed: 0,index,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered
0,0,pierre,paul,grivel,pierre paul grivel,m,zbmath,5.0,male,pierre-paul,1.0,m
1,1,raul,,serapioni,raul serapioni,m,zbmath,821.0,male,raul,1.0,m
2,2,adriano,,moura,adriano moura,m,zbmath,166.0,male,adriano,0.99,m
3,3,ralf,,kieser,ralf kieser,m,zbmath,86.0,male,ralf,1.0,m
4,5,guillermo,,leon-de-la-barra,guillermo leon-de-la-barra,m,zbmath,850.0,male,guillermo,1.0,m


In [4]:
evaluator.test_data[list(evaluator.tuning_params)].describe()

Unnamed: 0,api_count,api_probability
count,5227.0,5227.0
mean,1695.015496,0.957744
std,2638.971903,0.105776
min,1.0,0.5
25%,46.0,0.99
50%,503.0,1.0
75%,2201.0,1.0
max,12593.0,1.0


In [5]:
evaluator.build_parameter_grid([100, 500, 1000], [0.8, 0.85, 0.9])

[OrderedDict([('api_count', 100), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 100), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 100), ('api_probability', 0.9)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.9)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.9)])]

In [6]:
grid = evaluator.build_parameter_grid([1, 10, 100, 200, 500, 1000], 
                                      [0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95])

In [7]:
train_index = evaluator.test_data.index
test_index = evaluator.test_data.index
errors = evaluator.compute_train_test_error_for_param_grid(grid, evaluator.compute_error_without_unknown, 
                                                           train_index, test_index)
errors # TODO: turn errors into a nice dataframe while ignoring the second entry in value(train error = test error)

{(1, 0.55): (0.050438169425511201, 0.050438169425511201),
 (1, 0.6): (0.045275590551181105, 0.045275590551181105),
 (1, 0.65): (0.041106877882494487, 0.041106877882494487),
 (1, 0.7): (0.037021969080553295, 0.037021969080553295),
 (1, 0.75): (0.034688013136288998, 0.034688013136288998),
 (1, 0.8): (0.031067556296914094, 0.031067556296914094),
 (1, 0.85): (0.027931769722814498, 0.027931769722814498),
 (1, 0.9): (0.024731888815933464, 0.024731888815933464),
 (1, 0.95): (0.020702070207020702, 0.020702070207020702),
 (10, 0.55): (0.044370122630992194, 0.044370122630992194),
 (10, 0.6): (0.038574328896909545, 0.038574328896909545),
 (10, 0.65): (0.033847570803591985, 0.033847570803591985),
 (10, 0.7): (0.030303030303030304, 0.030303030303030304),
 (10, 0.75): (0.027758174547165372, 0.027758174547165372),
 (10, 0.8): (0.023661567877629065, 0.023661567877629065),
 (10, 0.85): (0.021001221001221003, 0.021001221001221003),
 (10, 0.9): (0.017306245297215951, 0.017306245297215951),
 (10, 0.95): (

In [8]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_without_unknown)

minimal train error: 0.00398142003981 corresponding test error: 0.00261780104712
params for lowest train error: {'api_probability': 0.95, 'api_count': 1000}
minimal train error: 0.00405405405405 corresponding test error: 0.00244498777506
params for lowest train error: {'api_probability': 0.95, 'api_count': 1000}
minimal train error: 0.00392670157068 corresponding test error: 0.00277008310249
params for lowest train error: {'api_probability': 0.95, 'api_count': 1000}
minimal train error: 0.00324254215305 corresponding test error: 0.00576368876081
params for lowest train error: {'api_probability': 0.95, 'api_count': 1000}
minimal train error: 0.00333555703803 corresponding test error: 0.00512820512821
params for lowest train error: {'api_probability': 0.95, 'api_count': 1000}


0.0037449531627373319

In [9]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_with_unknown)

minimal train error: 0.0652278177458 corresponding test error: 0.0747398297067
params for lowest train error: {'api_probability': 0.55, 'api_count': 1}
minimal train error: 0.0694444444444 corresponding test error: 0.058039961941
params for lowest train error: {'api_probability': 0.55, 'api_count': 1}
minimal train error: 0.0667938931298 corresponding test error: 0.0685990338164
params for lowest train error: {'api_probability': 0.55, 'api_count': 1}
minimal train error: 0.0671927307508 corresponding test error: 0.066985645933
params for lowest train error: {'api_probability': 0.55, 'api_count': 1}
minimal train error: 0.0670964660936 corresponding test error: 0.0673724735322
params for lowest train error: {'api_probability': 0.55, 'api_count': 1}


0.067147388985881545

In [10]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_unknown)

minimal train error: 0.0158273381295 corresponding test error: 0.0245979186377
params for lowest train error: {'api_count': 1, 'api_probability': 0.55}
minimal train error: 0.0193965517241 corresponding test error: 0.0104662226451
params for lowest train error: {'api_count': 1, 'api_probability': 0.55}
minimal train error: 0.0174141221374 corresponding test error: 0.0183574879227
params for lowest train error: {'api_count': 1, 'api_probability': 0.55}
minimal train error: 0.0172166427547 corresponding test error: 0.0191387559809
params for lowest train error: {'api_count': 1, 'api_probability': 0.55}
minimal train error: 0.018147086915 corresponding test error: 0.0153994225217
params for lowest train error: {'api_count': 1, 'api_probability': 0.55}


0.017591961541595126

In [11]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_gender_bias)

minimal train error: 0.00398142003981 corresponding test error: 0.00261780104712
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 0.00405405405405 corresponding test error: 0.00244498777506
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 0.00392670157068 corresponding test error: 0.00277008310249
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 0.00324254215305 corresponding test error: 0.00576368876081
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 0.00333555703803 corresponding test error: 0.00512820512821
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}


0.0037449531627373319

In [12]:
evaluator.compute_cv_score(5, grid, evaluator.compute_inverse_f1_score)

minimal train error: 1.00573613767 corresponding test error: 1.00354609929
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 1.00564971751 corresponding test error: 1.0037593985
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 1.00561797753 corresponding test error: 1.00384615385
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 1.00457875458 corresponding test error: 1.00847457627
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}
minimal train error: 1.00478927203 corresponding test error: 1.00704225352
params for lowest train error: {'api_count': 1000, 'api_probability': 0.95}


1.0053336962850976

## `gender_api`

In [13]:
data_source = 'all'
service_name = GenderAPIEvaluator
evaluator = service_name(data_source)

evaluator.load_data(evaluated=True)
evaluator.preprocess_data_for_parameter_tuning()
evaluator.remove_rows_with_unknown_gender()

In [14]:
evaluator.tuning_params

('api_accuracy', 'api_samples')

In [15]:
evaluator.test_data[list(evaluator.tuning_params)].describe()

Unnamed: 0,api_accuracy,api_samples
count,5790.0,5790.0
mean,91.38532,40243.978066
std,18.721861,66650.027682
min,0.0,0.0
25%,95.0,987.0
50%,98.0,11153.0
75%,99.0,51412.0
max,100.0,433182.0


In [16]:
grid = evaluator.build_parameter_grid([50, 60, 70, 80, 90, 95], [10000, 20000, 30000, 40000, 50000, 60000])

In [17]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_without_unknown)

minimal train error: 0.00460405156538 corresponding test error: 0.00684931506849
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}
minimal train error: 0.00522951772225 corresponding test error: 0.00660792951542
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 0.00542005420054 corresponding test error: 0.00369003690037
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}
minimal train error: 0.00441014332966 corresponding test error: 0.010152284264
params for lowest train error: {'api_samples': 60000, 'api_accuracy': 95}
minimal train error: 0.00546946216955 corresponding test error: 0.00355871886121
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}


0.0061716569218900012

In [18]:
evaluator.compute_cv_score(5, grid, evaluator.compute_error_with_unknown)

minimal train error: 1.00476190476 corresponding test error: 0.944630872483
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 1.00954446855 corresponding test error: 0.926788685524
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.96855078623 corresponding test error: 1.09403254973
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.981180496151 corresponding test error: 1.03873239437
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.998705780846 corresponding test error: 0.96768707483
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}


0.99437431538644583

In [19]:
evaluator.compute_cv_score(5, grid, evaluator.compute_inverse_f1_score)

minimal train error: 1.00889328063 corresponding test error: 1.01119402985
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.00872093023 corresponding test error: 1.01209677419
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.01046337818 corresponding test error: 1.00943396226
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 95}
minimal train error: 1.00863723608 corresponding test error: 1.01260504202
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.0099009901 corresponding test error: 1.00740740741
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}


1.010547443146532