Idea for tuning:

We need to specify which metrics are to be computed, which of them is to be minimized, and how large k (the number of train-test splits) is.

* build k splits into train and test data
* for each train-test-split:
    * build a parameter grid using cls.tuning_params. (For each class we have to find out what a good grid would be.)
    * for each combination in the grid: 
        * translate api-response into f,m,u and store in the column gender_infered 
        * compute all provided metrics on the resulting `test_data` DataFrame, restricted to the training rows of the current split
        * store relation grid-point:metrics
    * select grid_point which minimizes specified metric and compute all provided metrics on test set
    * store the metrics on test set
* compute average of metrics on all test sets for all metrics

In [1]:
from evaluators import *

## `genderize_io`

In [2]:
# Set up the evaluator for the genderize.io service on the 'genderizeR' data source.
data_source = 'genderizeR'
# Despite its name, `service_name` is bound to the evaluator class itself, not to a string.
service_name = GenderizeIoEvaluator
evaluator = service_name(data_source)

# Load the data, prepare it for parameter tuning, then filter it.
# NOTE(review): `evaluated=True` presumably loads data that already contains the API responses — confirm.
evaluator.load_data(evaluated=True)
evaluator.preprocess_data_for_parameter_tuning()
# presumably drops rows whose true gender is unknown; verify against the evaluator implementation
evaluator.remove_rows_with_unknown_gender()

In [3]:
# Preview the first rows of the prepared evaluation data.
evaluator.test_data.head()

Unnamed: 0,index,raw_name,first_name,middle_name,last_name,full_name,gender,api_count,gender_infered,api_name,api_probability,api_gender
0,1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m,781.0,m,paolo,0.99,male
1,2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m,381.0,m,ernesto,1.0,male
2,3,"Epstein, John H.",john,,epstein,john epstein,m,9931.0,m,john,0.99,male
3,4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f,1101.0,f,margaret,0.98,female
4,5,"Kresge, Nicole",nicole,,kresge,nicole kresge,f,4042.0,f,nicole,1.0,female


In [4]:
# Summary statistics of the columns named in `tuning_params`, to get a feeling for sensible grid values.
evaluator.test_data[list(evaluator.tuning_params)].describe()

Unnamed: 0,api_count,api_probability
count,397.0,397.0
mean,2456.032746,0.978262
std,3288.578471,0.069072
min,1.0,0.52
25%,145.0,0.99
50%,972.0,1.0
75%,3568.0,1.0
max,12593.0,1.0


In [5]:
# Baseline: show confusion matrix if we do no tuning.
# `_translate_api_response()` is called without thresholds here; per the tuning outline at the top
# of this notebook it translates the API response into f/m/u in the column `gender_infered`.
evaluator._translate_api_response()
evaluator.compute_confusion_matrix(evaluator.test_data)

Unnamed: 0,f_pred,m_pred,u_pred
f,83,2,0
m,8,304,0
u,0,0,0


In [6]:
# only for testing: a small 3x3 grid to inspect what `build_parameter_grid` returns.
# The result is not assigned to a name, so it is only displayed.
evaluator.build_parameter_grid([100, 500, 1000], [0.8, 0.85, 0.9])

[OrderedDict([('api_count', 100), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 100), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 100), ('api_probability', 0.9)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 500), ('api_probability', 0.9)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.8)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.85)]),
 OrderedDict([('api_count', 1000), ('api_probability', 0.9)])]

In [7]:
# Build a sample grid for tuning genderize.io.
# The first list holds the `api_count` thresholds, the second the `api_probability` thresholds.
# Each threshold is listed exactly once: a repeated value (0.98 used to appear twice) only adds
# redundant grid points that are evaluated again without changing any result.
# NOTE(review): the repeated 0.98 may have been meant to be 0.99 — confirm before adding it.
grid = evaluator.build_parameter_grid(
    [1, 10, 100, 200, 300],
    [0.5, 0.7, 0.8, 0.9, 0.95, 0.97, 0.98, 1]
)

In [8]:
# TODO: turn errors into a nice dataframe for the paper
# Evaluate `compute_error_without_unknown` for every grid point, using all rows of the data
# (the full index of `test_data`). The result is displayed as the cell output.
index = evaluator.test_data.index
errors = evaluator.compute_error_for_param_grid(
    grid,
    evaluator.compute_error_without_unknown,
    index
)
errors

{(1, 0.5): 0.025188916876574308,
 (1, 0.7): 0.017994858611825194,
 (1, 0.8): 0.013192612137203167,
 (1, 0.9): 0.01078167115902965,
 (1, 0.95): 0.010899182561307902,
 (1, 0.97): 0.0111731843575419,
 (1, 0.98): 0.011235955056179775,
 (1, 1): 0.015384615384615385,
 (10, 0.5): 0.018617021276595744,
 (10, 0.7): 0.010840108401084011,
 (10, 0.8): 0.0055710306406685237,
 (10, 0.9): 0.002840909090909091,
 (10, 0.95): 0.0028735632183908046,
 (10, 0.97): 0.0029498525073746312,
 (10, 0.98): 0.002967359050445104,
 (10, 1): 0.0041493775933609959,
 (100, 0.5): 0.0065573770491803279,
 (100, 0.7): 0.0033333333333333335,
 (100, 0.8): 0.0,
 (100, 0.9): 0.0,
 (100, 0.95): 0.0,
 (100, 0.97): 0.0,
 (100, 0.98): 0.0,
 (100, 1): 0.0,
 (200, 0.5): 0.006993006993006993,
 (200, 0.7): 0.0035587188612099642,
 (200, 0.8): 0.0,
 (200, 0.9): 0.0,
 (200, 0.95): 0.0,
 (200, 0.97): 0.0,
 (200, 0.98): 0.0,
 (200, 1): 0.0,
 (300, 0.5): 0.0076045627376425855,
 (300, 0.7): 0.0038610038610038611,
 (300, 0.8): 0.0,
 (300, 0.9

### Try out a weighted version of `compute_error_with_unknown`

In [9]:
# Cross-validated tuning with 5 splits, using `compute_weighted_error` as the error function.
evaluator.compute_cv_score(
    n_splits=5,
    param_grid=grid,
    error_func=evaluator.compute_weighted_error
)

minimal train error: 0.0162774239207 corresponding test error: 0.0121786197564
params for lowest train error: {'api_probability': 0.9, 'api_count': 10}
minimal train error: 0.016106442577 corresponding test error: 0.0126939351199
params for lowest train error: {'api_probability': 0.9, 'api_count': 10}
minimal train error: 0.0104529616725 corresponding test error: 0.0369393139842
params for lowest train error: {'api_probability': 0.8, 'api_count': 10}
minimal train error: 0.017199017199 corresponding test error: 0.00837988826816
params for lowest train error: {'api_probability': 0.9, 'api_count': 10}
minimal train error: 0.015939015939 corresponding test error: 0.0132547864507
params for lowest train error: {'api_probability': 0.9, 'api_count': 10}


0.016689308715860558

In [10]:
# show confusion matrix if we tune parameters according to error function 'compute_weighted_error'
# The thresholds below are the ones selected in 4 of the 5 folds of the preceding cross-validation run.
evaluator._translate_api_response(api_count=10, api_probability=0.9)
evaluator.compute_confusion_matrix(evaluator.test_data)

Unnamed: 0,f_pred,m_pred,u_pred
f,77,0,8
m,1,274,37
u,0,0,0


### Try setting a constraint

In [11]:
# Tune the parameters such that `error_func` is minimised on the training sets,
# subject to `constraint_func` staying below `constraint_val`.
# NOTE(review): the constraint is described as being checked on the test set — confirm this
# against the implementation of `compute_cv_score`.
evaluator.compute_cv_score(
    n_splits=5,
    param_grid=grid,
    error_func=evaluator.compute_error_without_unknown,
    constraint_func=evaluator.compute_error_with_unknown,
    constraint_val=0.1
)

minimal train error: 0.00701754385965 corresponding test error: 0.0
params for lowest train error: {'api_probability': 0.8, 'api_count': 10}
minimal train error: 0.0101351351351 corresponding test error: 0.0133333333333
params for lowest train error: {'api_probability': 0.9, 'api_count': 1}
minimal train error: 0.00664451827243 corresponding test error: 0.0384615384615
params for lowest train error: {'api_probability': 0.8, 'api_count': 1}
minimal train error: 0.00355871886121 corresponding test error: 0.0
params for lowest train error: {'api_probability': 0.9, 'api_count': 10}
minimal train error: 0.0100334448161 corresponding test error: 0.0142857142857
params for lowest train error: {'api_probability': 0.7, 'api_count': 10}


0.013216117216117217

## `gender_api`

In [13]:
# Set up the evaluator for the gender-api service on the 'all' data source.
# This rebinds `data_source`, `service_name` and `evaluator`, replacing the genderize.io objects.
data_source = 'all'
# Despite its name, `service_name` is bound to the evaluator class itself, not to a string.
service_name = GenderAPIEvaluator
evaluator = service_name(data_source)

# Load the data, prepare it for parameter tuning, then filter it.
# NOTE(review): `evaluated=True` presumably loads data that already contains the API responses — confirm.
evaluator.load_data(evaluated=True)
evaluator.preprocess_data_for_parameter_tuning()
# presumably drops rows whose true gender is unknown; verify against the evaluator implementation
evaluator.remove_rows_with_unknown_gender()

In [14]:
# Show which columns act as tuning parameters for this service.
evaluator.tuning_params

('api_accuracy', 'api_samples')

In [15]:
# Summary statistics of the columns named in `tuning_params`, to get a feeling for sensible grid values.
evaluator.test_data[list(evaluator.tuning_params)].describe()

Unnamed: 0,api_accuracy,api_samples
count,5790.0,5790.0
mean,91.38532,40243.978066
std,18.721861,66650.027682
min,0.0,0.0
25%,95.0,987.0
50%,98.0,11153.0
75%,99.0,51412.0
max,100.0,433182.0


In [16]:
# Parameter grid for gender-api; this rebinds `grid`, replacing the genderize.io grid.
# Presumably the first list holds `api_accuracy` thresholds and the second `api_samples`
# thresholds (the order of `tuning_params`) — TODO confirm against `build_parameter_grid`.
grid = evaluator.build_parameter_grid([50, 60, 70, 80, 90, 95], [10000, 20000, 30000, 40000, 50000, 60000])

In [17]:
# Cross-validated tuning with 5 splits, using `compute_error_without_unknown` as the error function.
evaluator.compute_cv_score(
    n_splits=5,
    param_grid=grid,
    error_func=evaluator.compute_error_without_unknown
)

minimal train error: 0.00460405156538 corresponding test error: 0.00684931506849
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}
minimal train error: 0.00522951772225 corresponding test error: 0.00660792951542
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 0.00542005420054 corresponding test error: 0.00369003690037
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}
minimal train error: 0.00441014332966 corresponding test error: 0.010152284264
params for lowest train error: {'api_samples': 60000, 'api_accuracy': 95}
minimal train error: 0.00546946216955 corresponding test error: 0.00355871886121
params for lowest train error: {'api_samples': 50000, 'api_accuracy': 95}


0.0061716569218900012

In [18]:
# Cross-validated tuning with 5 splits, using `compute_error_with_unknown` as the error function.
# NOTE(review): the recorded output for this cell contains error values above 1 — check whether
# that is expected for this metric with the grid used here.
evaluator.compute_cv_score(
    n_splits=5,
    param_grid=grid,
    error_func=evaluator.compute_error_with_unknown
)

minimal train error: 1.00476190476 corresponding test error: 0.944630872483
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 1.00954446855 corresponding test error: 0.926788685524
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.96855078623 corresponding test error: 1.09403254973
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.981180496151 corresponding test error: 1.03873239437
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}
minimal train error: 0.998705780846 corresponding test error: 0.96768707483
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 50}


0.99437431538644583

In [19]:
# Cross-validated tuning with 5 splits, using `compute_inverse_f1_score` as the error function.
evaluator.compute_cv_score(
    n_splits=5,
    param_grid=grid,
    error_func=evaluator.compute_inverse_f1_score
)

minimal train error: 1.00889328063 corresponding test error: 1.01119402985
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.00872093023 corresponding test error: 1.01209677419
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.01046337818 corresponding test error: 1.00943396226
params for lowest train error: {'api_samples': 10000, 'api_accuracy': 95}
minimal train error: 1.00863723608 corresponding test error: 1.01260504202
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}
minimal train error: 1.0099009901 corresponding test error: 1.00740740741
params for lowest train error: {'api_samples': 20000, 'api_accuracy': 95}


1.010547443146532