# Experiment

Hier werden die Ausreißer in den Ergebnissen der 3%-Durchläufe untersucht. Hierfür werden ausgewählte Imputationsverfahren evaluiert, ohne die Daten vorher zu skalieren.

### Vorbereitung

In [1]:
import pandas as pd
import numpy as np
import time

import experiment
from imputer import *

In [2]:
# Notebook-wide start time; the final runtime cell subtracts it from the end time.
t0 = time.time()
# NOTE(review): this module-level list is not read by any later cell shown here.
results = []
# NOTE(review): `base` is not used by the later cells shown here - presumably the
# original data set as returned by experiment.reset_base(); confirm before removing.
base = experiment.reset_base()

In [3]:
# Display name -> imputation function (presumably all provided by the
# `from imputer import *` above). The evaluation loops iterate over this
# dict, so the insertion order below is the order in which imputers run.
functions = {
    'Backfill': impute_backfill,
    'Overall Mean': impute_overall_means,
    'Yearly Mean': impute_yearly_means,
    'Yearly Mean per Region': impute_yearly_means_per_region,
    'Interpolate 3': interpolate3,
    'Interpolate all': interpolate_all,
    'ICE 1': iterative_imputer1,
    'ICE 2': iterative_imputer2,
    'ICE 3': iterative_imputer3,
    'MICE 1': mice_imputer,
    'MICE 2': mice_imputer2,
    'KNN 1': knn_imputer1,
    'KNN 2': knn_imputer2,
}

In [4]:
def run_all(frac, n=7):
    """Evaluate every imputer in ``functions`` on ``n`` different training sets.

    For each seed 1..n a set of coordinates is drawn with
    ``experiment.get_cords(frac, seed)``. Every imputer is then run once per
    coordinate set on ``experiment.reset_train(cords)``, timed, and scored with
    ``experiment.evaluate``.

    Parameters
    ----------
    frac : float
        Passed through to ``experiment.get_cords``; the notebook uses it as the
        fraction of values to remove (e.g. 0.03 for 3%).
    n : int, optional
        Number of training sets, and therefore runs per imputer. Defaults to
        7, the value that used to be hard-coded.

    Returns
    -------
    results : dict
        Imputer name -> element-wise mean (``np.ndarray``) of that imputer's
        evaluation results over all runs.
    detailed_results : dict
        Imputer name -> list with the evaluation result of every single run.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there would be nothing to average).
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')

    # Draw n different training sets; the seeds 1..n are passed as NumPy
    # integers, exactly as before.
    cords_list = [experiment.get_cords(frac, seed) for seed in np.arange(1, n + 1)]

    # Two dictionaries: one for the per-run results, one for their averages.
    detailed_results = {}
    averaged_results = {}

    # Iterate over the dictionary of all imputers.
    for name, func in functions.items():
        func_runs = []
        print(name)

        # Run each imputer once per training set.
        for count, cords in enumerate(cords_list):
            print(f'run: {count+1}/{n}')

            # Time only the imputation (including the reset of the training
            # data), not the evaluation. The local names deliberately differ
            # from the notebook-level `t0`.
            start = time.time()
            df = func(experiment.reset_train(cords))
            runtime = time.time() - start

            # Evaluate this run.
            func_runs.append(experiment.evaluate(df, runtime, cords))

        # Print the evaluation results of all runs of the current imputer.
        print(func_runs)
        print('')

        # Save detailed and averaged results.
        detailed_results[name] = func_runs
        averaged_results[name] = np.mean(np.array(func_runs), axis=0)

    return averaged_results, detailed_results

### Durchlauf mit 3% fehlenden Werten

In [5]:
# Evaluate all imputers with frac=0.03 (3% missing values); run_all returns the
# averaged results and the per-run results as two dicts keyed by imputer name.
r1, r1_detail = run_all(0.03)

Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Backfill
run: 1/7
r2: 0.8050321942695272, rmse: 0.41014396064947206, t: 1.5170214176177979

run: 2/7
r2: 0.928916836453693, rmse: 0.2580152797743437, t: 1.479947566986084

run: 3/7
r2: 0.8580742751346473, rmse: 0.352383792177162, t: 1.4813783168792725

run: 4/7
r2: 0.8751756323359304, rmse: 0.36329463825630665, t: 1.4805912971496582

run: 5/7
r2: 0.8240985613045108, rmse: 0.42971101916958276, t: 1.494509220123291

run: 6/7
r2: 0.8883275971831258, rmse: 0.3029909242955271, t: 1.5613691806793213

run: 7/7
r2: 0.8739886195692419, rmse: 0.31142606682249746, t: 1.55983567237854

[[0.8050321942695272, 0.41014396064947206, 135530, 1.51



r2: 0.8160170135420015, rmse: 0.39657193297187787, t: 160.04702830314636

run: 2/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 337.37134885018816, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 112.03648420923652, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 35.56681867523324, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 7.872501847222356, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 1.1246152729394487, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.8539672145549266, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 4.99112917739782, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.6093257231167408, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.7087490806163094, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.5274037684098039, scaled tolerance: 0.05665340229323228 




r2: 0.8045518652052257, rmse: 0.4539324258132192, t: 147.90535974502563

run: 3/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 336.2039629142481, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 113.61486447885886, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 29.790643981609012, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 9.453246017719229, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 2.209328430573106, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 1.3980312429919128, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 1.3678051210243214, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.9616685401376509, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.6859790153308467, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.5453599377557168, scaled tolerance: 0.05665340229323228 




r2: 0.7483105609712455, rmse: 0.474593625782185, t: 149.4348156452179

run: 4/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 328.636247941889, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 106.48159386380027, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 33.279528844022394, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 6.689808832434508, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 3.1120235670135283, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 1.607631796566388, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.8843238637592368, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.49918206558623124, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.3716946725952704, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 0.36533940432982837, scaled tolerance: 0.05665340229323228 




r2: 0.8067005552605703, rmse: 0.4472300723777, t: 155.4999303817749

run: 5/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 352.6998770827351, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 106.28405598946199, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 29.743548988842434, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 6.630473898902364, scaled tolerance: 0.05665340229323228 
[IterativeImputer] Change: 4.223607583235861, scaled tolerance: 0.05665340229323228 



KeyboardInterrupt



In [None]:
# Turn the averaged results (dict: imputer name -> array) into a table with one
# row per imputer. Convert only while `r1` is still the raw dict: re-running
# this cell would otherwise transpose the finished table back again.
if isinstance(r1, dict):
    r1 = pd.DataFrame(r1).T
r1

In [None]:
# Flatten the per-run results (dict: imputer name -> list of runs) into a table
# indexed by (imputer, run) with one column per metric. Convert only while
# `r1_detail` is still the raw dict, so the cell can safely be re-run.
if isinstance(r1_detail, dict):
    r1_detail = pd.DataFrame(pd.DataFrame(r1_detail).T.stack().to_dict()).T
    r1_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r1_detail

### Durchlauf mit 5% fehlenden Werten

In [None]:
# Evaluate all imputers with frac=0.05 (5% missing values); run_all returns the
# averaged results and the per-run results as two dicts keyed by imputer name.
r2 , r2_detail =  run_all(0.05)

In [None]:
# Turn the averaged results (dict: imputer name -> array) into a table with one
# row per imputer. Convert only while `r2` is still the raw dict: re-running
# this cell would otherwise transpose the finished table back again.
if isinstance(r2, dict):
    r2 = pd.DataFrame(r2).T
r2

In [None]:
# Flatten the per-run results (dict: imputer name -> list of runs) into a table
# indexed by (imputer, run) with one column per metric. Convert only while
# `r2_detail` is still the raw dict, so the cell can safely be re-run.
if isinstance(r2_detail, dict):
    r2_detail = pd.DataFrame(pd.DataFrame(r2_detail).T.stack().to_dict()).T
    r2_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r2_detail

### Durchlauf mit 10% fehlenden Werten

In [None]:
# Evaluate all imputers with frac=0.1 (10% missing values); run_all returns the
# averaged results and the per-run results as two dicts keyed by imputer name.
r3, r3_detail = run_all(0.1)

In [None]:
# Turn the averaged results (dict: imputer name -> array) into a table with one
# row per imputer. Convert only while `r3` is still the raw dict: re-running
# this cell would otherwise transpose the finished table back again.
if isinstance(r3, dict):
    r3 = pd.DataFrame(r3).T
r3

In [None]:
# Flatten the per-run results (dict: imputer name -> list of runs) into a table
# indexed by (imputer, run) with one column per metric. Convert only while
# `r3_detail` is still the raw dict, so the cell can safely be re-run.
if isinstance(r3_detail, dict):
    r3_detail = pd.DataFrame(pd.DataFrame(r3_detail).T.stack().to_dict()).T
    r3_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r3_detail

### Imputationsverfahren auf den Originaldatensatz anwenden

In [None]:
# Apply every imputer once to the original data set and report its runtime.
for name, func in functions.items():
    print(name)

    # Use dedicated names for the per-imputer timing: reusing `t0` here would
    # overwrite the notebook-wide start time set in the setup cell and make
    # the final "complete runtime" far too small.
    imputer_start = time.time()
    df = func(experiment.reset_base())
    imputer_runtime = time.time() - imputer_start

    print(f'{round(imputer_runtime, 2)}s vergangen')
    print('')
    # Export of the imputed data sets is currently disabled:
    #df.to_csv(f'additional_data/imputed_sets/{name}.csv')

### Ergebnisse abspeichern

In [None]:
# Total runtime in minutes, measured against the module-level `t0`.
# NOTE(review): this is only the whole-notebook runtime if no cell in between
# has reassigned `t0` - verify before relying on the number.
t1 = time.time()
t = t1-t0
print(f'comlete runtime: {t/60}min')

In [None]:
# Collect the averaged and the per-run result tables of all three runs under
# the names intended for their export files.
sets = {
    'r1': r1,
    'r1_detail': r1_detail,
    'r2': r2,
    'r2_detail': r2_detail,
    'r3': r3,
    'r3_detail': r3_detail,
}
# CSV export is currently disabled:
#for name, df in sets.items():
#    df.to_csv(f'additional_data/results/{name}.csv')