# Experiment

Hier werden die Ausreißer in den Ergebnissen der 3-%-Durchläufe untersucht. Hierfür werden ausgewählte Imputationsverfahren evaluiert, ohne die Daten vorher zu skalieren.

### Vorbereitung

In [1]:
import pandas as pd
import numpy as np
import time

import experiment
from imputer import *

In [2]:
# Start time of the notebook run; a later cell reads this global to report
# the elapsed time.
t0 = time.time()
# NOTE(review): not referenced again in the visible cells (run_all uses its
# own local `results`) - confirm whether this is still needed.
results = []
# presumably loads the unmodified base data set - verify against the
# experiment module.
base = experiment.reset_base()

In [3]:
# Imputers under test, keyed by the name that run_all prints and uses as the
# key of its result dictionaries. Insertion order is the evaluation order.
functions = {
    'Backfill': impute_backfill,
    'Overall Mean': impute_overall_means,
    'Yearly Mean': impute_yearly_means,
    'Yearly Mean per Region': impute_yearly_means_per_region,
    'Interpolate 3': interpolate3,
    'Interpolate all': interpolate_all,
    'ICE 3': iterative_imputer3,
    'KNN 2': knn_imputer2,
}

In [4]:
def run_all(frac, n=7):
    """Evaluate every imputer in ``functions`` on ``n`` different training sets.

    Args:
        frac: Forwarded as the first argument of ``experiment.get_cords``.
            The notebook calls this with 0.03, 0.05 and 0.1 for the runs
            with 3 %, 5 % and 10 % missing values.
        n: Number of training sets, and therefore runs per imputer.
            Defaults to 7. The sets are requested via
            ``experiment.get_cords(frac, i)`` for ``i = 1 .. n``.

    Returns:
        A ``(results, detailed_results)`` tuple of dicts keyed by imputer
        name. ``detailed_results[name]`` is the list of per-run values
        returned by ``experiment.evaluate``; ``results[name]`` is their
        element-wise mean over all runs.

    Raises:
        ValueError: If ``n`` is smaller than 1 (there would be nothing to
            average).
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')

    # Get all n training sets up front so that every imputer is evaluated on
    # the same sets. np.arange is kept so that get_cords receives the same
    # NumPy integer values as before.
    cords_list = [experiment.get_cords(frac, i) for i in np.arange(1, n + 1)]

    # Two dictionaries: per-run results and results averaged over all runs.
    detailed_results = {}
    results = {}

    # Iterate over the dictionary of all imputers.
    for name, func in functions.items():
        print(name)
        func_runs = []

        # Run each imputer once per training set.
        for count, cords in enumerate(cords_list, start=1):
            print(f'run: {count}/{n}')

            # The measured time covers experiment.reset_train plus the
            # imputer call, but not the evaluation.
            start = time.time()
            df = func(experiment.reset_train(cords))
            elapsed = time.time() - start

            # Evaluate this run.
            func_runs.append(experiment.evaluate(df, elapsed, cords))

        # Print the evaluation results of all runs of the current imputer.
        print(func_runs)
        print('')

        # Save detailed and averaged results.
        detailed_results[name] = func_runs
        results[name] = np.mean(np.array(func_runs), axis=0)

    return results, detailed_results

### Durchlauf mit 3% fehlenden Werten

In [5]:
# Run all imputers with frac=0.03 (the 3 % missing-values experiment).
# r1 receives the averaged results, r1_detail the per-run results.
r1, r1_detail = run_all(0.03)

Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Backfill
run: 1/7
r2: 0.932583879271738, rmse: 0.025304839916863744, t: 0.612112283706665

run: 2/7
r2: 0.98343571167781, rmse: 0.0721265294982051, t: 0.6277449131011963

run: 3/7
r2: 0.9894228814375354, rmse: 0.03734057954145193, t: 0.6280338764190674

run: 4/7
r2: 0.9924393093755304, rmse: 0.045826793422044876, t: 0.6283199787139893

run: 5/7
r2: 0.9960627112832513, rmse: 0.08578166120884186, t: 0.6127698421478271

run: 6/7
r2: 0.9990794142822997, rmse: 0.03858664120213273, t: 0.6158549785614014

run: 7/7
r2: 0.9977643305855354, rmse: 0.09632470364064019, t: 0.6120262145996094

[[0.932583879271738, 0.025304839916863744, 135530



r2: -1.2105984946004513, rmse: 0.14168025745445673, t: 78.80939769744873

run: 2/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 99.2159716151302, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 42.034276567349295, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 28.062243449769994, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 154.71924042526047, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 345.5167766679781, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 257.05674190662353, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 240.92782215088067, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 191.7187887694185, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 187.0769083342523, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 187.383368610997, scaled tolerance: 0.11370298835951696 




r2: 0.8936553175244143, rmse: 0.19117784439953964, t: 73.44765639305115

run: 3/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 99.59402231966706, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 13.669418268066647, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 36.10506645140776, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 16.141173403242906, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 23.466948976031453, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 27.382955196798232, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 29.148213219472144, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 26.85105391327696, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 28.081218615013885, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 29.319108454832804, scaled tolerance: 0.11370298835951696 




r2: 0.8459807702145355, rmse: 0.13907799997610903, t: 78.92291498184204

run: 4/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 98.84242662649088, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 152.52152220273146, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 34.78098784941541, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 28.87992727367949, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 48.91560846475136, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 46.71642876089783, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 92.07880957249628, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 80.17741811487804, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 69.44114163376354, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 54.69869173548857, scaled tolerance: 0.11370298835951696 




r2: 0.8349694951820509, rmse: 0.20957397552549772, t: 76.86753034591675

run: 5/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 102.01042838654358, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 42.794701206207094, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 51.71927393351463, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 115.79749845530786, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 61.81263937562668, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 75.13721697610097, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 34.037062723552125, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 66.56076000133453, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 219.92908703654427, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 42.741572618123136, scaled tolerance: 0.11370298835951696 




r2: 0.9750461966036715, rmse: 0.2110007646787648, t: 77.04527306556702

run: 6/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 100.29992120037876, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 23.766062911069252, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 35.43642482199109, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 37.871971597783464, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 28.61889094939427, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 21.31459912038936, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 23.152855745494918, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 24.908490405319043, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 37.988742933208584, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 49.869332214409845, scaled tolerance: 0.11370298835951696 




r2: 0.9860540053451526, rmse: 0.1470234347617971, t: 82.41221976280212

run: 7/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 105.06411838560527, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 32.482667046772285, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 34.308568139380796, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 76.89283846514238, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 77.40993865181999, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 79.59059286043379, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 53.58648329351062, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 55.52653975524776, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 98.91122711964226, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 96.2747341420783, scaled tolerance: 0.11370298835951696 




r2: 0.9408369760538747, rmse: 0.48329912012373183, t: 84.35188579559326

[[-1.2105984946004513, 0.14168025745445673, 0, 78.80939769744873], [0.8936553175244143, 0.19117784439953964, 0, 73.44765639305115], [0.8459807702145355, 0.13907799997610903, 0, 78.92291498184204], [0.8349694951820509, 0.20957397552549772, 0, 76.86753034591675], [0.9750461966036715, 0.2110007646787648, 0, 77.04527306556702], [0.9860540053451526, 0.1470234347617971, 0, 82.41221976280212], [0.9408369760538747, 0.48329912012373183, 0, 84.35188579559326]]

KNN 2
run: 1/7
r2: 0.506906365923218, rmse: 0.06691432483156766, t: 23.60706663131714

run: 2/7
r2: 0.3418010865107488, rmse: 0.47561796628174274, t: 22.87093162536621

run: 3/7
r2: 0.11716626739518698, rmse: 0.33297407605425705, t: 23.89342761039734

run: 4/7
r2: 0.5585519264288634, rmse: 0.34276388035825284, t: 22.8709135055542

run: 5/7
r2: 0.157528553251446, rmse: 1.2260078472373193, t: 23.29871940612793

run: 6/7
r2: 0.09027288613343898, rmse: 1.187455618472937,

In [6]:
# Build a table from the averaged results and flip it so that each imputer
# becomes one row.
r1 = pd.DataFrame(r1).transpose()
# Bare expression: rendered as the cell output.
r1

Unnamed: 0,0,1,2,3
Backfill,0.984398,0.057327,135535.571429,0.619552
Overall Mean,-0.32835,0.878404,0.0,0.964519
Yearly Mean,-0.357697,0.883607,52298.0,0.714972
Yearly Mean per Region,-1.195184,0.903695,57235.857143,0.920106
Interpolate 3,0.996616,0.028444,134354.571429,2.579178
Interpolate all,0.996619,0.028371,112679.285714,1.806637
ICE 3,0.609421,0.217548,0.0,78.836697
KNN 2,0.335481,0.70362,0.0,23.434694


In [7]:
# Reshape the per-run results into one row per (imputer, run index):
# DataFrame(...).T.stack() yields a Series keyed by (imputer, run index),
# to_dict() turns it into {(imputer, run index): run result}, and building a
# DataFrame from that dict and transposing gives one row per key.
# assumes each run result is a sequence of four values - confirm against
# experiment.evaluate.
r1_detail = pd.DataFrame(pd.DataFrame(r1_detail).T.stack().to_dict()).T
# Label the four values of each run result.
r1_detail.columns = ['r2', 'rmse', 'still_missing', 't']
# Bare expression: rendered as the cell output.
r1_detail

Unnamed: 0,Unnamed: 1,r2,rmse,still_missing,t
Backfill,0,0.932584,0.025305,135530.0,0.612112
Backfill,1,0.983436,0.072127,135514.0,0.627745
Backfill,2,0.989423,0.037341,135555.0,0.628034
Backfill,3,0.992439,0.045827,135512.0,0.62832
Backfill,4,0.996063,0.085782,135536.0,0.61277
Backfill,5,0.999079,0.038587,135532.0,0.615855
Backfill,6,0.997764,0.096325,135570.0,0.612026
Overall Mean,0,-2.317507,0.173564,0.0,0.941701
Overall Mean,1,-0.010545,0.589329,0.0,1.038188
Overall Mean,2,-0.041418,0.361646,0.0,0.987757


### Durchlauf mit 5% fehlenden Werten

In [8]:
# Run all imputers with frac=0.05 (the 5 % missing-values experiment).
# r2 receives the averaged results, r2_detail the per-run results.
r2 , r2_detail =  run_all(0.05)

Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Backfill
run: 1/7
r2: 0.9973082101055876, rmse: 0.08703598848801375, t: 0.8794841766357422

run: 2/7
r2: 0.9858079818551079, rmse: 0.07306097819250446, t: 0.873241662979126

run: 3/7
r2: 0.9757670531819184, rmse: 0.09497599123099765, t: 0.8794314861297607

run: 4/7
r2: 0.9864813648105862, rmse: 0.06158480457363178, t: 0.8953442573547363

run: 5/7
r2: 0.9957604692455183, rmse: 0.10362087941748466, t: 0.8793857097625732

run: 6/7
r2: 0.9909789664362828, rmse: 0.06180154142551044, t: 0.8639280796051025

run: 7/7
r2: 0.9866297434430658, rmse: 0.0680957160943159, t: 0.8883531093597412

[[0.9973082101055876, 0.08703598848801375, 13573



r2: 0.9510462360770973, rmse: 0.363510803548255, t: 73.42559790611267

run: 2/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 102.68818423868221, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 56.05925293385072, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 17.929977875461603, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 49.533973821214516, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 64.82538573504073, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 49.04546167583843, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 42.57107600144301, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 17.24457255558328, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 35.00458482144369, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 37.375309125087426, scaled tolerance: 0.11370298835951696 




r2: 0.879563133898175, rmse: 0.20811288246748097, t: 76.37029099464417

run: 3/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 100.94204807142965, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 39.32914993697566, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 33.311199778702445, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 11.530692927896471, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 81.78871605048036, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 83.62299944134496, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 74.73370180759467, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 75.9053156708117, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 78.72979665949234, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 88.92476751286512, scaled tolerance: 0.11370298835951696 




r2: 0.4379278353092415, rmse: 0.447237636522821, t: 78.14746475219727

run: 4/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 98.9689768052358, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 48.6020010296616, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 30.011937700150703, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 54.48159026208744, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 50.85096831934446, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 68.83774739988155, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 30.021691267536294, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 41.671691690672, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 39.66134242856224, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 29.21524154503604, scaled tolerance: 0.11370298835951696 




r2: 0.5663118888048915, rmse: 0.6932473618487891, t: 83.54127836227417

run: 5/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 125.54761934140437, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 43.38932902318491, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 55.04175162675474, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 33.54868136027092, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 91.24836538664832, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 35.68795346842018, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 100.03736741310917, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 69.70064147644094, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 145.65280023114292, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 287.87960881074383, scaled tolerance: 0.11370298835951696 




r2: 0.951950370106241, rmse: 0.34131320026810563, t: 87.15883708000183

run: 6/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 103.71610835656986, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 95.69269283295537, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 61.34995215165992, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 97.78100113159967, scaled tolerance: 0.11370298835951696 
[IterativeImputer] Change: 80.86322237829472, scaled tolerance: 0.11370298835951696 


KeyboardInterrupt: 

In [None]:
# Build a table from the averaged results and flip it so that each imputer
# becomes one row.
r2 = pd.DataFrame(r2).transpose()
# Bare expression: rendered as the cell output.
r2

In [None]:
# Reshape the per-run results into one row per (imputer, run index):
# DataFrame(...).T.stack() yields a Series keyed by (imputer, run index),
# to_dict() turns it into {(imputer, run index): run result}, and building a
# DataFrame from that dict and transposing gives one row per key.
# assumes each run result is a sequence of four values - confirm against
# experiment.evaluate.
r2_detail = pd.DataFrame(pd.DataFrame(r2_detail).T.stack().to_dict()).T
# Label the four values of each run result.
r2_detail.columns = ['r2', 'rmse', 'still_missing', 't']
# Bare expression: rendered as the cell output.
r2_detail

### Durchlauf mit 10% fehlenden Werten

In [None]:
# Run all imputers with frac=0.1 (the 10 % missing-values experiment).
# r3 receives the averaged results, r3_detail the per-run results.
r3, r3_detail = run_all(0.1)

In [None]:
# Build a table from the averaged results and flip it so that each imputer
# becomes one row.
r3 = pd.DataFrame(r3).transpose()
# Bare expression: rendered as the cell output.
r3

In [None]:
# Reshape the per-run results into one row per (imputer, run index):
# DataFrame(...).T.stack() yields a Series keyed by (imputer, run index),
# to_dict() turns it into {(imputer, run index): run result}, and building a
# DataFrame from that dict and transposing gives one row per key.
# assumes each run result is a sequence of four values - confirm against
# experiment.evaluate.
r3_detail = pd.DataFrame(pd.DataFrame(r3_detail).T.stack().to_dict()).T
# Label the four values of each run result.
r3_detail.columns = ['r2', 'rmse', 'still_missing', 't']
# Bare expression: rendered as the cell output.
r3_detail

### Imputationsverfahren auf Originaldatensatz anwenden

In [None]:
# Apply every imputer to the data returned by experiment.reset_base() and
# report how long each one takes.
for name, func in functions.items():
    print(name)

    # Use loop-specific timer names. This cell runs at notebook top level, so
    # assigning to `t0` here would overwrite the global start time that the
    # final runtime cell subtracts from, making the reported total runtime
    # cover only the last imputer.
    run_start = time.time()
    df = func(experiment.reset_base())
    elapsed = time.time() - run_start

    print(f'{round(elapsed, 2)}s vergangen')
    print('')
    #df.to_csv(f'additional_data/imputed_sets/{name}.csv')

### Ergebnisse abspeichern

In [None]:
# Print the time elapsed since the global `t0`, converted to minutes.
# NOTE(review): `t0` must still hold the notebook start time here; make sure
# no cell in between reassigns it.
t1 = time.time()
t = t1-t0
print(f'comlete runtime: {t/60}min')

In [None]:
# Collect all result tables, keyed by the name used in the (currently
# disabled) CSV export below.
sets = {
    'r1': r1,
    'r1_detail': r1_detail,
    'r2': r2,
    'r2_detail': r2_detail,
    'r3': r3,
    'r3_detail': r3_detail,
}
#for name, df in sets.items():
#    df.to_csv(f'additional_data/results/{name}.csv')