# Experiment

Hier werden die Ausreißer in den Ergebnissen der 3-%-Durchläufe untersucht. Hierfür werden ausgewählte Imputationsverfahren evaluiert, ohne die Daten vorher zu skalieren.

### Vorbereitung

In [1]:
import pandas as pd
import numpy as np
import time
import math
from sklearn.metrics import r2_score, mean_squared_error

import experiment
from imputer import *

In [2]:
# Start timestamp for the total-runtime report at the end of the notebook.
t0 = time.time()
# NOTE(review): this module-level list is not read by any visible cell
# (run_all builds its own local `results` dict) — confirm before removing.
results = []
# Reference data that evaluate() reads the true values from.
# NOTE(review): presumably the complete, un-masked data set — confirm in experiment.reset_base.
base = experiment.reset_base()

In [3]:
# Display name -> imputer callable. The mapping is iterated in insertion order,
# and the display name is also used as the CSV file name when the imputed sets are exported.
functions = {
    'Backfill': impute_backfill,
    'Overall Mean': impute_overall_means,
    'Yearly Mean': impute_yearly_means,
    'Yearly Mean per Region': impute_yearly_means_per_region,
    'Interpolate 3': interpolate3,
    'Interpolate all': interpolate_all,
    'ICE 3': iterative_imputer3,
    'KNN 2': knn_imputer2,
}

In [4]:
def evaluate(df, t, cords):
    """Score one imputed frame against the unscaled reference data in `base`.

    Unlike a scaled evaluation, both the reference values and the imputed
    values are compared on their original scale (no scaler is applied).

    Args:
        df: imputed data frame returned by one of the imputers.
        t: runtime of the imputation as measured by the caller; it is only
            printed and passed through into the result row.
        cords: table whose columns 0 and 1 hold the positional row and column
            index of every simulated missing value.

    Returns:
        list: [r2, rmse, still_missing, t], where still_missing is the number
        of NaN cells left anywhere in `df`.
    """
    # The training frame is not needed while scaling is switched off.
    # NOTE(review): the call is kept in case reset_train has side effects — confirm and drop if not.
    train = experiment.reset_train(cords)

    # No scaling: the reference is the raw base data.
    reference = pd.DataFrame(base)

    # Collect the true and the imputed value for every simulated NaN position.
    y_true = []
    y_pred = []
    for idx in cords.index:
        row_pos = cords[0][idx]
        col_pos = cords[1][idx]
        y_true.append(reference.iloc[row_pos, col_pos])
        y_pred.append(df.iloc[row_pos, col_pos])

    # Positions that are NaN on either side are excluded from the metrics.
    pairs = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}).dropna()

    # Evaluation metrics on the remaining value pairs.
    r2 = r2_score(pairs['y_true'], pairs['y_pred'])
    rmse = math.sqrt(mean_squared_error(pairs['y_true'], pairs['y_pred']))
    still_missing = df.isna().sum().sum()

    print(f'r2: {r2}, rmse: {rmse}, t: {t}')
    print('')

    return [r2, rmse, still_missing, t]

In [5]:
def run_all(frac, n=7):
    """Run every imputer in `functions` on `n` simulated training sets and score each run.

    Each run is scored twice: with the notebook-level evaluate() (unscaled)
    and with experiment.evaluate(). Both result lists are printed, but only
    the notebook-level results are returned.

    Args:
        frac: fraction of values to remove, forwarded to experiment.get_cords.
        n: number of training sets, i.e. repetitions per imputer. Defaults to
            7, the value that used to be hard-coded.

    Returns:
        tuple: (results, detailed_results). `results` maps the imputer name to
        the column-wise mean of its runs; `detailed_results` maps it to the
        list of per-run [r2, rmse, still_missing, t] rows.

    Raises:
        ValueError: if `n` is smaller than 1 (there would be nothing to average).
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')

    # Build all n training sets up front so every imputer sees the same ones;
    # the run number 1..n is forwarded as second argument to get_cords.
    cords_list = [experiment.get_cords(frac, i) for i in np.arange(1, n + 1)]

    # Two dictionaries: per-run rows and their per-imputer averages.
    detailed_results = {}
    results = {}

    # Iterate over the dictionary of all imputers.
    for name, func in functions.items():
        func_runs1 = []
        func_runs2 = []
        print(name)

        # Run each imputer once per training set.
        for count, cords in enumerate(cords_list):
            print(f'run: {count+1}/{n}')

            # Time only the imputation itself; perf_counter is monotonic and
            # therefore not affected by system clock adjustments.
            started = time.perf_counter()
            df = func(experiment.reset_train(cords))
            t = time.perf_counter() - started

            # Score the run with both evaluation variants.
            func_runs1.append(evaluate(df, t, cords))
            func_runs2.append(experiment.evaluate(df, t, cords))

        # Print the evaluation results of all runs of the current imputer.
        print(func_runs1)
        print(func_runs2)
        print ('')

        # Save detailed and averaged results.
        detailed_results[name] = func_runs1
        results[name] = np.mean(np.array(func_runs1), axis=0)

    return results, detailed_results


### Durchlauf mit 3% fehlenden Werten

In [6]:
# Run all imputers with 3 % of the values removed; r1 receives the averaged
# metrics per imputer, r1_detail the per-run metrics.
r1, r1_detail = run_all(0.03)

Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Testdaten mit 3.0% fehlenden Werten (absolut: 5109)
Backfill
run: 1/7
r2: 0.9964693249333287, rmse: 917098994403.9623, t: 1.2889063358306885

r2: 0.9937039221729441, rmse: 0.007707057811487927, t: 1.2889063358306885

run: 2/7
r2: 0.9703683283858512, rmse: 2705970651798.2363, t: 1.3806488513946533

r2: 0.9657391810107668, rmse: 0.10555221815029571, t: 1.3806488513946533

run: 3/7
r2: 0.9910573481070527, rmse: 1097877416483.2795, t: 1.3330531120300293

r2: 0.9766604006765891, rmse: 0.056011491920483096, t: 1.3330531120300293

run: 4/7
r2: 0.9362538660447858, rmse: 3030237225918.873, t: 1.2987034320831299

r2: 0.9501189660444422, rmse: 0.11902912409462993, t: 1.2987034320831299

run

r2: 0.9962071945114009, rmse: 950924399862.5491, t: 5.675708770751953

r2: 0.9980322787070551, rmse: 0.004310235097013991, t: 5.675708770751953

run: 2/7
r2: 0.99685230662051, rmse: 2170519301446.0415, t: 5.656502723693848

r2: 0.9913443640137568, rmse: 0.05659265506131489, t: 5.656502723693848

run: 3/7
r2: 0.9973580612193337, rmse: 595951085889.2628, t: 5.636073589324951

r2: 0.9977330864730987, rmse: 0.017427521217334804, t: 5.636073589324951

run: 4/7
r2: 0.9975248185741633, rmse: 593088824191.3138, t: 5.677340269088745

r2: 0.9979546905762926, rmse: 0.024072109591946847, t: 5.677340269088745

run: 5/7
r2: 0.9969589624720169, rmse: 3804345632054.9336, t: 5.586675643920898

r2: 0.9976587604618082, rmse: 0.07185399940817276, t: 5.586675643920898

run: 6/7
r2: 0.9972224415490841, rmse: 2415044892333.846, t: 5.706248760223389

r2: 0.998049959236905, rmse: 0.06702587074222292, t: 5.706248760223389

run: 7/7
r2: 0.9987317433573477, rmse: 2644840050087.999, t: 5.644956111907959

r2: 0.999



r2: 0.8725233366448126, rmse: 5388007105755.665, t: 159.67472290992737

r2: -5.998267275627868, rmse: 0.2512357292909726, t: 159.67472290992737

run: 2/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 3.079902961030002e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 3495408454031530.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2492194958059387.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 1007719463930486.4, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 259370036738950.03, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 145338349464639.88, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 327358429387759.8, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 227329885720082.38, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 493921693739572.25, scaled tolerance: 35084726045503.402 
[IterativeImputer] 



r2: 0.8095260372140096, rmse: 16525549861609.215, t: 155.5043842792511

r2: 0.8958482340059831, rmse: 0.19213783973460316, t: 155.5043842792511

run: 3/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 2.995436267547203e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 3002718212273284.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2480717202646975.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 828950433642463.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 197438585947221.44, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 322790396087287.25, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 198521659447639.3, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 100854905698594.95, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 97201093859368.84, scaled tolerance: 35084726045503.402 
[IterativeImputer] Ch



r2: 0.46908039849841243, rmse: 8259496938609.8545, t: 156.20110750198364

r2: 0.1304313516689375, rmse: 0.3336997745386481, t: 156.20110750198364

run: 4/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 2.9597165999209428e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 4079700968262220.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 1846452257742635.2, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 289402118487395.4, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 266988646736109.78, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 264457902088456.03, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 157496260765986.06, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 100019188834462.77, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 92192566711564.83, scaled tolerance: 35084726045503.402 
[IterativeImputer



r2: 0.6126511461655864, rmse: 7313807685135.505, t: 158.31662940979004

r2: 0.8111259137349857, rmse: 0.22671914294901196, t: 158.31662940979004

run: 5/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 2.870131054950721e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 3535250339179746.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2452757282309790.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 1508962099066109.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 714784001669136.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 411918028890769.75, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 145368030256373.56, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 160612183898712.28, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 186302846667700.28, scaled tolerance: 35084726045503.402 
[IterativeImputer]



r2: 0.8872726015724632, rmse: 22642412208002.492, t: 160.36626887321472

r2: 0.03056093099925017, rmse: 1.429313888232683, t: 160.36626887321472

run: 6/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 3.015861775028332e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 3119060963291272.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2489576383944474.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 1041114328516673.9, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 355675414479485.06, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 200583451632813.78, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 186049573530688.6, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 170721685114572.66, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 118143467238371.47, scaled tolerance: 35084726045503.402 
[IterativeImputer]



r2: 0.981929991222771, rmse: 6017265089913.039, t: 155.00531888008118

r2: 0.9814085175579573, rmse: 0.20216230348593398, t: 155.00531888008118

run: 7/7
[IterativeImputer] Completing matrix with shape (4898, 165)
[IterativeImputer] Change: 2.9310983034141212e+16, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2841763211357182.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 2441514058437475.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 1368915610425535.0, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 587834238678858.5, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 155013982105028.75, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 332340015922168.44, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 229652665616279.7, scaled tolerance: 35084726045503.402 
[IterativeImputer] Change: 329470149317924.7, scaled tolerance: 35084726045503.402 
[IterativeImputer] C



r2: 0.9697253864568411, rmse: 12616531398986.742, t: 153.47898411750793

r2: 0.8794799723938455, rmse: 1.012372541914244, t: 153.47898411750793

[[0.8725233366448126, 5388007105755.665, 0, 159.67472290992737], [0.8095260372140096, 16525549861609.215, 0, 155.5043842792511], [0.46908039849841243, 8259496938609.8545, 0, 156.20110750198364], [0.6126511461655864, 7313807685135.505, 0, 158.31662940979004], [0.8872726015724632, 22642412208002.492, 0, 160.36626887321472], [0.981929991222771, 6017265089913.039, 0, 155.00531888008118], [0.9697253864568411, 12616531398986.742, 0, 153.47898411750793]]
[[-5.998267275627868, 0.2512357292909726, 0, 159.67472290992737], [0.8958482340059831, 0.19213783973460316, 0, 155.5043842792511], [0.1304313516689375, 0.3336997745386481, 0, 156.20110750198364], [0.8111259137349857, 0.22671914294901196, 0, 158.31662940979004], [0.03056093099925017, 1.429313888232683, 0, 160.36626887321472], [0.9814085175579573, 0.20216230348593398, 0, 155.00531888008118], [0.8794799

In [7]:
# Turn the {imputer: averaged metrics} dict into a table with one row per imputer.
# NOTE(review): r1 is rebound to the result, so running this cell a second time
# transposes the table again — re-run the run_all cell first.
r1 = pd.DataFrame(r1).T
r1

Unnamed: 0,0,1,2,3
Backfill,0.975567,4746041000000.0,135535.571429,1.313983
Overall Mean,-0.248897,38418600000000.0,0.0,1.918405
Yearly Mean,-0.580845,40605640000000.0,52298.0,1.517411
Yearly Mean per Region,-1.638806,46539110000000.0,57235.857143,1.836441
Interpolate 3,0.997265,1882102000000.0,134354.571429,5.654787
Interpolate all,0.997271,1877102000000.0,112679.285714,3.82904
ICE 3,0.800387,11251870000000.0,0.0,156.935345
KNN 2,-0.50191,37053230000000.0,0.0,44.281009


In [8]:
# Reshape {imputer: [run_0, run_1, ...]} into one row per (imputer, run):
# stack() gives a Series keyed by (imputer, run) whose values are the per-run
# metric lists; rebuilding a frame from its dict spreads each list over four columns.
# NOTE(review): r1_detail is rebound, so this cell is only valid once per run_all call.
r1_detail = pd.DataFrame(pd.DataFrame(r1_detail).T.stack().to_dict()).T
r1_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r1_detail

Unnamed: 0,Unnamed: 1,r2,rmse,still_missing,t
Backfill,0,0.996469,917099000000.0,135530.0,1.288906
Backfill,1,0.970368,2705971000000.0,135514.0,1.380649
Backfill,2,0.991057,1097877000000.0,135555.0,1.333053
Backfill,3,0.936254,3030237000000.0,135512.0,1.298703
Backfill,4,0.990041,6887532000000.0,135536.0,1.303488
Backfill,5,0.961493,8972660000000.0,135532.0,1.311738
Backfill,6,0.983287,9610911000000.0,135570.0,1.281341
Overall Mean,0,-0.339512,17465710000000.0,0.0,1.923356
Overall Mean,1,-0.061003,39002860000000.0,0.0,1.906039
Overall Mean,2,-0.740704,14955520000000.0,0.0,1.892828


### Durchlauf mit 5% fehlenden Werten

In [9]:
# Run all imputers with 5 % of the values removed.
# NOTE(review): the stored output of this cell ends in a KeyboardInterrupt, so
# r2 / r2_detail are only defined after the cell has run to completion.
r2 , r2_detail =  run_all(0.05)

Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Testdaten mit 5.0% fehlenden Werten (absolut: 8515)
Backfill
run: 1/7
r2: 0.9503530314485333, rmse: 17021310884426.996, t: 1.455603837966919

r2: 0.9955395119500975, rmse: 0.1861136671230279, t: 1.455603837966919

run: 2/7
r2: 0.9809521867801201, rmse: 5928783971332.182, t: 1.9820747375488281

r2: 0.9527099134718778, rmse: 0.1345692633962322, t: 1.9820747375488281

run: 3/7
r2: 0.9903892741110194, rmse: 4751202435314.032, t: 1.9970667362213135

r2: 0.9763782188141665, rmse: 0.09655187044651366, t: 1.9970667362213135

run: 4/7
r2: 0.9931598463200009, rmse: 3357011513328.415, t: 1.9510610103607178

r2: 0.9821806209292083, rmse: 0.07169457597353222, t: 1.9510610103607178

run: 5/7
r

KeyboardInterrupt: 

In [None]:
# Turn the {imputer: averaged metrics} dict into a table with one row per imputer.
# NOTE(review): r2 is rebound to the result, so running this cell a second time
# transposes the table again — re-run the run_all cell first.
r2 = pd.DataFrame(r2).T
r2

In [None]:
# Reshape {imputer: [run_0, run_1, ...]} into one row per (imputer, run):
# stack() gives a Series keyed by (imputer, run) whose values are the per-run
# metric lists; rebuilding a frame from its dict spreads each list over four columns.
# NOTE(review): r2_detail is rebound, so this cell is only valid once per run_all call.
r2_detail = pd.DataFrame(pd.DataFrame(r2_detail).T.stack().to_dict()).T
r2_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r2_detail

### Durchlauf mit 10% fehlenden Werten

In [None]:
# Run all imputers with 10 % of the values removed; r3 receives the averaged
# metrics per imputer, r3_detail the per-run metrics.
r3, r3_detail = run_all(0.1)

In [None]:
# Turn the {imputer: averaged metrics} dict into a table with one row per imputer.
# NOTE(review): r3 is rebound to the result, so running this cell a second time
# transposes the table again — re-run the run_all cell first.
r3 = pd.DataFrame(r3).T
r3

In [None]:
# Reshape {imputer: [run_0, run_1, ...]} into one row per (imputer, run):
# stack() gives a Series keyed by (imputer, run) whose values are the per-run
# metric lists; rebuilding a frame from its dict spreads each list over four columns.
# NOTE(review): r3_detail is rebound, so this cell is only valid once per run_all call.
r3_detail = pd.DataFrame(pd.DataFrame(r3_detail).T.stack().to_dict()).T
r3_detail.columns = ['r2', 'rmse', 'still_missing', 't']
r3_detail

### Imputationsverfahren auf Originaldatensatz anwenden

In [None]:
# Apply every imputer to the original data set and export each result as CSV.
for name, func in functions.items():
    print(name)

    # Loop-local timer names on purpose: the notebook-level `t0` taken in the
    # setup cell is still needed for the total-runtime report and must not be
    # overwritten here.
    impute_start = time.perf_counter()
    df = func(experiment.reset_base())
    impute_seconds = time.perf_counter() - impute_start

    print(f'{round(impute_seconds, 2)}s vergangen')
    print('')
    df.to_csv(f'additional_data/imputed_sets/{name}.csv')

### Ergebnisse abspeichern

In [None]:
# Elapsed time relative to the notebook-level `t0`, reported in minutes.
t1 = time.time()
t = t1-t0
print(f'comlete runtime: {t/60}min')

In [None]:
# File stem -> result table; every table is written to additional_data/results/.
# All three run_all sections must have completed, otherwise the names are undefined.
sets = {
    'r1': r1,
    'r1_detail': r1_detail,
    'r2': r2,
    'r2_detail': r2_detail,
    'r3': r3,
    'r3_detail': r3_detail,
}
for name, df in sets.items():
    df.to_csv(f'additional_data/results/{name}.csv')