In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render every float in DataFrame output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

# Base dataset (presumably the data before imputation -- TODO confirm).
base = pd.read_csv('additional_data/base.csv')

# Each imputation strategy has its own CSV in the same directory.
imputed_dir = 'additional_data/imputed_sets/'

backfill = pd.read_csv(imputed_dir + 'backfill.csv')
yearly_mean = pd.read_csv(imputed_dir + 'Yearly Mean.csv')
overall_mean = pd.read_csv(imputed_dir + 'Overall Mean.csv')
regional_mean = pd.read_csv(imputed_dir + 'Yearly Mean per Region.csv')
interpolation_all = pd.read_csv(imputed_dir + 'Interpolate all.csv')
interpolation_3 = pd.read_csv(imputed_dir + 'Interpolate 3.csv')
ice1 = pd.read_csv(imputed_dir + 'ICE 1.csv')
ice2 = pd.read_csv(imputed_dir + 'ICE 2.csv')
ice3 = pd.read_csv(imputed_dir + 'ICE 3.csv')
mice1 = pd.read_csv(imputed_dir + 'MICE 1.csv')
mice2 = pd.read_csv(imputed_dir + 'MICE 2.csv')
knn1 = pd.read_csv(imputed_dir + 'KNN 1.csv')
knn2 = pd.read_csv(imputed_dir + 'KNN 2.csv')

# Display label -> imputed frame. Insertion order is the row order of the
# summary table built from this mapping, so keep it as is.
sets = {
    'Backfill': backfill,
    'Overall Mean': overall_mean,
    'Yearly Mean': yearly_mean,
    'Yearly Mean per Region': regional_mean,
    'Interpolate 3': interpolation_3,
    'Interpolate all': interpolation_all,
    'ICE 1': ice1,
    'ICE 2': ice2,
    'ICE 3': ice3,
    'MICE 1': mice1,
    'MICE 2': mice2,
    'KNN 1': knn1,
    'KNN 2': knn2,
}

In [3]:
# Indicators with an upper boundary of 100: the rows of chosen_indicators.csv
# that carry a value in the 'percentage' column. Only the remaining
# 'indicators' column is consulted further down.
bondary_hundred = (
    pd.read_csv('additional_data/chosen_indicators.csv', sep=';')
    .dropna(subset=['percentage'])
    .drop('percentage', axis=1)
)

# Indicators that show at least one negative value in the base data. Despite
# the name, these are the indicators EXEMPT from the "< 0" check below.
base_by_indicator = base.drop('Country Name', axis=1).set_index('Indicator Name')
negatives_only = base_by_indicator[base_by_indicator < 0]
bondary_zero = negatives_only.dropna(how='all').index.unique()

# One row per imputed set: [label, number of boundary violations, number of NaNs].
table = []
for name, df in sets.items():
    indexed = df.set_index(['Country Name', 'Indicator Name'])

    # Row masks on the 'Indicator Name' level of the index.
    capped_rows = indexed.index.isin(bondary_hundred['indicators'], level='Indicator Name')
    non_negative_rows = ~indexed.index.isin(bondary_zero, level='Indicator Name')

    # Cells still missing after imputation.
    missing = indexed.isna().sum().sum()

    # Values above 100 for capped indicators, plus values below 0 for
    # indicators that are never negative in the base data.
    above_hundred = (indexed.loc[capped_rows] > 100).sum().sum()
    below_zero = (indexed.loc[non_negative_rows] < 0).sum().sum()
    implausible = above_hundred + below_zero

    table.append([name, implausible, missing])

In [4]:
# Turn the collected rows into a frame indexed by dataset label and emit it as
# a LaTeX table.
# NOTE: `table` is rebound from a list of rows to a DataFrame here, so this
# cell only works once per run of the cell that builds the list.
table = pd.DataFrame(table, columns=['Datensatz', 'logische Fehler', 'NaN'])
table = table.set_index('Datensatz')

# Raw string: '\>' is not a valid Python escape sequence and raises a
# DeprecationWarning/SyntaxWarning in a normal literal. The caption text that
# reaches LaTeX is unchanged.
print(table.to_latex(label='tab:logic', caption=r'Prozentwerte \>100 für Indikatoren die diese nicht zulassen.'))

\begin{table}
\centering
\caption{Prozentwerte \>100 für Indikatoren die diese nicht zulassen.}
\label{tab:logic}
\begin{tabular}{lrr}
\toprule
{} &  logische Fehler &     NaN \\
Datensatz              &                  &         \\
\midrule
Backfill               &                0 &  135192 \\
Overall Mean           &                0 &       0 \\
Yearly Mean            &                0 &   52298 \\
Yearly Mean per Region &                0 &   58134 \\
Interpolate 3          &                0 &  134029 \\
Interpolate all        &                0 &  112422 \\
ICE 1                  &            60213 &       0 \\
ICE 2                  &            70409 &       0 \\
ICE 3                  &            31677 &       0 \\
MICE 1                 &            64751 &       0 \\
MICE 2                 &            28422 &       0 \\
KNN 1                  &            18084 &       0 \\
KNN 2                  &                0 &       0 \\
\bottomrule
\end{tabular}
\end{table}



  print(table.to_latex(label='tab:logic', caption='Prozentwerte \>100 für Indikatoren die diese nicht zulassen.'))
