In [19]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def get_df(path='data/AirQualityUCI.csv'):
    """Load the air-quality CSV and return it indexed by timestamp.

    The file is read with ``;`` as field separator. ``Date`` (``dd/mm/YYYY``)
    and ``Time`` (``HH.MM.SS``) are merged into a single ``Timestamp`` column,
    every remaining column is converted to float (decimal commas are turned
    into decimal points first), and the value ``-200`` is replaced by NaN.

    Parameters
    ----------
    path : str or path-like, default 'data/AirQualityUCI.csv'
        Location of the CSV file. The file must contain the columns
        ``Date``, ``Time``, ``Unnamed: 15`` and ``Unnamed: 16``.

    Returns
    -------
    pandas.DataFrame
        Float-valued columns indexed by ``Timestamp``. Rows whose timestamp
        could not be built (missing ``Date``/``Time``) are left out.

    Raises
    ------
    KeyError
        If one of the columns listed above is absent from the file.
    ValueError
        If a date/time string does not match the expected format or a cell
        cannot be parsed as a float.
    """
    df = pd.read_csv(path, sep=';')

    # Build one timestamp per row; rows with a missing Date/Time become NaT.
    df.insert(0, 'Timestamp', pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S'))

    # Drop the unused trailing columns and the now-redundant Date/Time.
    df = df.drop(['Unnamed: 15', 'Unnamed: 16', 'Date', 'Time'], axis=1)

    # Decimal commas -> decimal points, then cast every column to float.
    for col in df.columns.drop('Timestamp'):
        df[col] = df[col].astype(str).str.replace(',', '.', regex=False).astype(float)

    # -200 is replaced by NaN so it is treated as a missing value downstream.
    df = df.replace(-200, np.nan)

    # Remove the rows that have no timestamp.
    df = df.loc[df['Timestamp'].notna(), :]

    # Return the frame with the timestamps as index.
    return df.set_index('Timestamp', drop=True)

def impute(df):
    """Impute missing values, clean the estimates and save the result.

    Steps:

    1. Fit an ``IterativeImputer`` (default estimator, ``random_state=0``)
       on ``df`` and fill every missing cell.
    2. Discard imputed estimates below 0.05 and refill them by time-based
       interpolation. Only cells that were missing in ``df`` are touched;
       observed readings are never altered.
    3. Set originally-missing cells back to NaN, except the ones whose
       imputed value is explicitly wanted: the ``*(GT)`` columns before
       2005-01-01 and ``NMHC(GT)`` over the whole period.
    4. Write the result to ``output_data/imputed_all.csv`` and
       ``output_data/imputed_all.xlsx`` (the directory is created if needed).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with a DatetimeIndex (required by the time-based
        interpolation and the date comparison). Must contain the columns
        ``CO(GT)``, ``C6H6(GT)``, ``NOx(GT)``, ``NO2(GT)`` and ``NMHC(GT)``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df``.
    """
    from pathlib import Path  # local import keeps this block self-contained

    was_missing = df.isna()

    # We add back NaN unless we explicitly want imputed values.
    add_back_nan = was_missing.copy()
    gts = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)']
    add_back_nan.loc[df.index < '2005-01-01', gts] = False
    add_back_nan[['NMHC(GT)']] = False

    # Default (Bayesian ridge) iterative imputer.
    model = IterativeImputer(random_state=0, max_iter=1000)
    imputed = pd.DataFrame(model.fit_transform(df), columns=df.columns, index=df.index)

    # Post-processing: reject estimates below 0.05 and interpolate over them.
    # The mask is limited to cells the imputer filled in, so genuine readings
    # that happen to be below the threshold are kept as measured.
    implausible = was_missing & (imputed < 0.05)
    imputed = imputed.mask(implausible)
    imputed = imputed.interpolate(method='time')

    imputed = imputed.mask(add_back_nan)

    # Save result.
    out_dir = Path('output_data')
    out_dir.mkdir(parents=True, exist_ok=True)
    imputed.to_csv(out_dir / 'imputed_all.csv')
    imputed.to_excel(out_dir / 'imputed_all.xlsx')

    return imputed

# Run the full pipeline (load the raw readings, then impute) and show the result.
res = impute(df=get_df())
res

Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-04-04 10:00:00,3.1,1314.0,261.821642,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
2005-04-04 11:00:00,2.4,1163.0,264.415233,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
2005-04-04 12:00:00,2.4,1142.0,325.549398,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
2005-04-04 13:00:00,2.1,1003.0,315.044899,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [10]:
import matplotlib.pyplot as plt

# Reload the saved imputation result; keep both the flat frame and a
# timestamp-indexed view of it.
imputed_df = pd.read_csv(
    'output_data/imputed_all.csv',
    parse_dates=['Timestamp'],
)
imputed_df_ts = imputed_df.set_index('Timestamp')

# Missing values per column: imputed result first, then the raw data.
print(imputed_df.isnull().sum())
print(get_df().isnull().sum())


Timestamp          0
CO(GT)            99
PT08.S1(CO)      366
NMHC(GT)           0
C6H6(GT)         138
PT08.S2(NMHC)    366
NOx(GT)           75
PT08.S3(NOx)     366
NO2(GT)           75
PT08.S4(NO2)     366
PT08.S5(O3)      366
T                366
RH               366
AH               366
dtype: int64
CO(GT)           1683
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)          366
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
dtype: int64
