In [20]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def get_df(path='data/AirQualityUCI.csv'):
    """Load the Air Quality UCI CSV as a timestamp-indexed frame of floats.

    Parameters
    ----------
    path : str or path-like, default 'data/AirQualityUCI.csv'
        Semicolon-separated file to read. It must contain 'Date' and 'Time'
        columns plus the two empty trailing columns that pandas names
        'Unnamed: 15' and 'Unnamed: 16'.

    Returns
    -------
    pandas.DataFrame
        Every remaining input column converted to float, indexed by
        'Timestamp' (built from 'Date' + ' ' + 'Time'). Each -200 value is
        replaced by NaN, and rows for which no timestamp could be built
        (missing date or time) are dropped.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the file.
    ValueError
        If a date/time string does not match '%d/%m/%Y %H.%M.%S' or a cell
        cannot be converted to float.
    """
    raw = pd.read_csv(path, sep=';')

    # Rows with a missing date or time produce NaT here; they are removed below.
    timestamps = pd.to_datetime(raw['Date'] + ' ' + raw['Time'], format='%d/%m/%Y %H.%M.%S')

    # Drop the two empty trailing columns and the now-redundant date/time strings.
    readings = raw.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Date', 'Time'])

    # Values may use a decimal comma ("2,6"): normalise to a dot, then convert.
    # Going through str keeps this working for columns pandas already parsed
    # as numbers (and for NaN, which round-trips to NaN).
    for col in readings.columns:
        as_text = readings[col].astype(str).str.replace(',', '.', regex=False)
        readings[col] = as_text.astype(float)

    # -200 is treated as "missing": turn it into NaN.
    readings = readings.replace(-200, np.nan)

    # Remove the rows (not columns) that have no valid timestamp, then expose
    # the timestamps as the index.
    readings.insert(0, 'Timestamp', timestamps)
    readings = readings[readings['Timestamp'].notna()]
    return readings.set_index('Timestamp', drop=True)

# Load the cleaned readings and remember the original column order, so the
# imputed frame can be trimmed back to it even if extra features are added.
df = get_df()
columns = df.columns

# True where a reading is missing, restricted to rows from 2005-01-01 onwards:
# every earlier row is cleared, so its imputed values survive the masking below.
nmask = df.isna()
before_2005 = df.index < '2005-01-01'
nmask.loc[before_2005, :] = False

# Optional lagged copies of every column as extra imputation features
# (currently disabled):
# lags = pd.concat([df.shift(i).add_suffix(f'_lag{i}') for i in [1,-1, 24, -24, 12, -12]], axis=1)
# df = pd.concat([df, lags], axis=1)

# IterativeImputer with its default estimator (Bayesian ridge).
model = IterativeImputer(max_iter=1000, random_state=0)
imputed_values = model.fit_transform(df)
imputed = pd.DataFrame(imputed_values, index=df.index, columns=df.columns)[columns]

# Put NaN back into the ground-truth columns wherever nmask is set
# ('NMHC(GT)' is currently excluded from this list).
gts = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
imputed[gts] = imputed[gts].mask(nmask[gts])

imputed.to_csv('output_data/imputed_all.csv')

In [31]:
import matplotlib.pyplot as plt
# Read the imputed readings back from disk, parsing 'Timestamp' as datetimes.
imputed_df = pd.read_csv('output_data/imputed_all.csv', parse_dates=['Timestamp'])
# Same data, indexed by timestamp (the column is moved into the index).
imputed_df_ts = imputed_df.set_index('Timestamp')

# Number of missing values left in each column, 'Timestamp' included.
imputed_df.isnull().sum()


Timestamp          0
CO(GT)            99
PT08.S1(CO)        0
NMHC(GT)           0
C6H6(GT)         138
PT08.S2(NMHC)      0
NOx(GT)           75
PT08.S3(NOx)       0
NO2(GT)           75
PT08.S4(NO2)       0
PT08.S5(O3)        0
T                  0
RH                 0
AH                 0
dtype: int64