In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# data cleaning
df = pd.read_csv('data/AirQualityUCI.csv', sep=';')
df.insert(0, 'Timestamp', pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S'))
df = df.drop(['Unnamed: 15', 'Unnamed: 16', 'Date', 'Time'], axis=1)
for col in df.columns.drop('Timestamp'):
    df[col] = df[col].map(lambda x: float(str(x).replace(',', '.')))
df = df.replace(-200, np.nan)
df = df.loc[~pd.isnull(df['Timestamp']), :]
df_ts = df.set_index('Timestamp', drop=True)
df_ts.to_csv('output_data/cleaned.csv')

In [3]:
# Longest sequence of consecutive NAs
# df = df_ts

features = ['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)','NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)','T', 'RH', 'AH']

def max_na(s):
    isna = s.isna()
    blocks = (~isna).cumsum()
    return isna.groupby(blocks).sum().max()

df.apply(max_na)
# Maximum sequence is around a week, except for NMHC which permanently goes offline

Timestamp           0
CO(GT)            173
PT08.S1(CO)        76
NMHC(GT)         8126
C6H6(GT)           76
PT08.S2(NMHC)      76
NOx(GT)           173
PT08.S3(NOx)       76
NO2(GT)           173
PT08.S4(NO2)       76
PT08.S5(O3)        76
T                  76
RH                 76
AH                 76
dtype: int64

In [4]:
# impute data
# Don't have NMHC (GT) for most timestamps
# can use previous imputed data for new data, or ignore past imputed data

# simple imputer - uses average 
def simple_impute(df, features):
    m = df[features].mean()
    imputed = df.fillna({x:m[x] for x in features})
    return imputed

# naive imputer - takes advantage of seasonality, default uses T-24 (same as previous hour)
# set to 24 for previous day
def naive_impute(df, features, cycle=1):
    imputed = df.copy()
    for i in range(cycle):
        imputed[i::cycle] = imputed[i::cycle].ffill()
    return imputed

# interpolate between closest non-na values
def interpolation_impute(df, features):
    return df.interpolate()
    
# univariate data imputation - linear model on each variable
def univariate_impute(df, features, cycle=7):
    for f in features:
        X = []
        y = []
        for j in range(df[f].shape[0]-cycle - 1):
            X.append(np.array(df[f][j:j+cycle]))
            y.append(np.array(df[f][j+cycle+1]))
            if X[-1].hasnan() or y[-1] == np.nan:
                X.pop()
                y.pop()
        model = LinearRegression().fit(X=X, y=y)
    return df

# multivariate data imputation - multivariate linear model on each variable
def multivariate_impute(df, features):
    pass

In [51]:
# test imputation (can later manually delete some values)
# df_i = simple_impute(df, features)
# df_i = naive_impute(df, features, cycle=1)
# df_i = naive_impute(df, features, cycle=24)
# df_i = interpolation_impute(df, features)
df_i = univariate_impute(df, features)

fig, ax = plt.subplots(len(features), figsize=(20,40))
for i, f in enumerate(features):
    ax[i].plot(df_i[f], color='red')
    ax[i].plot(df[f])
    ax[i].set_xlim([0, df.shape[0]])
    ax[i].set_title(f'{f} imputation')

AttributeError: 'Series' object has no attribute 'hasnan'

In [None]:
# Normalisation

