# Import Packages and Dataset

In [1]:
# import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import math
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# fixed random seed; not referenced in the cells visible here
# NOTE(review): presumably meant for a later train/test split or model step - confirm or remove
rseed = 42

In [4]:
# import data: read the raw training table (path is relative to the working directory);
# the sensor columns are parsed from comma-separated strings in the next cell
df=pd.read_csv("./data/Train.csv")

In [5]:
# define a function to replace spaces in the string (i.e. missings) with NaN
def replace_nan(x):
    """Convert one token of a comma-separated sensor string into a float.

    Empty or whitespace-only tokens mark a missing reading and become
    ``np.nan``; every other token is parsed with ``float`` (which also maps
    the literal text ``"nan"`` to NaN).

    Args:
        x (str): a single token, e.g. ``"23.5"`` or ``" "``.

    Returns:
        float: the parsed value, or NaN for a missing reading.

    Raises:
        ValueError: if the token is neither blank nor a valid number.
    """
    # Strip before testing: "", "  " or " \t" must be treated like the
    # single-space placeholder instead of crashing inside float().
    if isinstance(x, str) and not x.strip():
        return np.nan
    return float(x)

# columns that hold the comma-separated sensor readings
features = [
    "temp",
    "precip",
    "rel_humidity",
    "wind_dir",
    "wind_spd",
    "atmos_press",
]

# turn every raw string into a list of floats: blank out the "nan" tokens,
# split on commas and let replace_nan convert each token
for feature in features:
    df[feature] = df[feature].apply(
        lambda raw: [replace_nan(token) for token in raw.replace("nan", " ").split(",")]
    )

# Feature Engineering

In [6]:
# Replace the wind direction angle (degrees) by its cosine and sine components,
# then discard the original angle column
df["wind_dir_x"] = df["wind_dir"].apply(
    lambda angles: list(np.cos(np.array(angles) * np.pi / 180))
)
df["wind_dir_y"] = df["wind_dir"].apply(
    lambda angles: list(np.sin(np.array(angles) * np.pi / 180))
)
df.drop(columns=["wind_dir"], inplace=True)

In [7]:
from scipy.optimize import curve_fit

def fit_time(time_series, return_fit_curve=False):
    """Estimate the time-of-day phase of an hourly time series by fitting a cosine.

    The model ``y0 + amp * -cos(phi + x*pi/12)`` is fitted to the non-NaN
    samples, where ``x`` is the sample index (period of 24 samples).

    Args:
        time_series (list): a list describing the time series; must be hourly.
            NaN entries are dropped before fitting.
        return_fit_curve (bool, optional): whether to also return the x, y and
            fitted y values. Defaults to False.

    Returns:
        The phase angle describing the time of day, ranging from 0 to 2 pi.
        If ``return_fit_curve`` is True, the list ``[phi, x, y, y_fit]``
        instead, where ``x`` and ``y`` are the NaN-free samples used for the
        fit and ``y_fit`` is the fitted curve evaluated at ``x``.

    Raises:
        Errors raised by ``scipy.optimize.curve_fit`` (e.g. when the fit does
        not converge) propagate unchanged.
    """
    # pair each sample with its index and discard the missing readings
    xy = pd.DataFrame(
        {"x": np.arange(len(time_series)),
        "y": time_series}
    ).dropna()
    x = xy.x
    y = xy.y

    def tod_func(x, y0, amp, phi):
        return y0 + amp * -np.cos(phi + x*np.pi/12)

    # curve_fit returns (optimal parameters, covariance); only the parameters are used
    y0_fit, amp_fit, phi_fit = curve_fit(tod_func, x, y)[0]

    # a negative amplitude describes the same curve shifted by half a period:
    # fold the sign into the phase so the amplitude is always positive
    if amp_fit < 0:
        amp_fit = abs(amp_fit)
        phi_fit += np.pi

    # wrap the phase into a single period; the modulo with a positive divisor
    # never yields a negative value, so no further correction is needed
    phi_fit = phi_fit % (2*np.pi)

    if return_fit_curve:
        # evaluate the fitted curve only when the caller asked for it
        y_fit = tod_func(x, y0_fit, amp_fit, phi_fit)
        return [phi_fit, x, y, y_fit]
    return phi_fit

# fit the daily temperature cycle of every row to obtain its time-of-day phase angle
df["time_of_day_angle"] = df["temp"].apply(fit_time)

In [8]:
# columns whose time series are condensed into summary statistics
features = [
    "temp",
    "precip",
    "rel_humidity",
    "wind_dir_x",
    "wind_dir_y",
    "wind_spd",
    "atmos_press",
]

In [9]:
# Helper function for extracting the last recording of each feature
def last(feature):
    """Return the most recent usable reading of a time series.

    Candidates are tried in this order: the final entry, the second-to-last
    entry, and finally the entry at index -25 (24 positions before the final
    one; presumably the same hour on the previous day for hourly data -
    TODO confirm). The index -25 fallback is returned as is, even if it is
    NaN itself.

    Args:
        feature (list): readings in chronological order; may contain NaN.

    Returns:
        The selected reading, or NaN when the series is too short to provide
        any of the candidate positions (instead of raising IndexError).
    """
    n_readings = len(feature)
    # prefer the two most recent readings, skipping missing ones
    for offset in (1, 2):
        if n_readings >= offset and not pd.isna(feature[-offset]):
            return feature[-offset]
    # fall back to the reading 24 positions before the last one, if it exists
    if n_readings >= 25:
        return feature[-25]
    return np.nan

# function extracting and appending the last recording of each feature
def get_last(df, col_name):
    """Add a ``last_<col_name>`` column holding the value ``last`` selects from each row's series.

    The frame is modified in place and also returned.
    """
    target_column = "last_" + col_name
    df[target_column] = df[col_name].apply(last)
    return df

# function returning only non-Null values (helper for aggregation function)
def remove_nan_values(case):
    """Return the entries of ``case`` that are not NaN, keeping their order."""
    kept = []
    for obs in case:
        if math.isnan(obs):
            continue
        kept.append(obs)
    return kept

# function for aggregating features (mean and standard deviation per time series)
def aggregate_features(df, col_name):
    """Add ``mean_<col_name>`` and ``std_<col_name>`` columns to ``df``.

    Every cell of ``df[col_name]`` is reduced with ``np.mean`` and ``np.std``
    respectively. The frame is modified in place and also returned.
    """
    for prefix, reducer in (("mean_", np.mean), ("std_", np.std)):
        df[prefix + col_name] = df[col_name].apply(reducer)
    return df

In [10]:
# add a "last_<feature>" column with the most recent reading of every time series
for col_name in features:
    df = get_last(df, col_name)

In [11]:
# compute mean/std per time series on NaN-free copies of the raw lists
for col_name in features:
    raw_series = df[col_name]
    # aggregate_features reads df[col_name], so temporarily swap in the NaN-free lists
    df[col_name] = raw_series.apply(remove_nan_values)
    df = aggregate_features(df, col_name)
    # restore the raw lists (NaNs included): the percent-NaN computation further down
    # needs the missing readings to still be present, otherwise it always yields 0
    df[col_name] = raw_series

# Tidying

### Remove rows (recording periods) with too many missing values

In [12]:
# recording periods contain varying degrees of NaNs: compute percent NaN for each recording period and feature
# function to compute the percentage of NaNs per recording period
def compute_percent_nan(df, col_name):
    """Add a ``percent_nan_<col_name>`` column with the percentage of NaN entries in each row's series.

    The frame is modified in place and also returned.
    """
    def percent_missing(values):
        # number of NaN entries relative to the series length, in percent
        return np.isnan(np.array(values)).sum()/len(values)*100

    df['percent_nan_'+col_name] = df[col_name].apply(percent_missing)
    return df

In [13]:
# names of the percent-NaN helper columns, one per feature
nan_columns = ['percent_nan_' + col_name for col_name in features]

# append the percentage of missing readings per recording period for every feature
for col_name in features:
    df = compute_percent_nan(df, col_name)

In [14]:
# keep only the rows in which every feature has less than 30 percent missing readings
within_nan_limit = (df[nan_columns] < 30).all(axis=1)
df = df[within_nan_limit]

In [15]:
# remove, in one pass, the raw sensor lists, the percent-NaN helper columns and the ID column
df.drop(columns=features + nan_columns + ['ID'], inplace=True)

# discard rows that still contain missing values (e.g. in the 'last observation' columns)
df.dropna(inplace=True)

In [16]:
# target and feature column names: everything except the first column (location)
colnames = df.columns.to_list()[1:]

#### Write to CSV

In [17]:
# persist the cleaned feature table; with the pandas default (index=True) the row index
# is written as the first, unnamed CSV column
# NOTE(review): confirm that downstream readers expect that index column
df.to_csv('data/air_quality_final.csv')