# **Pre-processing**

---

## **Import Libraries & Data Loading**

In [20]:
import os
import os.path as osp

import pandas as pd

# Root of the dataset tree, expressed relative to the current working directory
# (one level up, then resource/dataset). Every read and write below is built on it.
data_folder = osp.join('..', 'resource', 'dataset')

In [21]:
# Attribute catalogue: each entry of the 'attributes' column names one raw CSV file.
metadata = pd.read_csv(osp.join(data_folder, 'metadata.csv'))
# 'weather_description' is excluded from the numeric pipeline.
# NOTE(review): presumably because it is free-text and cannot be averaged or scaled; confirm.
metadata = metadata[metadata['attributes'] != 'weather_description']

# Build every raw path with osp.join, like the rest of the notebook, instead of
# concatenating with a hard-coded '/' separator.
raw_folder = osp.join(data_folder, 'raw')

city_attri_df = pd.read_csv(osp.join(raw_folder, 'city_attributes.csv'))
# One DataFrame per remaining attribute, keyed by attribute name.
feature_dfs = {
    attribute: pd.read_csv(osp.join(raw_folder, f'{attribute}.csv'))
    for attribute in metadata['attributes']
}

In [22]:
def preprocess_features(
    feature_df: pd.DataFrame,
    skip_date: str = "2012-10-01",
) -> pd.DataFrame:
    """Clean one raw attribute table and aggregate it to one row per calendar date.

    Steps, in order:
      1. Parse the 'datetime' column and replace it with a 'date' column
         holding ``datetime.date`` objects.
      2. Drop the row whose index label is 0.
      3. Forward-fill the remaining missing values.
      4. Discard every row dated ``skip_date``.
      5. Group by date and take the mean of the numeric columns.

    The input frame is never modified.

    Args:
        feature_df: Raw table with a 'datetime' column parseable by
            ``pd.to_datetime`` and an index that contains the label 0.
        skip_date: Date (anything ``pd.to_datetime`` accepts) whose rows are
            removed before aggregation. Defaults to "2012-10-01".

    Returns:
        A new DataFrame with a 'date' column followed by the per-date means of
        the numeric columns, one row per remaining date.

    Raises:
        KeyError: If 'datetime' is missing or the index has no label 0.
    """
    # Calendar date of every reading; computed up front so 'datetime' can be dropped.
    dates = pd.to_datetime(feature_df['datetime']).dt.date

    cleaned = (
        feature_df
        .drop(columns=['datetime'])   # returns a new frame, so the caller's data is untouched
        .assign(date=dates)
        # NOTE(review): row 0 is presumably an empty leading record in the raw files; confirm.
        .drop(index=0)
        # Forward fill only: values missing before a column's first observation stay NaN.
        .ffill()
    )

    # Remove the excluded day after filling, so its readings can still seed the fill.
    excluded = pd.to_datetime(skip_date).date()
    cleaned = cleaned[cleaned['date'] != excluded]

    # Daily mean per column; reset_index turns the group key back into a 'date' column.
    return (cleaned
        .groupby('date')
        .mean(numeric_only=True)
        .reset_index()
    )

## **Preprocess data**

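Clean each attribute table: convert timestamps to calendar dates, forward-fill missing values, drop the date 2012-10-01, and average the readings per date.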

In [23]:
# Replace every raw table, in place in the dict, with its cleaned per-date
# version indexed by 'date'. Keys are snapshotted first so the dict can be
# reassigned while looping.
for attribute in list(feature_dfs):
    daily = preprocess_features(feature_dfs[attribute])
    feature_dfs[attribute] = daily.set_index('date')

In [24]:
# Sanity check: total number of missing cells left in each cleaned table.
null_counts = {}
for attribute, df in feature_dfs.items():
    null_counts[attribute] = df.isna().to_numpy().sum()

# Single-row summary, rendered as the cell output.
pd.DataFrame(null_counts, index=['Null Values'])

Unnamed: 0,humidity,pressure,temperature,wind_direction,wind_speed
Null Values,0,0,0,0,0


Normalize the values of each attribute to the range [0, 1].

For attributes with a known fixed range, divide by the upper bound:

In [25]:
# Upper bound of each fixed-range attribute (the lower bound is 0 for both):
#   - humidity is a percentage in [0, 100]
#   - wind direction is in meteorological degrees [0, 360], clockwise starting from North
# NOTE(review): the daily wind_direction values were produced by an arithmetic
# mean in preprocess_features; angles wrap at 360, so confirm that is intended.
bounded_maxima = {
    'humidity': 100,
    'wind_direction': 360,
}

for attribute, upper in bounded_maxima.items():
    feature_dfs[attribute] = feature_dfs[attribute] / upper

For attributes without a fixed range, apply min-max scaling using the global minimum and maximum of each table:

In [26]:
def _min_max_scale(frame: pd.DataFrame):
    """Min-max scale a whole table with a single global minimum and maximum.

    Args:
        frame: Numeric table to scale.

    Returns:
        A ``(scaled, lowest, highest)`` tuple: the rescaled table followed by
        the scalar minimum and maximum taken over every cell of ``frame``.
    """
    # axis=None reduces over rows and columns at once, giving one scalar per
    # table rather than one value per column.
    lowest, highest = frame.min(axis=None), frame.max(axis=None)
    # NOTE(review): a constant table (highest == lowest) divides by zero here.
    return (frame - lowest) / (highest - lowest), lowest, highest


# The bounds are kept as named variables because the save step below records them.
feature_dfs['pressure'], pressure_min, pressure_max = _min_max_scale(feature_dfs['pressure'])

feature_dfs['temperature'], temper_min, temper_max = _min_max_scale(feature_dfs['temperature'])

feature_dfs['wind_speed'], wind_min, wind_max = _min_max_scale(feature_dfs['wind_speed'])

## **Save preprocessed data**

In [31]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before anything is written into it.
preprocessed_folder = osp.join(data_folder, 'preprocessed')
os.makedirs(preprocessed_folder, exist_ok=True)

# Persist the min/max used to scale every attribute (one column per attribute,
# rows 'min' and 'max').
# NOTE(review): presumably read back later to invert the scaling; confirm with consumers.
pd.DataFrame({
    'humidity': {'min': 0, 'max': 100},
    'pressure': {'min': pressure_min, 'max': pressure_max},
    'temperature': {'min': temper_min, 'max': temper_max},
    'wind_direction': {'min': 0, 'max': 360},
    'wind_speed': {'min': wind_min, 'max': wind_max},
}).to_csv(osp.join(preprocessed_folder, 'metascale.csv'))

In [32]:
# Write each normalized table to <data_folder>/preprocessed/<attribute>.csv,
# keeping the 'date' index as the first CSV column.
output_folder = osp.join(data_folder, 'preprocessed')
for name, frame in feature_dfs.items():
    target_path = osp.join(output_folder, f'{name}.csv')
    frame.to_csv(target_path)