# **Pre-processing**

---

## **Import Libraries & Data Loading**

In [37]:
import datetime
import os
import os.path as osp

import pandas as pd

# Root of the dataset tree, given relative to the working directory; every
# input and output path in this notebook is built from it.
data_folder = osp.join('..', 'resource', 'dataset')

In [38]:
# All raw CSVs live in one sub-folder; build every path with osp.join so the
# separator handling is the same for each file.
raw_folder = osp.join(data_folder, 'raw')

# Catalogue of attributes; 'weather_description' is filtered out, so it is
# not loaded into feature_dfs below.
metadata = pd.read_csv(osp.join(data_folder, 'metadata.csv'))
metadata = metadata[metadata['attributes'] != 'weather_description']

city_attri_df = pd.read_csv(osp.join(raw_folder, 'city_attributes.csv'))

# One raw table per remaining attribute, keyed by attribute name.
feature_dfs = {
    attribute: pd.read_csv(osp.join(raw_folder, f'{attribute}.csv'))
    for attribute in metadata['attributes']
}

In [39]:
def preprocess_features(
    feature_df: pd.DataFrame
) -> pd.DataFrame:
    """Collapse a raw feature table into one mean row per calendar date.

    Steps, in order:
      1. derive a ``date`` column from ``datetime`` and discard ``datetime``;
      2. drop the row labelled ``0``;
      3. forward-fill the remaining missing values;
      4. discard every row dated 2012-10-01;
      5. average the numeric columns per date.

    The caller's frame is never modified; all work happens on a copy.

    Args:
        feature_df: Table with a ``datetime`` column parseable by
            ``pd.to_datetime`` and an index that contains the label ``0``.

    Returns:
        A frame whose first column is ``date`` (``datetime.date`` values),
        followed by the per-date means of the numeric columns.

    Raises:
        KeyError: If ``datetime`` is missing or the index has no label ``0``.
    """
    excluded_day = pd.to_datetime("2012-10-01").date()

    timestamps = pd.to_datetime(feature_df['datetime'])
    working = (
        feature_df
        .copy(deep=True)
        .assign(date=timestamps.dt.date)
        .drop(columns=['datetime'])
        .drop(index=0)   # first row is removed before filling
        .ffill()         # carry the last seen value into each gap
    )

    kept = working.loc[working['date'] != excluded_day]
    daily_means = kept.groupby('date').mean(numeric_only=True)
    return daily_means.reset_index()

## **Preprocess data**

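For each attribute: derive a calendar date from the timestamp, drop the first row, forward-fill missing values, discard all rows dated 2012-10-01, and average the readings into one row per date.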

In [40]:
# Swap every raw table for its per-date mean version, indexed by date.
# NOTE: this cell cannot be re-run on its own: preprocess_features needs the
# 'datetime' column, which no longer exists after the first pass. Re-run the
# loading cell first.
feature_dfs = {
    attribute: preprocess_features(raw_df).set_index('date')
    for attribute, raw_df in feature_dfs.items()
}

In [41]:
# Sanity check: total number of missing cells left in each preprocessed table
# (sum per column, then across columns).
null_counts = {
    attribute: df.isna().sum().sum()
    for attribute, df in feature_dfs.items()
}
pd.DataFrame(null_counts, index=['Null Values'])

Unnamed: 0,humidity,pressure,temperature,wind_direction,wind_speed
Null Values,0,0,0,0,0


## **Train-test split**
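Rows dated on or after `test_date` form the test set and all earlier rows form the training set. Both sets are then min-max normalized.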

In [42]:
# First calendar date assigned to the test split; every earlier date is
# training data. (datetime is imported in the notebook's top import cell.)
test_date = datetime.date(2016, 1, 1)

In [43]:
# Boolean mask per attribute: True where the row's date is on or after the cutoff.
is_test = {
    attribute: df.index >= test_date
    for attribute, df in feature_dfs.items()
}

# Rows before the cutoff go to train, the rest to test.
train_set = {
    attribute: df[~is_test[attribute]]
    for attribute, df in feature_dfs.items()
}
test_set = {
    attribute: df[is_test[attribute]]
    for attribute, df in feature_dfs.items()
}

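Min-max normalize every attribute. Humidity and wind direction use the fixed ranges 0–100 and 0–360; pressure, temperature and wind speed use the minimum and maximum observed in the training set, so no test-set statistics enter the scaling.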

In [44]:
def _train_range(attribute):
    """Return (min, max) over every cell of the training frame for `attribute`."""
    frame = train_set[attribute]
    # Reduce per column first, then across columns. This yields a scalar on
    # every pandas version, whereas min(axis=None) only does so on pandas >= 2.0
    # (older versions return a per-column Series).
    return frame.min().min(), frame.max().max()


pressure_min, pressure_max = _train_range('pressure')
temper_min, temper_max = _train_range('temperature')
wind_min, wind_max = _train_range('wind_speed')

# Scaling bounds: one column per attribute, rows 'min' and 'max'.
# Humidity and wind direction use fixed ranges; the other attributes use the
# extremes observed in the training split only.
metascale = pd.DataFrame({
    'humidity': {'min': 0, 'max': 100},
    'pressure': {'min': pressure_min, 'max': pressure_max},
    'temperature': {'min': temper_min, 'max': temper_max},
    'wind_direction': {'min': 0, 'max': 360},
    'wind_speed': {'min': wind_min, 'max': wind_max},
})

In [45]:
# Min-max scale both splits with the same bounds from metascale.
# NOTE: the splits are overwritten with their scaled versions, so running
# this cell twice scales the data twice.
for attribute in metascale.columns:
    bounds = metascale[attribute]
    lower = bounds['min']
    span = bounds['max'] - lower

    for split in (train_set, test_set):
        split[attribute] = (split[attribute] - lower) / span

## **Save preprocessed data**

In [47]:
preprocessed_dir = osp.join(data_folder, 'preprocessed')
split_dir = osp.join(data_folder, 'train-test-split')
train_dir = osp.join(split_dir, 'train')
test_dir = osp.join(split_dir, 'test')

# DataFrame.to_csv does not create missing parent directories, so make sure
# the whole output tree exists before writing (no-op when already present).
for directory in (preprocessed_dir, train_dir, test_dir):
    os.makedirs(directory, exist_ok=True)

# Scaling bounds are stored next to the splits they were applied to.
metascale.to_csv(osp.join(split_dir, 'metascale.csv'))

for attribute, df in feature_dfs.items():
    filename = f'{attribute}.csv'
    # feature_dfs holds the unscaled per-date tables; train_set / test_set
    # hold the min-max scaled splits.
    df.to_csv(osp.join(preprocessed_dir, filename))
    train_set[attribute].to_csv(osp.join(train_dir, filename))
    test_set[attribute].to_csv(osp.join(test_dir, filename))