# **Pre-processing**

---

## **Import Libraries & Data Loading**

In [3]:
import os
import os.path as osp

import pandas as pd

# Dataset root, given relative to the notebook's working directory; every
# CSV path read or written in this notebook is built from it.
data_folder = osp.join('..', 'resource', 'dataset')

In [4]:
# Attribute catalogue. The 'weather_description' entry is filtered out here,
# so no raw table is loaded for it below.
metadata = pd.read_csv(osp.join(data_folder, 'metadata.csv'))
metadata = metadata[metadata['attributes'] != 'weather_description']

city_attri_df = pd.read_csv(osp.join(data_folder, 'raw', 'city_attributes.csv'))

# One raw table per remaining attribute, keyed by attribute name.
# Paths are built with osp.join, like every other path in this notebook,
# instead of a hard-coded '/' separator.
feature_dfs = {
    attribute: pd.read_csv(osp.join(data_folder, 'raw', f'{attribute}.csv'))
    for attribute in metadata['attributes']
}

In [None]:
def preprocess_features(
    feature_df: pd.DataFrame,
    excluded_date: str = "2012-10-01",
) -> pd.DataFrame:
    """Aggregate a raw feature table into one mean row per calendar date.

    Steps, in order:
      1. Parse the ``datetime`` column and replace it with a ``date`` column
         holding the calendar date of each record.
      2. Drop the row labelled ``0``.
      3. Forward-fill the remaining missing values.
      4. Remove every record that falls on ``excluded_date``.
      5. Group by ``date`` and average the numeric columns.

    The input frame is not modified.

    Args:
        feature_df: Raw table with a ``datetime`` column and a row labelled
            ``0``; any other numeric columns are averaged per date.
        excluded_date: Date (any string ``pd.to_datetime`` can parse) whose
            records are discarded before aggregation. Defaults to
            ``"2012-10-01"``.

    Returns:
        A new DataFrame with a ``date`` column followed by the per-date mean
        of every numeric column.

    Raises:
        KeyError: If ``feature_df`` has no ``datetime`` column or no row
            labelled ``0``.
    """
    excluded_day = pd.to_datetime(excluded_date).date()

    # Every step below returns a new frame, so the caller's data is untouched.
    records = (
        feature_df
        .assign(date=pd.to_datetime(feature_df['datetime']).dt.date)
        .drop(columns=['datetime'])
        .drop(index=0)
        # Forward-fill runs before the date filter, so values recorded on the
        # excluded date can still fill gaps on the following date.
        .ffill()
    )
    records = records[records['date'] != excluded_day]

    # aggregate by date and take mean
    return (
        records
        .groupby('date')
        .mean(numeric_only=True)
        .reset_index()
    )

## **Preprocess data**

In [6]:
# Replace each raw table with its preprocessed version, keeping the same keys.
# NOTE: this cell is not re-runnable on its own -- the preprocessed tables no
# longer have the 'datetime' column that preprocess_features reads, so reload
# the raw data before running it again.
for attribute in list(feature_dfs):
    feature_dfs[attribute] = preprocess_features(feature_dfs[attribute])

In [7]:
# Sanity check: total number of missing cells left in each preprocessed table.
null_counts = {}
for name, frame in feature_dfs.items():
    null_counts[name] = frame.isna().to_numpy().sum()

# Single-row summary: one column per attribute.
pd.DataFrame(null_counts, index=['Null Values'])

Unnamed: 0,humidity,pressure,temperature,wind_direction,wind_speed
Null Values,0,0,0,0,0


## **Save preprocessed data**

In [8]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first (e.g. on a fresh checkout).
preprocessed_folder = osp.join(data_folder, 'preprocessed')
os.makedirs(preprocessed_folder, exist_ok=True)

# Write one CSV per attribute, without the positional index column.
for attribute, df in feature_dfs.items():
    df.to_csv(osp.join(preprocessed_folder, f'{attribute}.csv'), index=False)