# **Pre-processing**

---

## **Import Libraries & Data Loading**

In [72]:
import os.path as osp

import pandas as pd

data_folder = osp.join('..', 'resource', 'dataset')

In [None]:
city_attri_df = pd.read_csv(osp.join(data_folder, 'raw', 'city_attributes.csv'))

humidity_df = pd.read_csv(osp.join(data_folder, 'raw', 'humidity.csv'))
temp_df = pd.read_csv(osp.join(data_folder, 'raw','temperature.csv'))
pressure_df = pd.read_csv(osp.join(data_folder, 'raw', 'pressure.csv'))
wind_direct_df = pd.read_csv(osp.join(data_folder, 'raw', 'wind_direction.csv'))
wind_speed_df = pd.read_csv(osp.join(data_folder, 'raw', 'wind_speed.csv'))
weather_descript_df = pd.read_csv(osp.join(data_folder, 'raw', 'weather_description.csv'))

In [None]:
def preprocess_features(
    feature_df: pd.DataFrame
) -> pd.DataFrame:
    # date & time
    feature_df = feature_df.copy(True)
    feature_df['datetime'] = pd.to_datetime(feature_df['datetime'])
    feature_df['date'] = feature_df['datetime'].dt.date

    feature_df.drop(columns=['datetime'], inplace=True)

    # fill missing values with forward fill
    feature_df.drop(0, axis=0, inplace=True)
    feature_df.ffill(inplace=True)


    # aggregate by date and take mean
    return (feature_df
        .groupby(feature_df['date'])
        .mean(numeric_only=True)
        .reset_index()
    )

## **Humidity**

In [75]:
humidity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          43427 non-null  float64
 2   Portland           44804 non-null  float64
 3   San Francisco      44311 non-null  float64
 4   Seattle            44964 non-null  float64
 5   Los Angeles        45101 non-null  float64
 6   San Diego          44909 non-null  float64
 7   Las Vegas          44411 non-null  float64
 8   Phoenix            43945 non-null  float64
 9   Albuquerque        44543 non-null  float64
 10  Denver             43445 non-null  float64
 11  San Antonio        44689 non-null  float64
 12  Dallas             44934 non-null  float64
 13  Houston            45132 non-null  float64
 14  Kansas City        44741 non-null  float64
 15  Minneapolis        44743 non-null  float64
 16  Saint Louis        439

In [None]:
humidity_df = preprocess_features(humidity_df)
humidity_df

Unnamed: 0,date,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01,78.272727,78.727273,83.0,78.0,88.0,79.909091,20.727273,24.181818,48.909091,...,68.272727,56.272727,75.727273,68.0,54.818182,62.909091,23.090909,51.0,51.0,50.0
1,2012-10-02,77.375,65.833333,65.958333,65.166667,64.916667,65.875,16.166667,23.5,39.625,...,54.5,50.5,47.0,68.75,72.916667,69.708333,33.833333,53.5,53.5,52.75
2,2012-10-03,71.625,66.208333,47.083333,58.416667,44.875,59.75,15.458333,18.0,33.458333,...,78.791667,68.541667,91.875,86.041667,75.541667,77.208333,47.291667,69.375,69.75,62.291667
3,2012-10-04,49.541667,51.166667,60.166667,48.125,66.625,75.583333,15.166667,19.833333,21.625,...,89.041667,95.0,95.458333,84.75,67.458333,64.625,49.333333,71.25,71.25,63.375
4,2012-10-05,43.125,39.75,69.666667,48.25,70.25,72.791667,14.166667,27.333333,17.958333,...,84.75,93.041667,89.833333,73.333333,65.666667,63.416667,46.875,70.5,70.5,67.25


In [78]:
humidity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1887 non-null   object 
 1   Vancouver          1887 non-null   float64
 2   Portland           1887 non-null   float64
 3   San Francisco      1887 non-null   float64
 4   Seattle            1887 non-null   float64
 5   Los Angeles        1887 non-null   float64
 6   San Diego          1887 non-null   float64
 7   Las Vegas          1887 non-null   float64
 8   Phoenix            1887 non-null   float64
 9   Albuquerque        1887 non-null   float64
 10  Denver             1887 non-null   float64
 11  San Antonio        1887 non-null   float64
 12  Dallas             1887 non-null   float64
 13  Houston            1887 non-null   float64
 14  Kansas City        1887 non-null   float64
 15  Minneapolis        1887 non-null   float64
 16  Saint Louis        1887 

## **Pressure**

In [79]:
pressure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          41019 non-null  float64
 2   Portland           45249 non-null  float64
 3   San Francisco      44438 non-null  float64
 4   Seattle            45240 non-null  float64
 5   Los Angeles        45001 non-null  float64
 6   San Diego          45078 non-null  float64
 7   Las Vegas          45165 non-null  float64
 8   Phoenix            44659 non-null  float64
 9   Albuquerque        44797 non-null  float64
 10  Denver             44710 non-null  float64
 11  San Antonio        45236 non-null  float64
 12  Dallas             45193 non-null  float64
 13  Houston            45244 non-null  float64
 14  Kansas City        45132 non-null  float64
 15  Minneapolis        45236 non-null  float64
 16  Saint Louis        451

In [None]:
pressure_df = preprocess_features(pressure_df)
pressure_df

Unnamed: 0,date,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01,,1024.0,1009.727273,1030.0,1013.0,1013.0,1018.0,1013.0,1024.0,...,1014.0,1012.0,875.545455,1014.0,984.0,1012.0,1010.0,1013.0,1013.0,990.0
1,2012-10-02,972.8,1023.583333,1011.25,1034.291667,1012.75,1012.791667,1018.041667,1012.708333,1022.583333,...,1013.791667,1011.958333,890.625,1013.791667,984.583333,1012.583333,1011.458333,1013.0,1013.0,990.5
2,2012-10-03,1009.75,1021.083333,1011.041667,1028.833333,1009.875,1010.666667,1013.833333,1011.041667,1021.833333,...,1014.583333,1015.541667,1013.083333,1013.416667,986.0,1013.75,1012.958333,1013.0,1013.0,990.791667
3,2012-10-04,1018.416667,1022.875,1010.583333,1032.791667,1012.375,1013.083333,1011.583333,1011.791667,1018.375,...,1016.458333,1016.875,1018.0,1018.208333,986.541667,1014.208333,1013.208333,1013.0,1013.0,991.0
4,2012-10-05,1023.166667,1022.916667,1015.041667,1025.458333,1017.125,1017.875,1013.625,1015.166667,1021.0,...,1019.625,1019.25,1019.458333,1021.375,984.166667,1012.541667,1011.916667,1013.0,1013.0,989.75


In [82]:
pressure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1887 non-null   object 
 1   Vancouver          1886 non-null   float64
 2   Portland           1887 non-null   float64
 3   San Francisco      1887 non-null   float64
 4   Seattle            1887 non-null   float64
 5   Los Angeles        1887 non-null   float64
 6   San Diego          1887 non-null   float64
 7   Las Vegas          1887 non-null   float64
 8   Phoenix            1887 non-null   float64
 9   Albuquerque        1887 non-null   float64
 10  Denver             1887 non-null   float64
 11  San Antonio        1887 non-null   float64
 12  Dallas             1887 non-null   float64
 13  Houston            1887 non-null   float64
 14  Kansas City        1887 non-null   float64
 15  Minneapolis        1887 non-null   float64
 16  Saint Louis        1887 

## **Temperature**

In [83]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          44458 non-null  float64
 2   Portland           45252 non-null  float64
 3   San Francisco      44460 non-null  float64
 4   Seattle            45250 non-null  float64
 5   Los Angeles        45250 non-null  float64
 6   San Diego          45252 non-null  float64
 7   Las Vegas          45252 non-null  float64
 8   Phoenix            45250 non-null  float64
 9   Albuquerque        45252 non-null  float64
 10  Denver             45252 non-null  float64
 11  San Antonio        45252 non-null  float64
 12  Dallas             45249 non-null  float64
 13  Houston            45250 non-null  float64
 14  Kansas City        45252 non-null  float64
 15  Minneapolis        45240 non-null  float64
 16  Saint Louis        452

In [None]:
temp_df = preprocess_features(temp_df)
temp_df

Unnamed: 0,date,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01,284.620769,282.118197,289.416642,281.767262,291.846501,291.573495,293.358911,296.701739,285.476208,...,286.043165,288.56942,285.88798,287.371091,306.621486,304.248983,310.158846,304.4,304.4,303.5
1,2012-10-02,286.14519,286.137728,292.958306,285.156888,295.89045,295.291472,297.248385,301.211968,289.771821,...,289.239595,290.892389,286.937931,289.01309,302.226773,302.787467,306.759071,303.9,303.9,302.675
2,2012-10-03,285.528125,289.599792,296.929167,287.673958,299.008542,297.87875,300.691875,302.867083,291.205417,...,290.353542,290.065625,287.374583,289.020833,301.194375,301.687917,303.289583,301.561042,301.5025,301.258125
3,2012-10-04,284.373333,286.4825,295.687083,284.391667,295.997917,296.080833,301.82,302.232917,293.09625,...,293.63375,291.987083,286.860833,290.04375,300.094167,299.94,301.770208,299.139167,299.139167,298.924167
4,2012-10-05,283.757292,288.286042,290.635417,284.75625,292.948333,293.894375,300.628542,301.81125,292.829167,...,294.015833,294.043542,287.535208,289.517292,299.712083,300.153125,299.86,298.8775,298.8775,297.5475


In [86]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1887 non-null   object 
 1   Vancouver          1887 non-null   float64
 2   Portland           1887 non-null   float64
 3   San Francisco      1887 non-null   float64
 4   Seattle            1887 non-null   float64
 5   Los Angeles        1887 non-null   float64
 6   San Diego          1887 non-null   float64
 7   Las Vegas          1887 non-null   float64
 8   Phoenix            1887 non-null   float64
 9   Albuquerque        1887 non-null   float64
 10  Denver             1887 non-null   float64
 11  San Antonio        1887 non-null   float64
 12  Dallas             1887 non-null   float64
 13  Houston            1887 non-null   float64
 14  Kansas City        1887 non-null   float64
 15  Minneapolis        1887 non-null   float64
 16  Saint Louis        1887 

## **Wind Direction**

In [87]:
wind_direct_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          44458 non-null  float64
 2   Portland           45252 non-null  float64
 3   San Francisco      44459 non-null  float64
 4   Seattle            45252 non-null  float64
 5   Los Angeles        45252 non-null  float64
 6   San Diego          45252 non-null  float64
 7   Las Vegas          45248 non-null  float64
 8   Phoenix            45252 non-null  float64
 9   Albuquerque        45252 non-null  float64
 10  Denver             45252 non-null  float64
 11  San Antonio        45252 non-null  float64
 12  Dallas             45252 non-null  float64
 13  Houston            45251 non-null  float64
 14  Kansas City        45252 non-null  float64
 15  Minneapolis        45252 non-null  float64
 16  Saint Louis        452

In [None]:
wind_direct_df = preprocess_features(wind_direct_df)
wind_direct_df

Unnamed: 0,date,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01,61.818182,57.727273,122.363636,32.272727,0.0,0.0,65.727273,7.727273,360.0,...,275.0,261.363636,235.727273,61.363636,155.0,248.181818,30.0,336.0,336.0,329.0
1,2012-10-02,209.5,214.041667,102.5,126.583333,49.833333,95.833333,168.375,22.833333,303.458333,...,278.041667,268.125,260.583333,70.416667,160.791667,136.166667,90.0,298.75,298.75,275.5
2,2012-10-03,162.916667,228.333333,111.25,97.708333,63.375,142.708333,157.916667,127.833333,132.708333,...,96.416667,118.916667,227.583333,154.958333,132.958333,207.5,139.375,261.5,262.5,245.291667
3,2012-10-04,87.166667,206.75,162.5,220.416667,19.458333,119.166667,218.958333,94.666667,144.541667,...,100.5,71.958333,101.583333,188.75,185.0,229.291667,168.541667,267.208333,267.208333,286.166667
4,2012-10-05,147.916667,182.25,205.833333,103.75,119.75,177.416667,214.375,124.0,225.875,...,70.0,49.583333,66.333333,245.833333,195.666667,200.291667,116.458333,270.25,270.25,288.5


In [90]:
wind_direct_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1887 non-null   object 
 1   Vancouver          1887 non-null   float64
 2   Portland           1887 non-null   float64
 3   San Francisco      1887 non-null   float64
 4   Seattle            1887 non-null   float64
 5   Los Angeles        1887 non-null   float64
 6   San Diego          1887 non-null   float64
 7   Las Vegas          1887 non-null   float64
 8   Phoenix            1887 non-null   float64
 9   Albuquerque        1887 non-null   float64
 10  Denver             1887 non-null   float64
 11  San Antonio        1887 non-null   float64
 12  Dallas             1887 non-null   float64
 13  Houston            1887 non-null   float64
 14  Kansas City        1887 non-null   float64
 15  Minneapolis        1887 non-null   float64
 16  Saint Louis        1887 

## **Wind Speed**

In [91]:
wind_speed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          44458 non-null  float64
 2   Portland           45252 non-null  float64
 3   San Francisco      44459 non-null  float64
 4   Seattle            45252 non-null  float64
 5   Los Angeles        45252 non-null  float64
 6   San Diego          45252 non-null  float64
 7   Las Vegas          45238 non-null  float64
 8   Phoenix            45251 non-null  float64
 9   Albuquerque        45249 non-null  float64
 10  Denver             45251 non-null  float64
 11  San Antonio        45252 non-null  float64
 12  Dallas             45251 non-null  float64
 13  Houston            45250 non-null  float64
 14  Kansas City        45252 non-null  float64
 15  Minneapolis        45250 non-null  float64
 16  Saint Louis        452

In [None]:
wind_speed_df = preprocess_features(wind_speed_df)
wind_speed_df

Unnamed: 0,date,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01,0.0,0.0,1.636364,0.0,0.0,0.0,0.0,1.636364,4.0,...,3.181818,6.363636,3.272727,3.0,2.363636,0.454545,8.0,2.0,2.0,2.0
1,2012-10-02,0.0,1.291667,1.708333,0.625,0.375,1.125,1.375,1.0,3.166667,...,2.333333,4.833333,1.666667,3.25,2.166667,2.416667,7.541667,3.0,3.0,2.0
2,2012-10-03,0.541667,2.625,2.375,1.583333,0.541667,1.083333,1.25,1.25,2.958333,...,1.833333,2.416667,1.333333,4.083333,1.083333,2.5,5.125,4.708333,4.75,2.166667
3,2012-10-04,0.625,4.625,2.791667,3.375,0.25,1.0,1.958333,0.916667,1.916667,...,1.833333,1.875,1.0,2.833333,1.041667,2.041667,5.583333,3.25,3.25,2.791667
4,2012-10-05,0.416667,3.708333,2.125,2.625,0.916667,1.708333,3.083333,1.291667,2.583333,...,0.416667,1.375,1.708333,1.916667,1.458333,2.166667,6.041667,2.25,2.25,2.0


In [94]:
wind_speed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1887 non-null   object 
 1   Vancouver          1887 non-null   float64
 2   Portland           1887 non-null   float64
 3   San Francisco      1887 non-null   float64
 4   Seattle            1887 non-null   float64
 5   Los Angeles        1887 non-null   float64
 6   San Diego          1887 non-null   float64
 7   Las Vegas          1887 non-null   float64
 8   Phoenix            1887 non-null   float64
 9   Albuquerque        1887 non-null   float64
 10  Denver             1887 non-null   float64
 11  San Antonio        1887 non-null   float64
 12  Dallas             1887 non-null   float64
 13  Houston            1887 non-null   float64
 14  Kansas City        1887 non-null   float64
 15  Minneapolis        1887 non-null   float64
 16  Saint Louis        1887 

## **Save preprocessed data**

In [40]:
humidity_df.to_csv(osp.join(data_folder, 'preprocessed', 'humidity.csv'), index=False)
pressure_df.to_csv(osp.join(data_folder, 'preprocessed', 'pressure.csv'), index=False)
temp_df.to_csv(osp.join(data_folder, 'preprocessed', 'temperature.csv'), index=False)
wind_direct_df.to_csv(osp.join(data_folder, 'preprocessed', 'wind_direction.csv'), index=False)
wind_speed_df.to_csv(osp.join(data_folder, 'preprocessed', 'wind_speed.csv'), index=False)