# 2. Features generation

This notebook computes features for the modelling piece.

### Index:

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Read the primary weather table that every feature below is derived from.
primary_path = '../data/03_primary/weather_primary.parquet'
weather = pd.read_parquet(primary_path)

# 1. Date features creation
In time series prediction, the day and the month can be important for capturing seasonal patterns.
The year is not included, to avoid overfitting when predicting the future.

In [3]:
# Calendar components of each observation date: month number and day of month.
observation_date = weather['Date']
weather['Date_month'] = observation_date.dt.month
weather['Date_day'] = observation_date.dt.day

# 2. Character features encoding
The model may need string variables to be converted to numeric values before it can use them.

## 2.1 Location

In [4]:
# Give each distinct location an integer code. The names are sorted first so
# the same location always receives the same code, keeping the run replicable.
locations = {
    name: code
    for code, name in enumerate(sorted(weather.Location.unique()))
}
# Preview the first three name -> code pairs.
{name: locations[name] for name in list(locations)[:3]}

{'Adelaide': 0, 'Albany': 1, 'Albury': 2}

In [5]:
# Create the numeric feature.
# `Series.map` performs a direct dictionary lookup and yields a numeric column.
# `Series.replace` with a dict only produced numbers through an implicit
# object -> numeric downcast, which pandas has deprecated; without that
# downcast the "numeric" feature would stay object-typed.
# `locations` covers every value of `Location` (it is built from its unique
# values), so the resulting codes are the same as before.
weather['Location_encoded'] = weather.Location.map(locations)
weather[['Location', 'Location_encoded']].drop_duplicates().head(3)

Unnamed: 0,Location,Location_encoded
0,Albury,2
3040,BadgerysCreek,4
6049,Cobar,10


Also, because `Location` is a categorical variable, we apply one-hot encoding:

In [19]:
# One-hot encode Location: one 0/1 indicator column per location, named
# 'Location.<name>' and created in order of first appearance in the data.
#
# The list of values gets its own name so that the `locations` name -> code
# dictionary built earlier is not overwritten: rebinding `locations` to a list
# leaves the `Location_encoded` cell without a mapping if it is re-run after
# this one.
location_names = list(weather.Location.unique())
header = ['Location']
for loc in location_names:
    column = 'Location.' + str(loc)
    weather[column] = np.where(weather.Location == loc, 1, 0)
    header.append(column)
# Preview: one row per distinct combination of location and indicator values.
weather[header].drop_duplicates().head()

Unnamed: 0,Location,Location.Albury,Location.BadgerysCreek,Location.Cobar,Location.CoffsHarbour,Location.Moree,Location.Newcastle,Location.NorahHead,Location.NorfolkIsland,Location.Penrith,...,Location.PerthAirport,Location.Perth,Location.SalmonGums,Location.Walpole,Location.Hobart,Location.Launceston,Location.AliceSprings,Location.Darwin,Location.Katherine,Location.Uluru
0,Albury,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3040,BadgerysCreek,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6049,Cobar,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9058,CoffsHarbour,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12067,Moree,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.2 Wind direction
<img src="https://cdn.windfinder.com/prod/images/assets_png/wind_directions.dad84db0.png">

In [20]:
# Integer code for each of the 16 compass points, clockwise starting at north.
wind_dir = {
    'N': 0,
    'NNE': 1,
    'NE': 2,
    'ENE': 3,
    'E': 4,
    'ESE': 5,
    'SE': 6,
    'SSE': 7,
    'S': 8,
    'SSW': 9,
    'SW': 10,
    'WSW': 11,
    'W': 12,
    'WNW': 13,
    'NW': 14,
    'NNW': 15,
}

# Apply the same mapping to every wind-direction column.
# `Series.map` is used instead of `Series.replace`: it returns a numeric column
# without relying on the object -> numeric downcast that pandas has deprecated
# for `replace`. Missing directions stay NaN, and any value that is not one of
# the 16 keys above also becomes NaN instead of remaining a string inside a
# feature that is meant to be numeric.
wind_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
for wind_column in wind_columns:
    weather[wind_column + '_encoded'] = weather[wind_column].map(wind_dir)

# Preview the raw directions next to their codes.
encoded_columns = [wind_column + '_encoded' for wind_column in wind_columns]
weather[wind_columns + encoded_columns].drop_duplicates().head(3)

Unnamed: 0,WindGustDir,WindDir9am,WindDir3pm,WindGustDir_encoded,WindDir9am_encoded,WindDir3pm_encoded
0,W,W,WNW,12.0,12.0,13.0
1,WNW,NNW,WSW,13.0,15.0,11.0
2,WSW,W,WSW,11.0,12.0,11.0


# 3. Save the table as the master table for modelling

In [21]:
# Write the frame, including every feature column added above, as the master table.
master_path = '../data/04_model_input/master.parquet'
weather.to_parquet(master_path)