# Feature Engineering

> Feature engineering is the process of using domain knowledge of the data to create features that make machine learning algorithms work. (wikipedia)

In [None]:
from modules.utils import get_data,is_day_off
df = get_data()

In [None]:
df.head()

## Currently, what is $X$ and what is $Y$?

## Workshop : find some new features (15min)
Only use available data: 
- Consumption for each day
- Temperature for each day
- is_day_off function

# is_day_off

In [None]:
def compute_day_off(date):
    """Return 1 when `is_day_off(date)` is truthy, 0 otherwise."""
    return 1 if is_day_off(date) else 0

# Apply compute_day_off to every value of the Date column and keep the 0/1 flag as a feature.
df['is_day_off'] = df['Date'].apply(compute_day_off)

In [None]:
df.head()

In [None]:
import seaborn as sns
%matplotlib inline

# Name the column through the `x` keyword instead of passing the Series
# positionally: recent seaborn releases treat the first positional argument
# as `data`, so the positional form no longer reliably counts the values.
sns.countplot(x='is_day_off', data=df)

## conso_24_lag
Since the cleaning step ensures that every day has all of its half-hour rows (48 per day), we can safely shift the dataset by 48 rows to get the consumption 24 hours earlier.  
Using a 24-hour lag is legitimate because we only predict the next day, so these values are already known when the prediction is made.

In [None]:
# Shift by 48 rows: with one row per half hour, each row gets the consumption observed 24 hours earlier.
df['conso_24_lag'] = df['Conso'].shift(48)

In [None]:
df.head()

In [None]:
import plotly.offline as py
import plotly.graph_objs as go

# One trace per series, both restricted to the first 200 rows,
# so the consumption and its 24h lag can be compared on the same chart.
data = [
    go.Scatter(x=df[:200]['Date'], y=df[:200][column])
    for column in ('Conso', 'conso_24_lag')
]
py.plot(data)

## conso_7_days_lag

In [None]:
# 336 rows = 48 half-hour rows x 7 days: the consumption at the same time one week earlier.
df['conso_7_days_lag'] = df['Conso'].shift(336)

In [None]:
df.sample(10)

# is_weekend

In [None]:
from datetime import datetime

def is_weekend(date):
    """Return 1 when `date` falls on a Saturday or a Sunday, 0 otherwise."""
    # weekday() numbers the days from Monday=0, so 5 and 6 are the weekend.
    return int(date.weekday() >= 5)

# Flag every row whose Date falls on a Saturday or a Sunday (1) or on another day (0).
df['is_weekend'] = df['Date'].apply(is_weekend)

In [None]:
df.sample(10)

# day_of_week

In [None]:
# Day of the week as an integer taken from the Date column (Monday=0 ... Sunday=6).
df['day_of_week']=df['Date'].dt.weekday

In [None]:
df.sample(5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 7))
sns.boxplot(x='day_of_week',y='Conso',data=df)

# temp_rolling_7_days

In [None]:
# Mean temperature over a trailing window of 48*7 rows (7 days of half-hour rows).
# The first rows, for which the window is not yet full, are left as NaN.
df["temp_rolling_7_days"] = df["Temp"].rolling(window=48*7).mean()

In [None]:
import plotly.offline as py
import plotly.graph_objs as go

# One trace per series, both restricted to the first 10000 rows,
# so the raw temperature and its 7-day rolling mean share the same chart.
data = [
    go.Scatter(x=df[:10000]['Date'], y=df[:10000][column])
    for column in ('Temp', 'temp_rolling_7_days')
]
py.plot(data)

# Heating and cooling degrees

In [None]:
def heating_degrees(temperature, base_temperature=18):
    """
    Return how many degrees `temperature` is below `base_temperature`.

    This is the quantity behind heating degree days (HDD), a measurement
    designed to quantify the demand for energy needed to heat a building:
    the number of degrees a temperature is below the temperature under
    which buildings need to be heated (18°C by default).

    Parameters
    ----------
    temperature : number
        Observed temperature, in °C.
    base_temperature : number, optional
        Temperature below which heating is assumed to be needed, in °C.
        Defaults to 18.

    Returns
    -------
    number
        ``base_temperature - temperature`` when that difference is
        positive, 0 otherwise. A NaN temperature yields NaN.
    """
    # The difference is deliberately the first argument: max() then keeps a
    # NaN difference as NaN instead of silently replacing it with 0.
    return max(base_temperature - temperature, 0)


def cooling_degrees(temperature, base_temperature=24):
    """
    Return how many degrees `temperature` is above `base_temperature`.

    This is the quantity behind cooling degree days (CDD), a measurement
    designed to quantify the demand for energy needed to cool a building:
    the number of degrees a temperature is above the base temperature
    (24°C by default).

    Parameters
    ----------
    temperature : number
        Observed temperature, in °C.
    base_temperature : number, optional
        Temperature above which cooling is assumed to be needed, in °C.
        Defaults to 24.

    Returns
    -------
    number
        ``temperature - base_temperature`` when that difference is
        positive, 0 otherwise. A NaN temperature yields NaN.
    """
    # The difference is deliberately the first argument: max() then keeps a
    # NaN difference as NaN instead of silently replacing it with 0.
    return max(temperature - base_temperature, 0)
    

In [None]:
# Derive both degree features from the Temp column, one value per row.
df["heating_degrees"] = df["Temp"].apply(heating_degrees)
df["cooling_degrees"] = df["Temp"].apply(cooling_degrees)

# Automate everything

In [None]:
def compute_day_off(date):
    """Turn the answer of `is_day_off(date)` into a 0/1 integer flag."""
    return 1 if is_day_off(date) else 0


def is_weekend(date):
    """Return 1 for a Saturday or a Sunday, 0 for any other day."""
    # weekday() numbers the days from Monday=0, so 5 and 6 are the weekend.
    return 1 if date.weekday() >= 5 else 0

def heating_degrees(temperature, base_temperature=18):
    """
    Return how many degrees `temperature` is below `base_temperature`.

    This is the quantity behind heating degree days (HDD), a measurement
    designed to quantify the demand for energy needed to heat a building:
    the number of degrees a temperature is below the temperature under
    which buildings need to be heated (18°C by default).

    Parameters
    ----------
    temperature : number
        Observed temperature, in °C.
    base_temperature : number, optional
        Temperature below which heating is assumed to be needed, in °C.
        Defaults to 18.

    Returns
    -------
    number
        ``base_temperature - temperature`` when that difference is
        positive, 0 otherwise. A NaN temperature yields NaN.
    """
    # The difference is deliberately the first argument: max() then keeps a
    # NaN difference as NaN instead of silently replacing it with 0.
    return max(base_temperature - temperature, 0)


def cooling_degrees(temperature, base_temperature=24):
    """
    Return how many degrees `temperature` is above `base_temperature`.

    This is the quantity behind cooling degree days (CDD), a measurement
    designed to quantify the demand for energy needed to cool a building:
    the number of degrees a temperature is above the base temperature
    (24°C by default).

    Parameters
    ----------
    temperature : number
        Observed temperature, in °C.
    base_temperature : number, optional
        Temperature above which cooling is assumed to be needed, in °C.
        Defaults to 24.

    Returns
    -------
    number
        ``temperature - base_temperature`` when that difference is
        positive, 0 otherwise. A NaN temperature yields NaN.
    """
    # The difference is deliberately the first argument: max() then keeps a
    # NaN difference as NaN instead of silently replacing it with 0.
    return max(temperature - base_temperature, 0)

from datetime import datetime, timedelta

def is_bridge(date):
    """
    Tell whether a datetime is a holiday bridge.

    A bridge is a Friday whose preceding Thursday is a day off, or a
    Monday whose following Tuesday is a day off. For those two weekdays
    the answer of `is_day_off` for the neighbouring day is returned
    as is; every other weekday gives False.
    """
    # weekday -> offset (in days) to the neighbouring day that must be off:
    # Friday (4) looks back at Thursday, Monday (0) looks ahead to Tuesday.
    neighbour_offset = {4: -1, 0: 1}.get(date.weekday())
    if neighbour_offset is None:
        return False
    return is_day_off(date + timedelta(days=neighbour_offset))

def get_data_with_features(
    consumption_csv="./data/eco2mix_regional_cons_def.csv",
    weather_csv="./data/meteo-paris.csv"
):
    """
    Load the consumption and weather data and add the engineered features.

    The frame returned by `get_data` is extended, in this order, with:
    `is_day_off`, `is_bridge`, `conso_24_lag`, `temp_24_lag`,
    `conso_7_days_lag`, `heating_degrees`, `cooling_degrees`,
    `is_weekend`, `day_of_week`, `temp_rolling_7_days` and `month`.

    Rows with a missing value in any column are dropped from the result;
    this includes the leading rows for which a lag or the rolling window
    is not available yet.
    """
    # The lag and rolling features count rows, assuming one row per half hour.
    rows_per_day = 48
    rows_per_week = 7 * rows_per_day

    df = get_data(consumption_csv, weather_csv)

    # Calendar flags computed from the timestamp.
    df['is_day_off'] = df['Date'].apply(compute_day_off)
    df['is_bridge'] = df['Date'].apply(is_bridge)

    # Lagged copies of the raw series: (new column, source column, rows).
    lagged_features = (
        ('conso_24_lag', 'Conso', rows_per_day),
        ('temp_24_lag', 'Temp', rows_per_day),
        ('conso_7_days_lag', 'Conso', rows_per_week),
    )
    for feature, source, n_rows in lagged_features:
        df[feature] = df[source].shift(n_rows)

    # Temperature-driven demand indicators.
    for feature, transform in (
        ('heating_degrees', heating_degrees),
        ('cooling_degrees', cooling_degrees),
    ):
        df[feature] = df['Temp'].apply(transform)

    df['is_weekend'] = df['Date'].apply(is_weekend)
    df['day_of_week'] = df['Date'].dt.weekday
    df['temp_rolling_7_days'] = df['Temp'].rolling(window=rows_per_week).mean()
    df['month'] = df['Date'].dt.month

    return df.dropna()

In [None]:
df = get_data_with_features()

In [None]:
df.tail(10)

![xkcd](https://imgs.xkcd.com/comics/machine_learning.png)