## Basic Feature Engineering

In [1]:
import pandas as pd
data = pd.read_csv('temperatures.csv')
data

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8
...,...,...
3645,1990-12-27,14.0
3646,1990-12-28,13.6
3647,1990-12-29,13.5
3648,1990-12-30,15.7


In [2]:
import pandas as pd
data = pd.read_csv('temperatures.csv',header = 0,parse_dates=True,index_col=0)
data.squeeze("columns")
data 

Unnamed: 0_level_0,Temp
Date,Unnamed: 1_level_1
1981-01-01,20.7
1981-01-02,17.9
1981-01-03,18.8
1981-01-04,14.6
1981-01-05,15.8
...,...
1990-12-27,14.0
1990-12-28,13.6
1990-12-29,13.5
1990-12-30,15.7


In [3]:
df = pd.DataFrame()

In [4]:
df['Months'] = [i.month for i in data.index]
df['Days'] = [i.day for i in data.index]
df['Temperature'] = [i for i in data.loc[:,'Temp']]
df

Unnamed: 0,Months,Days,Temperature
0,1,1,20.7
1,1,2,17.9
2,1,3,18.8
3,1,4,14.6
4,1,5,15.8
...,...,...,...
3645,12,27,14.0
3646,12,28,13.6
3647,12,29,13.5
3648,12,30,15.7


## Lag Feautres Method

#### The simplest approach is to predict the value at the next time (t+1) given the value at the current time(t).

In [18]:
import pandas as pd
dataset = pd.read_csv('temperatures.csv',parse_dates = True,header = 0,index_col = 0)
dataset.squeeze("columns")
dataset

Unnamed: 0_level_0,Temp
Date,Unnamed: 1_level_1
1981-01-01,20.7
1981-01-02,17.9
1981-01-03,18.8
1981-01-04,14.6
1981-01-05,15.8
...,...
1990-12-27,14.0
1990-12-28,13.6
1990-12-29,13.5
1990-12-30,15.7


In [19]:
dataframe = pd.DataFrame(dataset.values)
dataframe = pd.concat([dataframe.shift(1),dataframe],axis=1)
dataframe.columns = ['t-1','t']
dataframe

Unnamed: 0,t-1,t
0,,20.7
1,20.7,17.9
2,17.9,18.8
3,18.8,14.6
4,14.6,15.8
...,...,...
3645,14.6,14.0
3646,14.0,13.6
3647,13.6,13.5
3648,13.5,15.7


#### Trying with a window width of 3

In [20]:
df = pd.DataFrame(dataset.values)
df = pd.concat([df.shift(3),df.shift(2),df.shift(1),df],axis=1)
df.columns = ['t-3','t-2','t-1','t']
df

Unnamed: 0,t-3,t-2,t-1,t
0,,,,20.7
1,,,20.7,17.9
2,,20.7,17.9,18.8
3,20.7,17.9,18.8,14.6
4,17.9,18.8,14.6,15.8
...,...,...,...,...
3645,10.0,12.9,14.6,14.0
3646,12.9,14.6,14.0,13.6
3647,14.6,14.0,13.6,13.5
3648,14.0,13.6,13.5,15.7


## Rolling Window Method

##### In this method basically instead of taking the lag values we take the previous time steps statistics such as mean which actually is called as rolling mean.

In [21]:
dataset = pd.read_csv('temperatures.csv',parse_dates = True,index_col = 0,header=0)
dataset.squeeze("columns")
df = pd.DataFrame(dataset.values)
shifted = df.shift(1)
windows = shifted.rolling(window=2)
means = windows.mean()
dataframe = pd.concat([means,df],axis=1)
dataframe.columns = ['mean(t-1,t)','t+1']
dataframe

Unnamed: 0,"mean(t-1,t)",t+1
0,,20.7
1,,17.9
2,19.30,18.8
3,18.35,14.6
4,16.70,15.8
...,...,...
3645,13.75,14.0
3646,14.30,13.6
3647,13.80,13.5
3648,13.55,15.7


##### More window statistics that can be performed are: min, max, mean
#### If the window size is n then the shifting should be done by (n-1)

In [22]:
dataset = pd.read_csv('temperatures.csv',header=0,index_col=0,parse_dates=True)
dataset.squeeze("columns")
dataset = pd.DataFrame(dataset.values)
shifted = dataset.shift(2)
windows = shifted.rolling(window=3)
dataframe = pd.concat([windows.min(),windows.mean(),windows.max(),dataset],axis=1)
dataframe.columns = ['min','mean','max','t+1']
dataframe

Unnamed: 0,min,mean,max,t+1
0,,,,20.7
1,,,,17.9
2,,,,18.8
3,,,,14.6
4,17.9,19.133333,20.7,15.8
...,...,...,...,...
3645,10.0,12.266667,13.9,14.0
3646,10.0,12.500000,14.6,13.6
3647,12.9,13.833333,14.6,13.5
3648,13.6,14.066667,14.6,15.7


## Expanding Window Statistics

#### In this basically we will be keeping track of all the previous values

In [23]:
dataset = pd.read_csv('temperatures.csv',header=0,index_col=0,parse_dates=True)
dataset.squeeze("columns")
dataframe = pd.DataFrame(dataset.values)
windows = dataframe.expanding() # notice how we didn't shift the dataframe before expanding unlike rolling window
dataframe = pd.concat([windows.min(),windows.mean(),windows.max(),dataframe.shift(-1)],axis=1)
dataframe.columns = ['min','mean','max','t+1']
dataframe

Unnamed: 0,min,mean,max,t+1
0,20.7,20.700000,20.7,17.9
1,17.9,19.300000,20.7,18.8
2,17.9,19.133333,20.7,14.6
3,14.6,18.000000,20.7,15.8
4,14.6,17.560000,20.7,15.8
...,...,...,...,...
3645,0.0,11.174712,26.3,13.6
3646,0.0,11.175377,26.3,13.5
3647,0.0,11.176014,26.3,15.7
3648,0.0,11.177254,26.3,13.0
