In [54]:
#!pip install featuretools

In [55]:
import pandas as pd
import featuretools as ft

In [40]:
df_seoul = pd.read_csv('../data/bike_clean.csv')
df_seoul.head()

Unnamed: 0.1,Unnamed: 0,datetime,date,hour,season,holiday,open,rent_count,temperature,humidity,wind_speed,visibility,dewpoint_temp,solar_radiation,rainfall,snowfall
0,1,2017-12-01 00:00:00,2017-12-01,0,Winter,No Holiday,Yes,254,-5.2,37,2.2,20000,-17.6,0.0,0.0,0
1,2,2017-12-01 01:00:00,2017-12-01,1,Winter,No Holiday,Yes,204,-5.5,38,0.8,20000,-17.6,0.0,0.0,0
2,3,2017-12-01 02:00:00,2017-12-01,2,Winter,No Holiday,Yes,173,-6.0,39,1.0,20000,-17.7,0.0,0.0,0
3,4,2017-12-01 03:00:00,2017-12-01,3,Winter,No Holiday,Yes,107,-6.2,40,0.9,20000,-17.6,0.0,0.0,0
4,5,2017-12-01 04:00:00,2017-12-01,4,Winter,No Holiday,Yes,78,-6.0,36,2.3,20000,-18.6,0.0,0.0,0


In [41]:
df_seoul.dtypes

Unnamed: 0           int64
datetime            object
date                object
hour                 int64
season              object
holiday             object
open                object
rent_count           int64
temperature        float64
humidity             int64
wind_speed         float64
visibility           int64
dewpoint_temp      float64
solar_radiation    float64
rainfall           float64
snowfall             int64
dtype: object

### Datetime Feature Engineering

In [47]:
df_seoul['datetime'] =  pd.to_datetime(df_seoul['datetime'])
df_seoul['month'] = df_seoul.datetime.dt.month
df_seoul['week'] = df_seoul.datetime.dt.isocalendar().week # week number
df_seoul['day'] = df_seoul.datetime.dt.day # 1 to 31
df_seoul['dayofyear'] = df_seoul.datetime.dt.dayofyear # 1 to 365
df_seoul['dayofweek'] = df_seoul.datetime.dt.dayofweek # mon, tue ... 

In [43]:
df_seoul.month.value_counts(dropna=False)

12    744
1     744
3     744
5     744
7     744
8     744
10    744
4     720
6     720
9     720
11    720
2     672
Name: month, dtype: int64

In [44]:
df_seoul.week.value_counts(dropna=False)

48     192
21     168
50     168
51     168
52     168
1      168
2      168
3      168
4      168
5      168
6      168
7      168
9      168
20     168
10     168
11     168
12     168
13     168
14     168
15     168
16     168
17     168
18     168
19     168
47     168
8      168
22     168
49     168
23     168
24     168
25     168
26     168
27     168
28     168
29     168
30     168
31     168
32     168
33     168
34     168
35     168
36     168
37     168
38     168
39     168
40     168
41     168
42     168
43     168
44     168
45     168
46     168
NaN      0
Name: week, dtype: Int64

In [45]:
df_seoul.day.value_counts(dropna=False)

1     288
2     288
28    288
27    288
26    288
25    288
24    288
23    288
22    288
21    288
20    288
19    288
18    288
17    288
16    288
15    288
14    288
13    288
12    288
11    288
10    288
9     288
8     288
7     288
6     288
5     288
4     288
3     288
29    264
30    264
31    168
Name: day, dtype: int64

In [46]:
df_seoul.dayofweek.value_counts(dropna=False)
# should we change this to string?

4    1272
5    1248
6    1248
0    1248
1    1248
2    1248
3    1248
Name: dayofweek, dtype: int64

### Automated Feature Engineering

In [82]:
df_x = df_seoul.drop(['rent_count','datetime','date','month','week','day','dayofyear','dayofweek'], axis=1)

In [83]:
es = ft.EntitySet(id = 'bikes')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'seoul_bike', dataframe = df_x, index = 'Unnamed: 0')

Entityset: bikes
  Entities:
    seoul_bike [Rows: 8760, Columns: 13]
  Relationships:
    No relationships

In [84]:
es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="hour_of_day",
                    index="hour")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Season",
                    index="season")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Holiday",
                    index="holiday")

print(es)

Entityset: bikes
  Entities:
    seoul_bike [Rows: 8760, Columns: 13]
    hour_of_day [Rows: 24, Columns: 1]
    Season [Rows: 4, Columns: 1]
    Holiday [Rows: 2, Columns: 1]
  Relationships:
    seoul_bike.hour -> hour_of_day.hour
    seoul_bike.season -> Season.season
    seoul_bike.holiday -> Holiday.holiday


In [85]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'seoul_bike', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 177 features
EntitySet scattered to 3 workers in 9 seconds                                                                          
Elapsed: 00:02 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [86]:
feature_matrix.columns

Index(['hour', 'season', 'holiday', 'open', 'temperature', 'humidity',
       'wind_speed', 'visibility', 'dewpoint_temp', 'solar_radiation',
       ...
       'Holiday.STD(seoul_bike.visibility)',
       'Holiday.STD(seoul_bike.wind_speed)',
       'Holiday.SUM(seoul_bike.dewpoint_temp)',
       'Holiday.SUM(seoul_bike.humidity)', 'Holiday.SUM(seoul_bike.rainfall)',
       'Holiday.SUM(seoul_bike.snowfall)',
       'Holiday.SUM(seoul_bike.solar_radiation)',
       'Holiday.SUM(seoul_bike.temperature)',
       'Holiday.SUM(seoul_bike.visibility)',
       'Holiday.SUM(seoul_bike.wind_speed)'],
      dtype='object', length=177)

In [87]:
feature_matrix.head()

Unnamed: 0_level_0,hour,season,holiday,open,temperature,humidity,wind_speed,visibility,dewpoint_temp,solar_radiation,...,Holiday.STD(seoul_bike.visibility),Holiday.STD(seoul_bike.wind_speed),Holiday.SUM(seoul_bike.dewpoint_temp),Holiday.SUM(seoul_bike.humidity),Holiday.SUM(seoul_bike.rainfall),Holiday.SUM(seoul_bike.snowfall),Holiday.SUM(seoul_bike.solar_radiation),Holiday.SUM(seoul_bike.temperature),Holiday.SUM(seoul_bike.visibility),Holiday.SUM(seoul_bike.wind_speed)
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Winter,No Holiday,Yes,-5.2,37,2.2,20000,-17.6,0.0,...,6064.993949,1.029499,35580.4,486850,1268.8,6356,4747.92,108556.1,119292280,14319.8
2,1,Winter,No Holiday,Yes,-5.5,38,0.8,20000,-17.6,0.0,...,6064.993949,1.029499,35580.4,486850,1268.8,6356,4747.92,108556.1,119292280,14319.8
3,2,Winter,No Holiday,Yes,-6.0,39,1.0,20000,-17.7,0.0,...,6064.993949,1.029499,35580.4,486850,1268.8,6356,4747.92,108556.1,119292280,14319.8
4,3,Winter,No Holiday,Yes,-6.2,40,0.9,20000,-17.6,0.0,...,6064.993949,1.029499,35580.4,486850,1268.8,6356,4747.92,108556.1,119292280,14319.8
5,4,Winter,No Holiday,Yes,-6.0,36,2.3,20000,-18.6,0.0,...,6064.993949,1.029499,35580.4,486850,1268.8,6356,4747.92,108556.1,119292280,14319.8
