In [1]:
#!pip install featuretools

In [23]:
import pandas as pd
import featuretools as ft

## For ML Model Data:

In [24]:
df_seoul = pd.read_csv('../data/bike_clean_byday.csv')

In [25]:
df_seoul.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       365 non-null    int64  
 1   date             365 non-null    object 
 2   season           365 non-null    object 
 3   holiday          365 non-null    object 
 4   snowfall         365 non-null    int64  
 5   rainfall         365 non-null    float64
 6   rent_count       365 non-null    int64  
 7   temperature      365 non-null    float64
 8   humidity         365 non-null    float64
 9   wind_speed       365 non-null    float64
 10  visibility       365 non-null    int64  
 11  dewpoint_temp    365 non-null    float64
 12  solar_radiation  365 non-null    float64
dtypes: float64(6), int64(4), object(3)
memory usage: 37.2+ KB


### Datetime Feature Engineering

In [26]:
df_seoul['datetime'] = pd.to_datetime(df_seoul['date'])
df_seoul['month'] = df_seoul.datetime.dt.month
df_seoul['day_of_week'] = df_seoul.datetime.dt.dayofweek
df_seoul['day'] = df_seoul.datetime.dt.day
df_seoul['year'] = df_seoul.datetime.dt.year

In [27]:
df_seoul.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Unnamed: 0       365 non-null    int64         
 1   date             365 non-null    object        
 2   season           365 non-null    object        
 3   holiday          365 non-null    object        
 4   snowfall         365 non-null    int64         
 5   rainfall         365 non-null    float64       
 6   rent_count       365 non-null    int64         
 7   temperature      365 non-null    float64       
 8   humidity         365 non-null    float64       
 9   wind_speed       365 non-null    float64       
 10  visibility       365 non-null    int64         
 11  dewpoint_temp    365 non-null    float64       
 12  solar_radiation  365 non-null    float64       
 13  datetime         365 non-null    datetime64[ns]
 14  month            365 non-null    int64    

### Automated Feature Engineering for ML Model Data

In [28]:
df_seoul = df_seoul.drop(['rent_count', 'year', 'datetime', 'date'], axis =1)
df_seoul.columns

Index(['Unnamed: 0', 'season', 'holiday', 'snowfall', 'rainfall',
       'temperature', 'humidity', 'wind_speed', 'visibility', 'dewpoint_temp',
       'solar_radiation', 'month', 'day_of_week', 'day'],
      dtype='object')

In [5]:
es = ft.EntitySet(id = 'bikes')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'seoul_bike', dataframe = df_seoul, index = 'Unnamed: 0')

Entityset: bikes
  Entities:
    seoul_bike [Rows: 353, Columns: 14]
  Relationships:
    No relationships

In [7]:
es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Month",
                    index="month")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Day",
                    index="day")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Day_of_week",
                    index="dayofweek") # dayofweek for ml model data # day_of_week for demo data

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Season",
                    index="season")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Holiday",
                    index="holiday")

print(es)

Entityset: bikes
  Entities:
    seoul_bike [Rows: 353, Columns: 14]
    Month [Rows: 12, Columns: 1]
    Day [Rows: 31, Columns: 1]
    Day_of_week [Rows: 7, Columns: 1]
    Season [Rows: 4, Columns: 1]
    Holiday [Rows: 2, Columns: 1]
  Relationships:
    seoul_bike.month -> Month.month
    seoul_bike.day -> Day.day
    seoul_bike.dayofweek -> Day_of_week.dayofweek
    seoul_bike.season -> Season.season
    seoul_bike.holiday -> Holiday.holiday


In [8]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'seoul_bike', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 298 features
EntitySet scattered to 3 workers in 11 seconds                                                                         
Elapsed: 00:04 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [12]:
feature_matrix.columns

Index(['month', 'day', 'dayofweek', 'season', 'temperature', 'humidity',
       'holiday', 'wind_speed', 'visibility', 'dewpoint_temp',
       ...
       'Holiday.STD(seoul_bike.visibility)',
       'Holiday.STD(seoul_bike.wind_speed)',
       'Holiday.SUM(seoul_bike.dewpoint_temp)',
       'Holiday.SUM(seoul_bike.humidity)', 'Holiday.SUM(seoul_bike.rainfall)',
       'Holiday.SUM(seoul_bike.snowfall)',
       'Holiday.SUM(seoul_bike.solar_radiation)',
       'Holiday.SUM(seoul_bike.temperature)',
       'Holiday.SUM(seoul_bike.visibility)',
       'Holiday.SUM(seoul_bike.wind_speed)'],
      dtype='object', length=298)

In [11]:
feature_matrix.head()

Unnamed: 0_level_0,season,holiday,snowfall,rainfall,temperature,humidity,wind_speed,visibility,dewpoint_temp,solar_radiation,...,Holiday.STD(seoul_bike.visibility),Holiday.STD(seoul_bike.wind_speed),Holiday.SUM(seoul_bike.dewpoint_temp),Holiday.SUM(seoul_bike.humidity),Holiday.SUM(seoul_bike.rainfall),Holiday.SUM(seoul_bike.snowfall),Holiday.SUM(seoul_bike.solar_radiation),Holiday.SUM(seoul_bike.temperature),Holiday.SUM(seoul_bike.visibility),Holiday.SUM(seoul_bike.wind_speed)
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Autumn,No Holiday,787,10.9,2.6,84.5,1.7,10460,0.0,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
1,Autumn,No Holiday,414,0.0,4.1,81.5,0.95,4295,0.75,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
2,Autumn,No Holiday,29,0.0,5.8,70.5,1.1,4675,1.0,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
3,Autumn,No Holiday,0,0.0,6.5,73.0,1.2,3725,1.35,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
4,Autumn,No Holiday,0,0.0,4.6,26.0,1.6,14640,-12.95,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85


In [26]:
df_merge = pd.read_csv('../data/bike_clean_byday.csv')[['rent_count', 'year']]

In [27]:
pd.concat([feature_matrix.reset_index(), df_merge], axis =1).to_csv("../data/bike_clean_byday_engineered.csv")

## For demo:

In [19]:
df_seoul_ori = pd.read_csv('../data/7days_demo.csv')

In [20]:
df_seoul_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  7 non-null      object 
 1   season                7 non-null      object 
 2   holiday               7 non-null      object 
 3   snowfall              7 non-null      int64  
 4   rainfall              7 non-null      float64
 5   rent_count            7 non-null      int64  
 6   temperature           7 non-null      float64
 7   humidity              7 non-null      float64
 8   wind_speed            7 non-null      float64
 9   visibility            7 non-null      int64  
 10  dewpoint_temp         7 non-null      float64
 11  solar_radiation       7 non-null      int64  
 12  rep_count             7 non-null      int64  
 13  predicted_rep_count   7 non-null      float64
 14  predicted_rent_count  7 non-null      float64
dtypes: float64(7), int64(5), ob

### Datetime Feature Engineering

In [21]:
df_seoul_ori['datetime'] = pd.to_datetime(df_seoul_ori['date'])
df_seoul_ori['month'] = df_seoul_ori.datetime.dt.month
df_seoul_ori['day_of_week'] = df_seoul_ori.datetime.dt.dayofweek # mon, tue ... 
df_seoul_ori['day'] = df_seoul_ori.datetime.dt.day
df_seoul_ori['year'] = df_seoul_ori.datetime.dt.year

In [22]:
df_seoul_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   date                  7 non-null      object        
 1   season                7 non-null      object        
 2   holiday               7 non-null      object        
 3   snowfall              7 non-null      int64         
 4   rainfall              7 non-null      float64       
 5   rent_count            7 non-null      int64         
 6   temperature           7 non-null      float64       
 7   humidity              7 non-null      float64       
 8   wind_speed            7 non-null      float64       
 9   visibility            7 non-null      int64         
 10  dewpoint_temp         7 non-null      float64       
 11  solar_radiation       7 non-null      int64         
 12  rep_count             7 non-null      int64         
 13  predicted_rep_count   7 

### Automated Feature Engineering For Demo Data

In [4]:
df_seoul = df_seoul_ori.drop(['Unnamed: 0.1','date','datetime','rep_count', 'predicted_rep_count', 'predicted_rent_count', 'rent_count', 'year'], axis=1)
df_seoul.columns

Index(['Unnamed: 0', 'season', 'holiday', 'snowfall', 'rainfall',
       'temperature', 'humidity', 'wind_speed', 'visibility', 'dewpoint_temp',
       'solar_radiation', 'month', 'day_of_week', 'day'],
      dtype='object')

In [5]:
es = ft.EntitySet(id = 'bikes')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'seoul_bike', dataframe = df_seoul, index = 'Unnamed: 0')

Entityset: bikes
  Entities:
    seoul_bike [Rows: 353, Columns: 14]
  Relationships:
    No relationships

In [7]:
es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Month",
                    index="month")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Day",
                    index="day")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Day_of_week",
                    index="dayofweek") # dayofweek for ml model data # day_of_week for demo data

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Season",
                    index="season")

es.normalize_entity(base_entity_id="seoul_bike",
                    new_entity_id="Holiday",
                    index="holiday")

print(es)

Entityset: bikes
  Entities:
    seoul_bike [Rows: 353, Columns: 14]
    Month [Rows: 12, Columns: 1]
    Day [Rows: 31, Columns: 1]
    Day_of_week [Rows: 7, Columns: 1]
    Season [Rows: 4, Columns: 1]
    Holiday [Rows: 2, Columns: 1]
  Relationships:
    seoul_bike.month -> Month.month
    seoul_bike.day -> Day.day
    seoul_bike.dayofweek -> Day_of_week.dayofweek
    seoul_bike.season -> Season.season
    seoul_bike.holiday -> Holiday.holiday


In [8]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'seoul_bike', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 298 features
EntitySet scattered to 3 workers in 11 seconds                                                                         
Elapsed: 00:04 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [12]:
feature_matrix.columns

Index(['month', 'day', 'dayofweek', 'season', 'temperature', 'humidity',
       'holiday', 'wind_speed', 'visibility', 'dewpoint_temp',
       ...
       'Holiday.STD(seoul_bike.visibility)',
       'Holiday.STD(seoul_bike.wind_speed)',
       'Holiday.SUM(seoul_bike.dewpoint_temp)',
       'Holiday.SUM(seoul_bike.humidity)', 'Holiday.SUM(seoul_bike.rainfall)',
       'Holiday.SUM(seoul_bike.snowfall)',
       'Holiday.SUM(seoul_bike.solar_radiation)',
       'Holiday.SUM(seoul_bike.temperature)',
       'Holiday.SUM(seoul_bike.visibility)',
       'Holiday.SUM(seoul_bike.wind_speed)'],
      dtype='object', length=298)

In [11]:
feature_matrix.head()

Unnamed: 0_level_0,season,holiday,snowfall,rainfall,temperature,humidity,wind_speed,visibility,dewpoint_temp,solar_radiation,...,Holiday.STD(seoul_bike.visibility),Holiday.STD(seoul_bike.wind_speed),Holiday.SUM(seoul_bike.dewpoint_temp),Holiday.SUM(seoul_bike.humidity),Holiday.SUM(seoul_bike.rainfall),Holiday.SUM(seoul_bike.snowfall),Holiday.SUM(seoul_bike.solar_radiation),Holiday.SUM(seoul_bike.temperature),Holiday.SUM(seoul_bike.visibility),Holiday.SUM(seoul_bike.wind_speed)
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Autumn,No Holiday,787,10.9,2.6,84.5,1.7,10460,0.0,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
1,Autumn,No Holiday,414,0.0,4.1,81.5,0.95,4295,0.75,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
2,Autumn,No Holiday,29,0.0,5.8,70.5,1.1,4675,1.0,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
3,Autumn,No Holiday,0,0.0,6.5,73.0,1.2,3725,1.35,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85
4,Autumn,No Holiday,0,0.0,4.6,26.0,1.6,14640,-12.95,0,...,5282.178708,0.289704,-28.95,414.0,10.9,1230,0,28.4,67335,8.85


In [12]:
df_merge = df_seoul_ori[['rent_count', 'year','date','datetime','rep_count', 'predicted_rep_count', 'predicted_rent_count', 'rent_count']]

In [27]:
pd.concat([feature_matrix.reset_index(), df_merge], axis =1).to_csv("../data/7days_demo_engineered.csv")