# Class 12

## Today's plan:
* Continue discussion of GradientBoosting
* Look at applying it to a real (our restaurants.csv) data set
* Basics of building a machine learning pipeline

# Review of GradientBoosting process

In [78]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [30]:
df = pd.read_csv('../data/housing.csv')
tree = DecisionTreeRegressor(max_depth=4)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [67]:
# Set up X and y
X = df.drop('PRICE', axis=1 )
y = df['PRICE']

In [68]:
# Naive guess
guess = y.mean()
guess

22.532806324110698

In [69]:
# Error column
gradient = y - guess
gradient

0       1.467194
1      -0.932806
2      12.167194
3      10.867194
4      13.667194
         ...    
501    -0.132806
502    -1.932806
503     1.367194
504    -0.532806
505   -10.632806
Name: PRICE, Length: 506, dtype: float64

In [70]:
tree.fit(X,gradient)

DecisionTreeRegressor(max_depth=4)

In [71]:
tree.predict(X)

array([  4.8944664 ,  -0.90306273,  10.21597416,  10.21597416,
        10.21597416,  -0.90306273,  -0.90306273,  -2.51197299,
        -2.51197299,  -2.51197299,  -2.51197299,  -0.90306273,
        -2.51197299,  -0.90306273,  -0.90306273,  -0.90306273,
        -0.90306273,  -6.29384529,  -0.90306273,  -0.90306273,
        -6.29384529,  -0.90306273,  -6.29384529,  -6.29384529,
        -6.29384529,  -6.29384529,  -6.29384529,  -6.29384529,
        -0.90306273,   4.8944664 ,  -6.29384529,  -0.90306273,
        -6.29384529,  -6.29384529,  -6.29384529,  -0.90306273,
        -0.90306273,  -0.90306273,  -0.90306273,   4.8944664 ,
        10.21597416,   4.8944664 ,  -0.90306273,  -0.90306273,
        -0.90306273,  -0.90306273,  -0.90306273,  -2.51197299,
        -2.51197299,  -2.51197299,  -0.90306273,  -0.90306273,
        -0.90306273,  -0.90306273,  -2.51197299,  10.21597416,
        -0.90306273,   4.8944664 ,  -0.90306273,  -0.90306273,
        -0.90306273,  -2.51197299,  -0.90306273,   4.89

In [63]:
y + tree.predict(X)

0       0.977747
1      -0.842500
2      11.145596
3       9.845596
4      12.645596
         ...    
501    -0.622253
502    -1.842500
503     0.345596
504    -1.022253
505   -10.542500
Name: PRICE, Length: 506, dtype: float64

In [72]:
learning_rate = 0.1

In [73]:
guess += learning_rate*tree.predict(X)

In [74]:
guess

array([23.02225296, 22.44250005, 23.55440374, 23.55440374, 23.55440374,
       22.44250005, 22.44250005, 22.28160903, 22.28160903, 22.28160903,
       22.28160903, 22.44250005, 22.28160903, 22.44250005, 22.44250005,
       22.44250005, 22.44250005, 21.9034218 , 22.44250005, 22.44250005,
       21.9034218 , 22.44250005, 21.9034218 , 21.9034218 , 21.9034218 ,
       21.9034218 , 21.9034218 , 21.9034218 , 22.44250005, 23.02225296,
       21.9034218 , 22.44250005, 21.9034218 , 21.9034218 , 21.9034218 ,
       22.44250005, 22.44250005, 22.44250005, 22.44250005, 23.02225296,
       23.55440374, 23.02225296, 22.44250005, 22.44250005, 22.44250005,
       22.44250005, 22.44250005, 22.28160903, 22.28160903, 22.28160903,
       22.44250005, 22.44250005, 22.44250005, 22.44250005, 22.28160903,
       23.55440374, 22.44250005, 23.02225296, 22.44250005, 22.44250005,
       22.44250005, 22.28160903, 22.44250005, 23.02225296, 23.55440374,
       22.44250005, 22.44250005, 22.44250005, 22.44250005, 22.44

In [75]:
new_gradient = y - guess

In [76]:
tree.fit(X,new_gradient)

DecisionTreeRegressor(max_depth=4)

In [77]:
tree.predict(X)


array([  4.40501976,  -0.81275646,   9.19437675,   9.19437675,
         9.19437675,  -0.81275646,  -0.81275646,  -2.39769086,
        -2.39769086,  -2.39769086,  -2.39769086,  -0.81275646,
        -2.39769086,  -0.81275646,  -0.81275646,  -0.81275646,
        -0.81275646,  -5.96801651,  -0.81275646,  -0.81275646,
        -5.96801651,  -0.81275646,  -5.96801651,  -5.96801651,
        -5.96801651,  -5.96801651,  -5.96801651,  -5.96801651,
        -0.81275646,   4.40501976,  -5.96801651,  -0.81275646,
        -5.96801651,  -5.96801651,  -5.96801651,  -0.81275646,
        -0.81275646,  -0.81275646,  -0.81275646,   4.40501976,
         9.19437675,   4.40501976,  -0.81275646,  -0.81275646,
        -0.81275646,  -0.81275646,  -0.81275646,  -2.39769086,
        -2.39769086,  -2.39769086,  -0.81275646,  -0.81275646,
        -0.81275646,  -0.81275646,  -2.39769086,   9.19437675,
        -0.81275646,   4.40501976,  -0.81275646,  -0.81275646,
        -0.81275646,  -2.39769086,  -0.81275646,   4.40

# Reviewing the restaurants data set with GBM

In [86]:
# Import key packages
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

In [113]:
#import data set
df = pd.read_csv('../data/restaurants.csv',parse_dates=['visit_date'])

In [88]:
df.head()

Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,2016-01-13,25,2016-01-13,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
1,air_ba937bf13d40fb24,2016-01-14,32,2016-01-14,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
2,air_ba937bf13d40fb24,2016-01-15,29,2016-01-15,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
3,air_ba937bf13d40fb24,2016-01-16,22,2016-01-16,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
4,air_ba937bf13d40fb24,2016-01-18,6,2016-01-18,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,


In [98]:
df.info()
# reserve_visitors has missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252108 entries, 0 to 252107
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                252108 non-null  object        
 1   visit_date        252108 non-null  datetime64[ns]
 2   visitors          252108 non-null  int64         
 3   calendar_date     252108 non-null  object        
 4   day_of_week       252108 non-null  object        
 5   holiday           252108 non-null  int64         
 6   genre             252108 non-null  object        
 7   area              252108 non-null  object        
 8   latitude          252108 non-null  float64       
 9   longitude         252108 non-null  float64       
 10  reserve_visitors  108394 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(5)
memory usage: 21.2+ MB


In [91]:
df.isnull().sum()

id                       0
visit_date               0
visitors                 0
calendar_date            0
day_of_week              0
holiday                  0
genre                    0
area                     0
latitude                 0
longitude                0
reserve_visitors    143714
dtype: int64

In [90]:
df.reserve_visitors.value_counts().sort_index()
# Number of reservations on a particular day 
# So a missing value meant no reservations / don't take reservations

1.0      1975
2.0     13587
3.0      5376
4.0      8042
5.0      3581
6.0      4627
7.0      6713
8.0      5639
9.0      3738
10.0     6703
11.0     2439
12.0     3677
13.0      744
14.0     1318
15.0     2259
16.0      704
17.0     1459
18.0     1047
20.0      722
21.0     2591
22.0      696
23.0     3564
24.0      613
25.0     3476
27.0      736
28.0      706
30.0     1297
31.0      738
32.0     1182
33.0      771
35.0     1358
36.0     2033
37.0      774
40.0      731
41.0      772
42.0      608
43.0     1372
44.0     1246
46.0     1393
47.0      771
48.0      603
51.0      659
54.0      767
58.0      574
59.0      672
60.0     1411
76.0      767
83.0      575
96.0      588
Name: reserve_visitors, dtype: int64

In [114]:
# Adds a new column for each column with missing values
# that adds in true / false
def denote_null_values(df):
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df

In [115]:
# Flags the values
denote_null_values(df)
# Replace the NaN values in the data set
df.fillna(0)

Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,air_ba937bf13d40fb24,2016-01-13,25,2016-01-13,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
1,air_ba937bf13d40fb24,2016-01-14,32,2016-01-14,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
2,air_ba937bf13d40fb24,2016-01-15,29,2016-01-15,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
3,air_ba937bf13d40fb24,2016-01-16,22,2016-01-16,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
4,air_ba937bf13d40fb24,2016-01-18,6,2016-01-18,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
252103,air_a17f0778617c76e2,2017-04-21,49,2017-04-21,Friday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,6.0,False
252104,air_a17f0778617c76e2,2017-04-22,60,2017-04-22,Saturday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,37.0,False
252105,air_a17f0778617c76e2,2017-03-26,69,2017-03-26,Sunday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,35.0,False
252106,air_a17f0778617c76e2,2017-03-20,31,2017-03-20,Monday,1,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3.0,False


Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,air_ba937bf13d40fb24,2016-01-13,25,2016-01-13,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
1,air_ba937bf13d40fb24,2016-01-14,32,2016-01-14,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
2,air_ba937bf13d40fb24,2016-01-15,29,2016-01-15,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
3,air_ba937bf13d40fb24,2016-01-16,22,2016-01-16,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
4,air_ba937bf13d40fb24,2016-01-18,6,2016-01-18,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
252103,air_a17f0778617c76e2,2017-04-21,49,2017-04-21,Friday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,6.0,False
252104,air_a17f0778617c76e2,2017-04-22,60,2017-04-22,Saturday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,37.0,False
252105,air_a17f0778617c76e2,2017-03-26,69,2017-03-26,Sunday,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,35.0,False
252106,air_a17f0778617c76e2,2017-03-20,31,2017-03-20,Monday,1,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3.0,False


### Ways to deal with missing values:

* Add in average (continuous) / mode (categorical
* Add in at specific leveles (e.g. individual customers) using group by
* Model out what it would be.

In [99]:
filled = df[~df['reserve_visitors'].isnull()]
missing = df[df['reserve_visitors'].isnull()]

In [103]:
X = filled[['visitors','holiday','latitude','longitude']]
y = filled['reserve_visitors']

In [105]:
gbm = GradientBoostingRegressor()

In [106]:
gbm.fit(X,y)

GradientBoostingRegressor()

In [108]:
predicted_missing = gbm.predict(missing[['visitors','holiday','latitude','longitude']])
predicted_missing 

array([15.86261708, 16.07787226, 15.86261708, ..., 16.40184107,
       16.25422672, 16.94692842])

### Dealing with Categorical Data

**For example:** in the restaurants data set we have day_of_week / genre / area / holiday



**So what can we do?**

3 options - mix of general purpose, and elements dealing 

* OneHotEncoding
* OrdinalEncoding
* TargetEncoding

VERY useful library: **https://contrib.scikit-learn.org/category_encoders/**


In [111]:
# Bring in the category encoders library
import category_encoders as ce

### Ordinal Encoding

* Assign each value to it's own incremental value
* Mostly use when something has a natural order to it (e.g. low > medium > high; strongly disagree > disagree > agree > strongly agree)

Can also just assign each categorical item it's own number (often a bad idea, but can work well for tree based models).
* Often an easy way to get going
* Tree-based models are forgiving of this, though it's not necessarily good practice

In [116]:
# Inititalise
encoder = ce.OrdinalEncoder()

# and use the fit_transform method
encoder.fit_transform(df)

Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,1,2016-01-13,25,1,1,0,1,1,35.658068,139.751599,,True
1,1,2016-01-14,32,2,2,0,1,1,35.658068,139.751599,,True
2,1,2016-01-15,29,3,3,0,1,1,35.658068,139.751599,,True
3,1,2016-01-16,22,4,4,0,1,1,35.658068,139.751599,,True
4,1,2016-01-18,6,5,5,0,1,1,35.658068,139.751599,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
252103,829,2017-04-21,49,390,3,0,4,10,34.695124,135.197852,6.0,False
252104,829,2017-04-22,60,391,4,0,4,10,34.695124,135.197852,37.0,False
252105,829,2017-03-26,69,444,7,0,4,10,34.695124,135.197852,35.0,False
252106,829,2017-03-20,31,467,5,1,4,10,34.695124,135.197852,3.0,False


In [118]:
encoder.mapping

[{'col': 'id',
  'mapping': air_ba937bf13d40fb24      1
  air_25e9888d30b386df      2
  air_fd6aac1043520e83      3
  air_64d4491ad8cdb1c6      4
  air_ee3a01f0c71a769f      5
                         ... 
  air_cf5ab75a0afb8af9    826
  air_1c0b150f9e696a5f    827
  air_900d755ebd2f7bbd    828
  air_a17f0778617c76e2    829
  NaN                      -2
  Length: 830, dtype: int64,
  'data_type': dtype('O')},
 {'col': 'calendar_date',
  'mapping': 2016-01-13      1
  2016-01-14      2
  2016-01-15      3
  2016-01-16      4
  2016-01-18      5
               ... 
  2016-01-01    475
  2016-01-03    476
  2017-01-02    477
  2016-01-02    478
  NaN            -2
  Length: 479, dtype: int64,
  'data_type': dtype('O')},
 {'col': 'day_of_week',
  'mapping': Wednesday    1
  Thursday     2
  Friday       3
  Saturday     4
  Monday       5
  Tuesday      6
  Sunday       7
  NaN         -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'genre',
  'mapping': Dining bar                 

In [119]:
# Default tends to be to transform items that are text based
# But some numeric items might be categorical, and you might want to encode them.
# so use the cols argument for this
?ce.OrdinalEncoder

### OneHot Encoding

* Create a binary 1 or 0 for each category in a new column
* For examples for day of week would transform into 7 columns

Most of the time this appraoch is the default.
* Gives you a granular view of what the values represent
* Often the best way, but not perfect, and can get odd behaviour with trees
* Tends to be better with relatively smaller categories

Downsides
* Can add a **LOT** of columns to your data set: names / zip codes in US (>5000)
    * Some models with too many columns vs rows it may not converge / produce an outcome
    * More columns mean large increases in fitting times
* In tree model
    * Can make for very uneven splits
    * For example - 100k rows / but 50 rows have a '1's in column; This can introduce a lot of noise. 

In [122]:
# use_cat_names can make things easier to read
ohe = ce.OneHotEncoder(use_cat_names=True)
ohe.fit_transform(df['day_of_week'])[:20]

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Sunday
0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0
5,0,0,0,0,0,1,0
6,1,0,0,0,0,0,0
7,0,1,0,0,0,0,0
8,0,0,1,0,0,0,0
9,0,0,0,1,0,0,0


In [125]:
ohe = ce.OneHotEncoder()
ohe.fit_transform(df['id'])

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,id_1,id_2,id_3,id_4,id_5,id_6,id_7,id_8,id_9,id_10,...,id_820,id_821,id_822,id_823,id_824,id_825,id_826,id_827,id_828,id_829
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
252104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
252105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
252106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [126]:
df.id.value_counts()

air_5c817ef28f236bdf    477
air_d97dabf7aae60da5    476
air_a083834e7ffe187e    476
air_36bcf77d3382d36e    476
air_232dcee6f7c51d37    475
                       ... 
air_1c0b150f9e696a5f     51
air_a17f0778617c76e2     47
air_789103bf53b8096b     41
air_a9a380530c1e121f     40
air_900d755ebd2f7bbd     20
Name: id, Length: 829, dtype: int64

### Target Encoding
* Re-encode variable as the avg of the target variable for that category


In [127]:
# initialise
te = ce.TargetEncoder()
# you need to declare X and y
X = df.drop('visitors',axis=1)
y = df['visitors']
# now can fit

te.fit_transform(X,y)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,id,visit_date,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,22.782609,2016-01-13,18.433460,19.230121,0,18.723532,19.609418,35.658068,139.751599,,True
1,22.782609,2016-01-14,19.229927,18.922702,0,18.723532,19.609418,35.658068,139.751599,,True
2,22.782609,2016-01-15,23.506897,23.072737,0,18.723532,19.609418,35.658068,139.751599,,True
3,22.782609,2016-01-16,26.780142,26.313688,0,18.723532,19.609418,35.658068,139.751599,,True
4,22.782609,2016-01-18,14.486726,17.177009,0,18.723532,19.609418,35.658068,139.751599,,True
...,...,...,...,...,...,...,...,...,...,...,...
252103,44.595745,2017-04-21,25.030612,23.072737,0,22.582953,20.466463,34.695124,135.197852,6.0,False
252104,44.595745,2017-04-22,27.448320,26.313688,0,22.582953,20.466463,34.695124,135.197852,37.0,False
252105,44.595745,2017-03-26,24.098333,23.873362,0,22.582953,20.466463,34.695124,135.197852,35.0,False
252106,44.595745,2017-03-20,24.043400,17.177009,1,22.582953,20.466463,34.695124,135.197852,3.0,False


In [129]:
df.groupby('day_of_week')['visitors'].mean()
# Which match what we see in the column above!

day_of_week
Friday       23.072737
Monday       17.177009
Saturday     26.313688
Sunday       23.873362
Thursday     18.922702
Tuesday      17.672137
Wednesday    19.230121
Name: visitors, dtype: float64

In [130]:
transformed = te.fit_transform(X, y)
# and look up the value for air_900d755ebd2f7bbd -- the restaurant with the lowest attendance value
transformed.id.value_counts()

# 82.200000 which is well above the data sample size

  elif pd.api.types.is_categorical(cols):


15.288732     568
37.754717     477
37.754202     476
31.777311     476
27.325630     476
             ... 
115.470588     51
44.595745      47
55.243902      41
44.875000      40
82.200000      20
Name: id, Length: 828, dtype: int64

In [136]:
te = ce.TargetEncoder(min_samples_leaf=40)
transformed = te.fit_transform(X, y)
transformed.id.value_counts()

# The limit of the leaf size drives the average back towards that of the overall data set
# helps to avoid overfitting sections of the model due to target encoding

  elif pd.api.types.is_categorical(cols):


15.288732     568
37.754717     477
37.754202     476
31.777311     476
27.325630     476
             ... 
115.469010     51
44.574224      47
46.027242      41
32.924381      40
20.973761      20
Name: id, Length: 828, dtype: int64

# Data Pipelines!

* The pipeline module in in scikit-learn let you combine a lot of steps into one
* Saves time and effort

In [141]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(ce.TargetEncoder(),GradientBoostingRegressor()) 
# Transfor data first, then fit the model

In [153]:
df= df.fillna(0)

In [154]:
train = df.groupby('id').apply(lambda x: x.iloc[:-15])
test = df.groupby('id').apply(lambda x: x.iloc[-15:])

In [155]:
train.drop('visit_date', axis = 1,inplace=True)
test.drop('visit_date', axis = 1,inplace=True)

In [156]:
X_train, y_train = train.drop('visitors',axis=1), train['visitors']
X_test, y_test = test.drop('visitors',axis=1), test['visitors']

In [157]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'calendar_date', 'day_of_week',
                                     'genre', 'area'])),
                ('gradientboostingregressor', GradientBoostingRegressor())])

In [158]:
pipe.score(X_test, y_test)

0.39051965255921706