In [15]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from category_encoders import TargetEncoder, OrdinalEncoder

In [1]:
!pip install pdpbox

Collecting pdpbox
  Downloading PDPbox-0.2.1.tar.gz (34.0 MB)
[K     |████████████████████████████████| 34.0 MB 9.0 MB/s eta 0:00:012   |▏                               | 133 kB 1.1 MB/s eta 0:00:32     |█▌                              | 1.6 MB 1.1 MB/s eta 0:00:30     |████▎                           | 4.5 MB 1.1 MB/s eta 0:00:28
Collecting matplotlib==3.1.1
  Downloading matplotlib-3.1.1.tar.gz (37.8 MB)
[K     |████████████████████████████████| 37.8 MB 228 kB/s eta 0:00:011
[?25hCollecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: pdpbox, matplotlib, sklearn
  Building wheel for pdpbox (setup.py) ... [?25ldone
[?25h  Created wheel for pdpbox: filename=PDPbox-0.2.1-py3-none-any.whl size=35758226 sha256=9bcecf044b6d111282741ae471079a16e6129062484fecdc336bbc1056df02b0
  Stored in directory: /Users/umaprasad/Library/Caches/pip/wheels/35/fb/ef/a08dd2a1611435285fa3f9d9104bf554f10c2eb7293b526ccb
  Building wheel for matplotlib (setup.py)

In [6]:
df = pd.read_csv('data/master.csv', parse_dates=['visit_date'])


In [9]:
df.sort_values(by=['id', 'visit_date'], inplace=True)

In [4]:
df.drop('calendar_date', axis=1, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252108 entries, 0 to 252107
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                252108 non-null  object        
 1   visit_date        252108 non-null  datetime64[ns]
 2   visitors          252108 non-null  int64         
 3   day_of_week       252108 non-null  object        
 4   holiday           252108 non-null  int64         
 5   genre             252108 non-null  object        
 6   area              252108 non-null  object        
 7   latitude          252108 non-null  float64       
 8   longitude         252108 non-null  float64       
 9   reserve_visitors  108394 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(4)
memory usage: 19.2+ MB


In [6]:
# define some functions that we can reuse
def create_val_splits(df, val_units=15, return_val=False):
    """Function that will take in a dataset and split it up into training, validation, and test sets"""
    # split into training, validation, and test sets
    df = df.drop('visit_date', axis=1)
    train = df.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
    test  = df.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
    
    if return_val:
        val   = train.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
        train = train.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
        return train, val, test
    else:
        return train, test

In [14]:
df = df.fillna(0)

train, val, test = create_val_splits(df, return_val=True)

In [16]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [32]:
pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

In [33]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [34]:
pipe.score(X_val, y_val)

0.48777443017580935

In [36]:
pipe.predict(X_test)

array([24.01843187, 23.13948663, 29.0476373 , ...,  4.65445439,
        5.32207093,  7.57956199])

In [38]:
pipe.predict(X_test[:1])

array([24.01843187])

In [63]:
df['month'] = df['visit_date'].dt.month
df['yesterday'] = df.groupby('id')['visitors'].apply(lambda x: x.shift())
df['yesterday'] = df['yesterday'].bfill()
df['year'] = df['visit_date'].dt.year
df['quarter'] = df['visit_date'].dt.quarter
df['ten_day'] = df.groupby('id')['visitors'].apply(lambda x: x.rolling(10).mean().shift()).values
df['ten_day'] = df['ten_day'].bfill()

In [58]:
df.head()

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,month,yesterday
166836,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,7,
166837,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,7,35.0
166838,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,7,9.0
166839,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,7,20.0
166840,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,7,25.0


In [66]:
train, val, test = create_val_splits(df, return_val=True)

In [67]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [65]:
pipe.fit(X_train, y_train).score(X_val, y_val)

0.5979044549027293

In [68]:
pipe.score(X_train, y_train)

0.45563867933796853

In [69]:
pipe.get_params()

{'memory': None,
 'steps': [('targetencoder',
   TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'], drop_invariant=False,
                 handle_missing='value', handle_unknown='value',
                 min_samples_leaf=1, return_df=True, smoothing=1.0, verbose=0)),
  ('gradientboostingregressor',
   GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                             init=None, learning_rate=0.1, loss='ls', max_depth=3,
                             max_features=None, max_leaf_nodes=None,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=1, min_samples_split=2,
                             min_weight_fraction_leaf=0.0, n_estimators=100,
                             n_iter_no_change=None, presort='deprecated',
                             random_state=None, subsample=1.0, tol=0.0001,
                             validation_fraction=0.1, verbose=0, warm_start=False)

In [74]:
pipe

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_initialized',
 '_check_params',
 '_clear_state',
 '_compute_partial_dependence_recursion',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_get_tags',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_more_tags',
 '_raw_predict',
 '_raw_predict_init',
 '_required_parameters',
 '_resize_state',
 '_staged_raw_predict',
 '_validate_estimator',
 '_validate_y',
 'alpha',
 'apply',
 'ccp_alpha',
 'criterion',
 'estimators_',
 'feature_importances_',
 'fit

In [80]:
estimators = [100, 200]
learning_rate = [.05, .1]
tree_depth = [3, 4]
cv_scores = []

for estimator in estimators:
    for rate in learning_rate:
        for depth in tree_depth:
            print(f"Fitting model for:  estimators: {estimator}, learning_rate: {rate}, depth: {depth}")
            mod   = GradientBoostingRegressor(n_estimators=estimator, learning_rate=rate, max_depth=depth, max_features=0.7)
            pipe = make_pipeline(ce.TargetEncoder(), mod)
            pipe.fit(X_train, y_train)
            val_score = pipe.score(X_val, y_val)
            cv_scores.append((val_score, estimator, rate, depth))

Fitting model for:  estimators: 100, learning_rate: 0.05, depth: 3
Fitting model for:  estimators: 100, learning_rate: 0.05, depth: 4
Fitting model for:  estimators: 100, learning_rate: 0.1, depth: 3
Fitting model for:  estimators: 100, learning_rate: 0.1, depth: 4
Fitting model for:  estimators: 200, learning_rate: 0.05, depth: 3
Fitting model for:  estimators: 200, learning_rate: 0.05, depth: 4
Fitting model for:  estimators: 200, learning_rate: 0.1, depth: 3
Fitting model for:  estimators: 200, learning_rate: 0.1, depth: 4


In [81]:
cv_scores

[(0.5237070061670954, 100, 0.05, 3),
 (0.5328899528153548, 100, 0.05, 4),
 (0.5339639385362109, 100, 0.1, 3),
 (0.5404108798107004, 100, 0.1, 4),
 (0.533135931545744, 200, 0.05, 3),
 (0.5417218156915765, 200, 0.05, 4),
 (0.5458848341780838, 200, 0.1, 3),
 (0.5542977507636977, 200, 0.1, 4)]

In [82]:
max(cv_scores)

(0.5542977507636977, 200, 0.1, 4)

In [83]:
mod = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=4)

In [84]:
train, test = create_val_splits(df, return_val=False)

In [94]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [86]:
pipe = make_pipeline(ce.TargetEncoder(), mod)

In [87]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=4, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [88]:
pipe.score(X_test, y_test)

0.5165375620266042

In [95]:
naive_guess = y_test.mean()

In [99]:
naive_model = np.sum((y_test - naive_guess)**2)

In [101]:
preds = pipe.predict(X_test)

In [105]:
our_model = np.sum((y_test - preds)**2)

In [106]:
1 - (our_model / naive_model)

0.5165375620266042

In [109]:
feats_dict = {'Column': X_train.columns,
              'Importance': pipe[1].feature_importances_,
             }

feats = pd.DataFrame(feats_dict)

In [111]:
feats.sort_values(by='Importance', ascending=False)

Unnamed: 0,Column,Importance
0,id,0.423027
12,ten_day,0.388948
1,day_of_week,0.106283
9,yesterday,0.035771
6,longitude,0.012038
5,latitude,0.009119
2,holiday,0.007602
8,month,0.005214
3,genre,0.004642
7,reserve_visitors,0.003842


In [119]:
random_id = X_test['id'].sample(frac=1)

In [134]:
X_test_copy = X_test.copy()
X_test_copy['ten_day'] = np.random.permutation(X_test['ten_day'])

TypeError: permutation() takes no keyword arguments

In [133]:
pipe.score(X_test_copy, y_test)

0.2003556335062029

In [129]:
.516 - .275

0.241

In [124]:
X_test_copy['id'] = random_id

In [128]:
X_test_copy

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,month,yesterday,year,quarter,ten_day
0,air_2009041dbf9264de,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0,4,17.0,2017,2,29.4
1,air_d07e57b21109304a,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0,4,35.0,2017,2,29.5
2,air_2cee51fa6fdf6c0d,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,4,29.0,2017,2,31.7
3,air_d98380a4aeb0290b,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0,4,17.0,2017,2,29.1
4,air_91236b89d29567af,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,4,9.0,2017,2,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12430,air_a083834e7ffe187e,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,4,3.0,2017,2,4.5
12431,air_0f2f96335f274801,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,4,6.0,2017,2,4.7
12432,air_7cf5a02c0e01b647,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0,4,2.0,2017,2,4.4
12433,air_2c989829acbd1c6b,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0,4,2.0,2017,2,4.0


In [122]:
X_test_copy

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,month,yesterday,year,quarter,ten_day
0,air_00a91d42b08b08d9,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0,4,17.0,2017,2,29.4
1,air_00a91d42b08b08d9,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0,4,35.0,2017,2,29.5
2,air_00a91d42b08b08d9,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,4,29.0,2017,2,31.7
3,air_00a91d42b08b08d9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0,4,17.0,2017,2,29.1
4,air_00a91d42b08b08d9,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,4,9.0,2017,2,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12430,air_fff68b929994bfbd,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,4,3.0,2017,2,4.5
12431,air_fff68b929994bfbd,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,4,6.0,2017,2,4.7
12432,air_fff68b929994bfbd,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0,4,2.0,2017,2,4.4
12433,air_fff68b929994bfbd,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0,4,2.0,2017,2,4.0


In [7]:
#starting next class work

In [8]:
y = df['visitors']

In [9]:
X = df[['id', 'day_of_week']]

In [16]:
te = TargetEncoder()
X['id'] = te.fit_transform(df['id'], y)

  elif pd.api.types.is_categorical(cols):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['id'] = te.fit_transform(df['id'], y)


In [11]:
X.head()

Unnamed: 0,id,day_of_week
0,air_ba937bf13d40fb24,Wednesday
1,air_ba937bf13d40fb24,Thursday
2,air_ba937bf13d40fb24,Friday
3,air_ba937bf13d40fb24,Saturday
4,air_ba937bf13d40fb24,Monday


In [17]:
te.mapping

{'id': id
  1       22.782609
  2        6.743750
  3       32.622047
  4       15.398082
  5       28.795337
            ...    
  827    115.470588
  828     82.200000
  829     44.595745
 -1       20.973761
 -2       20.973761
 Length: 831, dtype: float64}

In [19]:
mod = GradientBoostingRegressor()
X_copy['pred'] = mod_predict
X_copy['day_of_week'] = 1

NameError: name 'X_copy' is not defined

In [13]:
import numpy as np

In [18]:
te.transform(np.array([1]))

KeyError: 'id'

In [27]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
# this is the file you should've gotten from your repo
from utils import extract_dates, get_val_scores
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split


df = pd.read_csv('/Users/umaprasad/dat-02-22/ClassMaterial/Unit3/data/ks2.csv', encoding='utf-8', parse_dates=['deadline', 'launched'])

df.head()

df['state'].value_counts()

failed        236498
successful    133956
Name: state, dtype: int64

In [21]:
from utils import extract_dates, get_val_scores

In [23]:
??extract_dates

In [24]:
df = extract_dates(df)
df.columns

  df[col_name] = getattr(df[col].dt, part)


Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'launched', 'state', 'country', 'goal', 'deadline_dayofweek',
       'deadline_dayofyear', 'deadline_days_in_month', 'deadline_is_leap_year',
       'deadline_is_month_end', 'deadline_is_month_start',
       'deadline_is_quarter_end', 'deadline_is_quarter_start',
       'deadline_is_year_end', 'deadline_is_year_start', 'deadline_quarter',
       'deadline_week', 'deadline_weekofyear', 'deadline_day', 'deadline_hour',
       'deadline_minute', 'deadline_month', 'deadline_year',
       'launched_dayofweek', 'launched_dayofyear', 'launched_days_in_month',
       'launched_is_leap_year', 'launched_is_month_end',
       'launched_is_month_start', 'launched_is_quarter_end',
       'launched_is_quarter_start', 'launched_is_year_end',
       'launched_is_year_start', 'launched_quarter', 'launched_week',
       'launched_weekofyear', 'launched_day', 'launched_hour',
       'launched_minute', 'launched_month', 'laun

In [28]:
mod1 = xgb.XGBClassifier()

In [29]:
mod1.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [30]:
pipe = make_pipeline(OrdinalEncoder(), mod1)

In [31]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']

In [32]:
?get_val_scores

In [33]:
scores = get_val_scores(pipe, X, y, return_test_score=True, random_state=1985, stratify=True, use_kfold=False)





  elif pd.api.types.is_categorical(cols):




In [35]:
scores

{'validation_score': 0.6991210163143421, 'test_score': 0.6503219014455197}

In [36]:
feats = pd.DataFrame({ 'Importance': pipe.steps[1][1].feature_importances_, 'Column': X.columns})
feats.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance,Column
6,0.281691,goal
3,0.256934,main_category
2,0.227745,category
4,0.111773,currency
5,0.064959,country
0,0.028704,ID
1,0.028193,name


In [37]:
df['duration'] = (df['deadline'] - df['launched']).dt.days

In [38]:
category_avgs = df.groupby('category')[['goal']].mean().reset_index().rename({'goal': 'category_goal_avg'}, axis=1)

In [39]:
df = df.merge(category_avgs, on='category')

In [41]:
df['cat_goal_pct'] = df['goal'] / df['category_goal_avg']
# and our results
df[['goal', 'category_goal_avg', 'cat_goal_pct']].head()

#Go through the same process, but this time, create two columns for the main_category feature: it's average value, and the percent of average for each campaign

#Take 5-7 minutes



Unnamed: 0,goal,category_goal_avg,cat_goal_pct
0,1533.95,5213.996468,0.294199
1,6060.97,5213.996468,1.162442
2,2000.0,5213.996468,0.383583
3,10000.0,5213.996468,1.917915
4,757.52,5213.996468,0.145286


In [42]:
df[['goal', 'category_goal_avg', 'cat_goal_pct']].head()

Unnamed: 0,goal,category_goal_avg,cat_goal_pct
0,1533.95,5213.996468,0.294199
1,6060.97,5213.996468,1.162442
2,2000.0,5213.996468,0.383583
3,10000.0,5213.996468,1.917915
4,757.52,5213.996468,0.145286


In [43]:
main_cats = df.groupby('main_category')[['goal']].mean().reset_index().rename({'goal': 'main_cat_goal_avg'}, axis=1)

In [44]:
df = df.merge(main_cats, on='main_category')

In [45]:
df['main_goal_pct'] = df['goal'] / df['main_cat_goal_avg']

In [46]:
df[['goal', 'main_cat_goal_avg', 'main_goal_pct']].head()

Unnamed: 0,goal,main_cat_goal_avg,main_goal_pct
0,1533.95,22590.745149,0.067902
1,6060.97,22590.745149,0.268294
2,2000.0,22590.745149,0.088532
3,10000.0,22590.745149,0.442659
4,757.52,22590.745149,0.033532


In [47]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']

In [48]:
scores = get_val_scores(pipe, X, y, random_state=1985, stratify=True, use_kfold=False)

  elif pd.api.types.is_categorical(cols):




