# FIFA Pipeline implementation RUN

In this notebook we implement a more rigorous, pipeline-based approach.<br>
As much of the workflow as possible is done with scikit-learn.

In [1]:
# Execute the two supporting notebooks before anything else in this notebook runs.
# NOTE(review): the cells below use names that are never defined here (`df`, `np`,
# `get_train_test_sets`, `build_col_trans`, `print_key_value_sorted`, the feature lists, ...),
# so they are presumably provided by these two notebooks — confirm.
%run 4_pipeline_functions.ipynb
%run 5_pipeline_clean.ipynb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                2000 non-null   int64  
 1   Name                      2000 non-null   object 
 2   Age                       2000 non-null   int64  
 3   Nationality               2000 non-null   object 
 4   Overall                   2000 non-null   int64  
 5   Club                      1969 non-null   object 
 6   Value                     2000 non-null   object 
 7   Wage                      2000 non-null   object 
 8   Preferred Foot            1997 non-null   object 
 9   International Reputation  1997 non-null   float64
 10  Weak Foot                 1997 non-null   float64
 11  Skill Moves               1997 non-null   float64
 12  Work Rate                 1997 non-null   object 
 13  Body Type                 1997 non-null   object 
 14  Position

## Start building the pipeline

Perform cross-validation (CV) on the training set and keep a hold-out test set for the final evaluation.

In [2]:
# 'Value' is the regression target; every remaining column is a candidate predictor.
y = df['Value']
X = df.drop(columns = 'Value')

In [3]:
# Two separate registries that collect the results of every run for the comparison at the end.
best_CV_scores = dict()  # best cross-validation score per run
test_scores = dict()     # hold-out test score per run

#### DummyRegressor (naive)
Fit a naive baseline regressor so we can check whether our models actually perform better.

In [4]:
# Reference statistics of the target, computed over the full data set.
print('Mean:', format(np.mean(df['Value']), 'E'))
print('Median:', format(np.median(df['Value']), 'E'))

# Keep 20% of the rows as a hold-out set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

strategies = ['mean', 'median']

# Baseline: predict one constant (the chosen statistic of the training target) for every row
# and report the root of the mean squared error on the hold-out set.
for strategy in strategies:
    dummy_reg = DummyRegressor(strategy = strategy).fit(X_train, y_train)
    y_pred = dummy_reg.predict(X_test)

    print('Strategy:', strategy, '### Score:', format(np.sqrt(mean_squared_error(y_test, y_pred)), 'E'))

Mean: 2.403422E+06
Median: 6.750000E+05
Strategy: mean ### Score: 4.773063E+06
Strategy: median ### Score: 5.062146E+06


## Main run

In [5]:
# Iteration budgets: every feature-set cell runs its search notebook once per entry.
# NOTE(review): presumably the number of randomized-search iterations — confirm in the search notebooks.
# For a quicker run, drop the largest budget: n_iters = [10 ** 2, 10 ** 3]
n_iters = [10 ** exponent for exponent in (2, 3, 4)]

### Numerical features

In [6]:
set_features = 'num_features'

X_train, X_test, y_train, y_test = get_train_test_sets(X, set_features)

col_trans = build_col_trans(set_features)

steps = [('col_trans', col_trans),
         ('scaler', None),
         ('model', None)]

pipeline = Pipeline(steps)

ttr = TransformedTargetRegressor(regressor = pipeline, transformer = None)

for n_iter in n_iters:
    %run 6_pipeline_num.ipynb    

Type of search: GS
{'regressor__col_trans__imp_num_cols__strategy': 'mean', 'regressor__model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform'), 'regressor__model__n_neighbors': 3, 'regressor__model__weights': 'uniform', 'regressor__scaler': None, 'transformer': None}
Best CV score: 1.017913E+06
Test score: 9.709342E+05
#################################################################################################
Type of search: RS
{'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__scaler': None, 'regressor__model__weights': 'uniform', 'regressor__model__n_neighbors': 3, 'regressor__model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='unifo

Type of search: GS
{'regressor__col_trans__imp_num_cols__strategy': 'mean', 'regressor__model': ElasticNet(alpha=0.01, copy_X=True, fit_intercept=False, l1_ratio=0.95,
           max_iter=1000000, normalize=False, positive=False, precompute=False,
           random_state=1, selection='cyclic', tol=0.0001, warm_start=False), 'regressor__model__alpha': 0.01, 'regressor__model__fit_intercept': False, 'regressor__model__l1_ratio': 0.95, 'regressor__model__normalize': False, 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)}
Best CV score: 1.144477E+06
Test score: 9.676150E+05
#################################################################################################
Type of search: RS
{'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             wi

Type of search: RS
{'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__model__normalize': False, 'regressor__model__l1_ratio': 0.95, 'regressor__model__fit_intercept': False, 'regressor__model__alpha': 0.01, 'regressor__model': ElasticNet(alpha=0.01, copy_X=True, fit_intercept=False, l1_ratio=0.95,
           max_iter=1000000, normalize=False, positive=False, precompute=False,
           random_state=1, selection='cyclic', tol=0.0001, warm_start=False), 'regressor__col_trans__imp_num_cols__strategy': 'mean'}
Best CV score: 1.144477E+06
Test score: 9.676150E+05
#################################################################################################
Type of search: RS
{'transformer': None, 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_cent

### Numerical + ordinal features

In [7]:
set_features = 'num_ord_features'

X_train, X_test, y_train, y_test = get_train_test_sets(X, set_features)

col_trans = build_col_trans(set_features)

steps = [('col_trans', col_trans),
         ('scaler', None),         
        ('model', None)]

pipeline = Pipeline(steps)

ttr = TransformedTargetRegressor(regressor = pipeline, transformer = None)

for n_iter in n_iters:
    %run 7_pipeline_num_ord.ipynb

Type of search: GS
{'regressor__col_trans__imp_num_cols__strategy': 'median', 'regressor__col_trans__imp_ord_cols__strategy': 'median', 'regressor__model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='distance'), 'regressor__model__n_neighbors': 2, 'regressor__model__weights': 'distance', 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'transformer': None}
Best CV score: 9.897204E+05
Test score: 1.117380E+06
#################################################################################################
Type of search: RS
{'transformer': None, 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__model__weights': 'distance', 'regressor__model__n_neighbors': 2, 'regressor__model': KNeighborsRegre

Type of search: RS
{'transformer': None, 'regressor__scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__model__subsample': 0.9, 'regressor__model__n_estimators': 80, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 3, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': 10, 'regressor__model__loss': 'lad', 'regressor__model__learning_rate': 0.1, 'regressor__model__lambda': 1, 'regressor__model__colsample_bytree': 1, 'regressor__model__alpha': 10, 'regressor__model': XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', lambda=1, learning_rate=0.1, loss='lad',
             max_delta_step=0, max_depth=10, max_features='auto',
             min_child_weight=1, min_samples_leaf=3, min_samples_split=2,
             missing=nan, n_estimators=80, n_jobs=1, nthread=None,
             objective='r

Type of search: RS
{'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__model__n_estimators': 80, 'regressor__model__loss': 'exponential', 'regressor__model__learning_rate': 0.1, 'regressor__model': AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='exponential',
                  n_estimators=80, random_state=1), 'regressor__col_trans__imp_ord_cols__strategy': 'most_frequent', 'regressor__col_trans__imp_num_cols__strategy': 'most_frequent'}
Best CV score: 1.083923E+06
Test score: 9.950124E+05
#################################################################################################
Type of search: RS
{'transformer': PowerTransformer(copy=True, method='yeo-johnson', standardize=True), 'regressor__scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), wi

Type of search: RS
{'transformer': PowerTransformer(copy=True, method='yeo-johnson', standardize=True), 'regressor__scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__model__n_estimators': 50, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 1, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': 100, 'regressor__model__bootstrap': True, 'regressor__model': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=100, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False), 'regressor__col_trans__imp_ord_cols__strategy': 'mean', 'regressor__

### Numerical + ordinal + categorical features

In [8]:
set_features = 'num_ord_cat_features'

for skipPosition in [False, True]:
    if skipPosition:
        try:
            cat_features.remove('Position')
        except ValueError:
            pass

    X_train, X_test, y_train, y_test = get_train_test_sets(X, set_features)

    col_trans = ColumnTransformer([
                ('imp_num_cols', SimpleImputer(), num_features),
                ('imp_ord_cols', SimpleImputer(), ord_features),
                ('imp_cat_cols', SimpleImputer(fill_value = 'missing_value'), cat_features)
            ])

    preprocessor = ColumnTransformer([
            ('scaler_num_ord', StandardScaler(), slice(0, len(num_features + ord_features))),
            ('categoricalencoder', None, slice(len(num_features + ord_features), \
                                                          len(num_features + ord_features + cat_features)))
        ])

    steps = [('col_trans', col_trans),
             ('preprocessor', preprocessor),
             ('model', None)]

    pipeline = Pipeline(steps)

    ttr = TransformedTargetRegressor(regressor = pipeline, transformer = None)

    for n_iter in n_iters:
        %run 8_pipeline_num_ord_cat.ipynb

Type of search: GS
{'regressor__col_trans__imp_cat_cols__strategy': 'most_frequent', 'regressor__col_trans__imp_num_cols__strategy': 'most_frequent', 'regressor__col_trans__imp_ord_cols__strategy': 'most_frequent', 'regressor__model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                    weights='uniform'), 'regressor__model__n_neighbors': 1, 'regressor__model__weights': 'uniform', 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'transformer': None}
Best CV score: 1.253525E+06
Test score: 1.319504E+06
##############################################

Type of search: RS
{'transformer': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__scaler_num_ord': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 90, 'regressor__model__loss': 'square', 'regressor__model__learning_rate': 0.05, 'regressor__model': AdaBoostRegressor(base_estimator=None, learning_rate=0.05, loss='square',
                  n_estimators=90, random_state=1), 'regressor__col_trans__imp_ord_cols__strategy': 'mean', 'regressor__col_trans__imp_num_cols__strategy': 'mean', 'regressor__col_trans__imp_cat_cols__strategy': 'most_frequent'}
Best CV score: 1.154062E+06
Test score: 1.077877E+06
#################################################################################

Type of search: GS
{'regressor__col_trans__imp_cat_cols__strategy': 'constant', 'regressor__col_trans__imp_num_cols__strategy': 'mean', 'regressor__col_trans__imp_ord_cols__strategy': 'most_frequent', 'regressor__model': ElasticNet(alpha=0.01, copy_X=True, fit_intercept=False, l1_ratio=0.95,
           max_iter=1000000, normalize=False, positive=False, precompute=False,
           random_state=1, selection='cyclic', tol=0.0001, warm_start=False), 'regressor__model__alpha': 0.01, 'regressor__model__fit_intercept': False, 'regressor__model__l1_ratio': 0.95, 'regressor__model__normalize': False, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__scaler_num_ord': StandardScaler(copy=True, with_mean=True, with_std=True), 'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0

Type of search: RS
{'transformer': PowerTransformer(copy=True, method='yeo-johnson', standardize=True), 'regressor__preprocessor__scaler_num_ord': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__subsample': 0.7, 'regressor__model__n_estimators': 90, 'regressor__model__min_samples_split': 8, 'regressor__model__min_samples_leaf': 1, 'regressor__model__max_features': 2, 'regressor__model__max_depth': 100, 'regressor__model__loss': 'lad', 'regressor__model__learning_rate': 0.1, 'regressor__model__lambda': 1, 'regressor__model__colsample_bytree': 1, 'regressor__model__alpha': 0, 'regressor__model': XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=

Type of search: RS
{'transformer': PowerTransformer(copy=True, method='yeo-johnson', standardize=True), 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 40, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 1, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': 100, 'regressor__model__bootstrap': True, 'regressor__model': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=100, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split

Type of search: GS
{'regressor__col_trans__imp_cat_cols__strategy': 'constant', 'regressor__col_trans__imp_num_cols__strategy': 'mean', 'regressor__col_trans__imp_ord_cols__strategy': 'most_frequent', 'regressor__model': Lasso(alpha=0.01, copy_X=True, fit_intercept=False, max_iter=100000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False), 'regressor__model__alpha': 0.01, 'regressor__model__fit_intercept': False, 'regressor__model__normalize': False, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__scaler_num_ord': StandardScaler(copy=True, with_mean=True, with_std=True), 'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)}
Best CV

Type of search: RS
{'transformer': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 50, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 3, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': 10, 'regressor__model__loss': 'quantile', 'regressor__model__learning_rate': 0.05, 'regressor__model': GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='quantile',
                          max_depth=10, max_features='auto',
             

Type of search: RS
{'transformer': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 80, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 1, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': None, 'regressor__model__bootstrap': True, 'regressor__model': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, mi

Type of search: RS
{'transformer': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__weights': 'distance', 'regressor__model__n_neighbors': 3, 'regressor__model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='distance'), 'regressor__col_trans__imp_ord_cols__strategy': 'median', 'regressor__col_trans__imp_num_cols__strategy': 'median', 'regressor__col_trans__imp_cat_cols__strategy': 'most_frequent'}
Best CV sc

Type of search: RS
{'transformer': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__scaler_num_ord': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 50, 'regressor__model__loss': 'exponential', 'regressor__model__learning_rate': 0.1, 'regressor__model': AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='exponential',
                  n_estimators=50, random_state=1), 'regressor__col_trans__imp_ord_cols__strategy': 'mean', 'regressor__col_trans__imp_num_cols__strategy': 'median', 'regressor__col_trans__imp_cat_cols__strategy': 'most_frequent'}
Best CV score: 1.091739E+06
Test score: 1.000081E+06
#######################################################################

In [9]:
# Print the hold-out test score recorded for every run as (run name, score) pairs;
# the captured output lists them from the lowest score upwards.
# NOTE(review): print_key_value_sorted is not defined in this notebook — presumably it comes
# from 4_pipeline_functions.ipynb; confirm.
print_key_value_sorted(test_scores)

('xgbregressor_RS_n_iter1000_num_ord_cat_features', '3.927704E+05')
('xgbregressor_RS_n_iter1000_num_ord_features', '4.021800E+05')
('xgbregressor_RS_n_iter10000_num_ord_features', '4.066801E+05')
('xgbregressor_RS_n_iter1000_num_ord_cat_features_skipPosition', '4.119046E+05')
('xgbregressor_RS_n_iter10000_num_ord_cat_features', '4.186841E+05')
('xgbregressor_RS_n_iter10000_num_ord_cat_features_skipPosition', '4.305628E+05')
('randomforestregressor_RS_n_iter100_num_ord_cat_features_skipPosition', '4.322234E+05')
('randomforestregressor_RS_n_iter100_num_ord_features', '4.329406E+05')
('randomforestregressor_RS_n_iter1000_num_ord_cat_features', '4.341956E+05')
('randomforestregressor_RS_n_iter100_num_ord_cat_features', '4.341956E+05')
('randomforestregressor_RS_n_iter1000_num_ord_features', '4.357663E+05')
('xgbregressor_RS_n_iter100_num_ord_features', '4.375737E+05')
('randomforestregressor_RS_n_iter10000_num_ord_features', '4.426755E+05')
('randomforestregressor_RS_n_iter1000_num_ord_c

In [10]:
# Print the best cross-validation score recorded for every run as (run name, score) pairs;
# the captured output lists them from the lowest score upwards.
# NOTE(review): print_key_value_sorted is not defined in this notebook — presumably it comes
# from 4_pipeline_functions.ipynb; confirm.
print_key_value_sorted(best_CV_scores)

('gradientboostingregressor_RS_n_iter10000_num_ord_features', '5.749916E+05')
('gradientboostingregressor_RS_n_iter1000_num_ord_features', '5.766528E+05')
('xgbregressor_RS_n_iter10000_num_ord_features', '5.989173E+05')
('extratreesregressor_RS_n_iter10000_num_ord_features', '6.106671E+05')
('xgbregressor_RS_n_iter10000_num_ord_cat_features', '6.180426E+05')
('xgbregressor_RS_n_iter1000_num_ord_features', '6.220461E+05')
('xgbregressor_RS_n_iter10000_num_ord_cat_features_skipPosition', '6.220467E+05')
('xgbregressor_RS_n_iter100_num_ord_features', '6.228664E+05')
('extratreesregressor_RS_n_iter10000_num_ord_cat_features_skipPosition', '6.359289E+05')
('xgbregressor_RS_n_iter1000_num_ord_cat_features_skipPosition', '6.398743E+05')
('extratreesregressor_RS_n_iter1000_num_ord_features', '6.415050E+05')
('extratreesregressor_RS_n_iter10000_num_ord_cat_features', '6.453250E+05')
('gradientboostingregressor_RS_n_iter10000_num_ord_cat_features', '6.509422E+05')
('gradientboostingregressor_RS_