# FIFA Pipeline best model

Re-create the best model and improve it. Does it generalize well to different train-test splits?

In [33]:
%run 4_pipeline_functions.ipynb
%run 5_pipeline_clean.ipynb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                2000 non-null   int64  
 1   Name                      2000 non-null   object 
 2   Age                       2000 non-null   int64  
 3   Nationality               2000 non-null   object 
 4   Overall                   2000 non-null   int64  
 5   Club                      1969 non-null   object 
 6   Value                     2000 non-null   object 
 7   Wage                      2000 non-null   object 
 8   Preferred Foot            1997 non-null   object 
 9   International Reputation  1997 non-null   float64
 10  Weak Foot                 1997 non-null   float64
 11  Skill Moves               1997 non-null   float64
 12  Work Rate                 1997 non-null   object 
 13  Body Type                 1997 non-null   object 
 14  Position

In [34]:
# Separate the regression target ('Value') from the predictor columns.
y = df['Value']
X = df.drop(columns = 'Value')

# Key handed to get_train_test_sets together with X.
set_features = 'num_ord_cat_features'

# NOTE(review): y is not passed to the helper; presumably it is read from the
# notebook namespace -- confirm in 4_pipeline_functions.ipynb.
X_train, X_test, y_train, y_test = get_train_test_sets(X, set_features)

# Stage 1: one imputer per feature group, in the order num -> ord -> cat.
# The imputation strategies are chosen later by the hyper-parameter search.
imputers = [
    ('imp_num_cols', SimpleImputer(), num_features),
    ('imp_ord_cols', SimpleImputer(), ord_features),
    ('imp_cat_cols', SimpleImputer(fill_value = 'missing_value'), cat_features),
]
col_trans = ColumnTransformer(imputers)

# Stage 2 addresses columns by position: the numeric and ordinal columns come
# first, the categorical columns directly after them.
n_num_ord_cols = len(num_features + ord_features)
n_all_cols = len(num_features + ord_features + cat_features)

preprocessor = ColumnTransformer([
    ('scaler_num_ord', StandardScaler(), slice(0, n_num_ord_cols)),
    # Placeholder: the encoder itself is supplied by the search.
    ('categoricalencoder', None, slice(n_num_ord_cols, n_all_cols)),
])

steps = [
    ('col_trans', col_trans),
    ('preprocessor', preprocessor),
    ('model', None),  # placeholder, the regressor is supplied by the search
]
pipeline = Pipeline(steps)

# Wrap the pipeline so that a transformation of the target can be tuned too.
ttr = TransformedTargetRegressor(regressor = pipeline, transformer = None)

In [35]:
# Number of parameter settings drawn by the randomized search.
n_iter = 100

gbr = GradientBoostingRegressor(random_state = 1)
# NOTE(review): `key` is not used in the cells shown here; presumably it is
# consumed by a helper or another cell -- confirm before removing.
key = 'gradientboostingregressor'

# Search space for the TransformedTargetRegressor `ttr`. Keys prefixed with
# 'regressor__' address steps of the wrapped pipeline; 'transformer' is the
# transformation applied to the target.
# Kept as a one-element list of dicts on purpose (same container type as before).
param_grid = [{'regressor__col_trans__imp_num_cols__strategy': ['most_frequent', 'mean', 'median'],
               'regressor__col_trans__imp_ord_cols__strategy': ['most_frequent', 'mean', 'median'],
               'regressor__col_trans__imp_cat_cols__strategy': ['most_frequent', 'constant'],
               # 'passthrough' (not None) is the ColumnTransformer spec for
               # "leave these columns unscaled"; None is rejected at fit time,
               # which turned every such candidate into a failed fit.
               'regressor__preprocessor__scaler_num_ord': ['passthrough', StandardScaler(), RobustScaler()],
               'regressor__preprocessor__categoricalencoder': [OneHotEncoder(handle_unknown = 'ignore')],
               # NOTE(review): scikit-learn releases before 1.0 reject
               # handle_unknown='ignore' combined with a non-None drop, so the
               # 'first' candidates would fail there -- confirm against the
               # installed version.
               'regressor__preprocessor__categoricalencoder__drop': [None, 'first'],
               'regressor__model': [gbr],
               'regressor__model__loss': ['ls', 'lad', 'huber', 'quantile'],
               'regressor__model__learning_rate': [0.01, 0.02, 0.05, 0.1, 1],
               'regressor__model__n_estimators': np.arange(10, 101, 10),
               'regressor__model__max_features': [2, 'auto', 'sqrt'],
               'regressor__model__max_depth': [None, 3, 5, 10, 50, 100],
               'regressor__model__min_samples_leaf': [1, 3, 5],
               'regressor__model__min_samples_split': [2, 4, 8],
               'transformer': [None, StandardScaler(), RobustScaler(), PowerTransformer()]}]

In [39]:
# Run the randomized hyper-parameter search once and cache the fitted search
# object on disk, so that re-running the notebook does not repeat it.
#
# NOTE: the cache is identified by its file name only. Delete the file after
# changing the pipeline, param_grid, n_iter or the training split, otherwise
# the stale result is loaded silently.
# NOTE: joblib.load unpickles the file; only load caches you created yourself.
filename = 'save_pkl/best_model.pkl'
if os.path.isfile(filename):
    my_s = joblib.load(filename)
else:
    # Create the cache folder before the long search starts, so the final
    # joblib.dump cannot fail on a missing directory after all the work is done.
    os.makedirs(os.path.dirname(filename), exist_ok = True)
    my_s = RandomizedSearchCV(ttr, param_distributions = param_grid, n_iter = n_iter, cv = 5,
                              scoring = 'neg_root_mean_squared_error', n_jobs = -1, verbose = 10,
                              random_state = 1)
    my_s = my_s.fit(X_train, y_train)
    joblib.dump(my_s, filename)

In [40]:
# Take the best configuration found by the search, fit it on the training
# split and predict the held-out test split.
best_est = my_s.best_estimator_
best_est.fit(X_train, y_train)
y_pred = best_est.predict(X_test)

# Root mean squared error on the test split.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Both scores are stored as strings in scientific notation. The search score
# is negated before formatting (the scorer is 'neg_root_mean_squared_error').
test_score = '{:E}'.format(test_rmse)
best_CV_score = '{:E}'.format(-my_s.best_score_)

print(my_s.best_params_)
print('Best CV score:', best_CV_score)
print('Test score:', test_score)

{'transformer': None, 'regressor__preprocessor__scaler_num_ord': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True), 'regressor__preprocessor__categoricalencoder__drop': None, 'regressor__preprocessor__categoricalencoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True), 'regressor__model__n_estimators': 60, 'regressor__model__min_samples_split': 4, 'regressor__model__min_samples_leaf': 5, 'regressor__model__max_features': 'auto', 'regressor__model__max_depth': None, 'regressor__model__loss': 'ls', 'regressor__model__learning_rate': 0.1, 'regressor__model': GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls',
                          max_depth=None, max_features='auto',
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                    

Investigate how well the model generalizes to other train-test splits.

In [28]:
# Repeat the evaluation on a different train-test split (random_state = 2).
# NOTE: this overwrites X_train / X_test / y_train / y_test and refits
# best_est in place, so cells above give different results if they are re-run
# after this one without re-creating the original split first.
X_train, X_test, y_train, y_test = get_train_test_sets(X, set_features, random_state = 2)
best_est.fit(X_train, y_train)
y_pred = best_est.predict(X_test)

# Root mean squared error on the new test split.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_score = '{:E}'.format(test_rmse)

# my_s is not refit in this cell: best_score_ is still the cross-validation
# score obtained during the search, not a score for this split.
best_CV_score = '{:E}'.format(-my_s.best_score_)

print('Best CV score:', best_CV_score)
print('Test score:', test_score)

Best CV score: 7.385445E+05
Test score: 5.368910E+05


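On this split the test RMSE (5.37E+05) differs noticeably from the best cross-validation RMSE (7.39E+05), which suggests that the error estimate is sensitive to how the data is split. There might be a risk of overfitting.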