In [39]:
!pip install xgboost



In [40]:
import pandas as pd
import numpy as np

import os

# Location of the bank-additional-full CSV. Set the BANK_DATA_PATH environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so existing runs are unaffected.
file_path = os.environ.get(
    'BANK_DATA_PATH',
    r'C:\Users\serik\OneDrive\Рабочий стол\Новая папка\bank-additional\bank-additional-full.csv',
)

# The file is semicolon-delimited.
df = pd.read_csv(file_path, sep=';')

# Features: every column except the target `y` and `duration`.
# NOTE(review): `duration` is presumably excluded because it is only known after
# the call has taken place (target leakage) -- confirm against the dataset notes.
X = df.drop(columns=["y", "duration"])
y = df["y"]

In [41]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from the raw target labels to integer codes first, then
# apply it. `le` stays available so the codes can be mapped back to the labels.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on the encoded target so both parts keep the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=1,
    stratify=y,
)

In [43]:
# Column labels of the 64-bit integer / float features (fed to the numeric branch).
numeric_features = (
    X_train
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

# Column labels of the object-typed (string) features.
categorical_features = (
    X_train
    .select_dtypes(include=['object'])
    .columns
)

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

class UnknownToNaN(BaseEstimator, TransformerMixin):
    """Replace a placeholder label (default ``'unknown'``) with ``NaN``.

    The transformer learns nothing from the data; it only rewrites cells equal
    to ``unknown_label`` so that a downstream imputer treats them as missing.

    Parameters
    ----------
    unknown_label : object, default='unknown'
        Value that marks a missing entry in the input.
    """

    def __init__(self, unknown_label='unknown'):
        self.unknown_label = unknown_label

    def fit(self, X, y=None):
        """Record the input column names (when available) and return ``self``."""
        if hasattr(X, 'columns'):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        elif hasattr(self, 'feature_names_in_'):
            # Re-fitted on unnamed data: drop names left over from an earlier fit.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``unknown_label`` replaced by ``NaN``.

        pandas objects are returned as pandas objects (same behaviour as
        before). Any other array-like is returned as an object-dtype NumPy
        array, so the transformer also works when it receives plain arrays.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.replace(self.unknown_label, np.nan)

        # np.array(...) copies, so the caller's data is never modified in place.
        values = np.array(X, dtype=object)
        values[values == self.unknown_label] = np.nan
        return values

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names, which equal the input names.

        Defining this method lets enclosing pipelines / column transformers
        report feature names and lets ``set_output`` be configured.

        Raises
        ------
        ValueError
            If ``input_features`` is None and no column names were seen in ``fit``.
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        if hasattr(self, 'feature_names_in_'):
            return self.feature_names_in_
        raise ValueError(
            "input_features must be provided when the transformer was not "
            "fitted on data with column names."
        )

In [46]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Categorical columns that are one-hot encoded.
ohe_features = [
    'job',
    'month',
    'day_of_week',
]

# Categorical columns that are mapped to integer codes by the ordinal encoder.
oe_features = [
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
]

In [47]:
# One-hot branch: treat 'unknown' as missing, fill it with the most frequent
# category, then one-hot encode. Categories not seen during fit are encoded as
# an all-zero row instead of raising.
categorical_transformer_ohe = Pipeline(steps=[
    ('unknown_to_nan', UnknownToNaN(unknown_label='unknown')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Ordinal branch: same missing-value handling, then integer codes.
# The encoder's default (handle_unknown='error') raises as soon as a CV fold or
# the validation set contains a category that was absent from the data it was
# fitted on. Map such categories to -1 (outside the 0..n-1 code range) so this
# branch tolerates unseen values just like the one-hot branch does.
categorical_transformer_oe = Pipeline(steps=[
    ('unknown_to_nan', UnknownToNaN(unknown_label='unknown')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1))])


In [48]:
# Route each group of columns through its own preprocessing branch.
# Step names are part of the parameter-name interface and are kept as they were.
preprocessor = ColumnTransformer(transformers=[
    # numeric columns -> median imputation + scaling
    ('dog', numeric_transformer, numeric_features),
    # one-hot encoded categorical columns
    ('cat_ohe', categorical_transformer_ohe, ohe_features),
    # ordinal encoded categorical columns
    ('cat_oe', categorical_transformer_oe, oe_features),
])

In [49]:
import xgboost

# Full model: preprocessing followed by a gradient-boosted tree classifier.
# `tree_method='gpu_hist'` is deprecated since XGBoost 2.0; the supported way to
# train on the GPU is the histogram method combined with `device='cuda'`.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgboost.XGBClassifier(tree_method='hist', device='cuda', n_jobs=-1)),
])

In [50]:
# Hyper-parameter grid for the 'classifier' step of the pipeline
# (6 x 5 x 7 = 210 combinations).
param_grid = {
    # number of boosting rounds
    'classifier__n_estimators': [50, 100, 200, 300, 400, 500],
    # shrinkage applied to each boosting step
    'classifier__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    # maximum tree depth: every integer from 2 to 8 inclusive
    'classifier__max_depth': list(range(2, 9)),
}

In [51]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split, using all available cores and ranking candidates by accuracy.
# NOTE(review): if the target classes are imbalanced, accuracy may be a weak
# selection criterion -- confirm the metric choice.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('dog',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')),
                                                                        ('cat_ohe',
                                                                         Pipeline(steps...
  

In [52]:
from sklearn.metrics import accuracy_score

# Take the best pipeline found by the grid search and score it on the
# held-out validation split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

print(f"Accuracy on validation set: {accuracy}")

Accuracy on validation set: 0.902282107307599
