In [60]:
!pip install xgboost



In [61]:
import pandas as pd
import numpy as np

import os

# Location of the semicolon-separated CSV. The BANK_DATA_PATH environment
# variable overrides it, so the notebook can run on another machine without
# editing this cell; when it is unset the original local path is used.
file_path = os.environ.get(
    'BANK_DATA_PATH',
    r'C:\Users\serik\OneDrive\Рабочий стол\Новая папка\bank-additional\bank-additional-full.csv',
)
df = pd.read_csv(file_path, sep=';')

# Features are every column except the target "y" and "duration".
# NOTE(review): "duration" is presumably dropped because it is not known
# before the outcome is — confirm against the dataset description.
X = df.drop(columns=["y", "duration"])
y = df["y"]

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=1,
    stratify=y,
)

In [63]:
# Column labels of the training features, grouped by dtype:
# 64-bit integer/float columns versus object-dtype columns.
numeric_features = X_train.select_dtypes(
    include=['int64', 'float64'],
).columns
categorical_features = X_train.select_dtypes(
    include=['object'],
).columns

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ],
)

In [65]:
from sklearn.base import BaseEstimator, TransformerMixin

class UnknownToNaN(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces a placeholder label with NaN.

    Cells equal to ``unknown_label`` are replaced by ``np.nan`` so that a
    downstream imputer can treat them as missing values.

    Parameters
    ----------
    unknown_label : str, default='unknown'
        The value that marks a missing entry in the input.
    """

    def __init__(self, unknown_label='unknown'):
        self.unknown_label = unknown_label

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present for the estimator API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``unknown_label`` replaced by NaN.

        pandas DataFrame/Series input is returned as the same pandas type.
        Any other array-like input is routed through a temporary DataFrame
        and returned as a NumPy array, so the transformer also works when it
        receives plain arrays instead of raising ``AttributeError``.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.replace(self.unknown_label, np.nan)
        # Non-pandas input has no .replace(); borrow pandas' implementation.
        return pd.DataFrame(X).replace(self.unknown_label, np.nan).to_numpy()

In [66]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Categorical columns routed to the one-hot encoding branch.
ohe_features = [
    'job',
    'month',
    'day_of_week',
]

# Categorical columns routed to the ordinal (integer-code) encoding branch.
oe_features = [
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
]

In [67]:
# One-hot branch: turn the 'unknown' placeholder into NaN, impute with the
# most frequent category, then one-hot encode. Categories not seen during
# fit are encoded as an all-zero row instead of raising.
categorical_transformer_ohe = Pipeline(steps=[
    ('unknown_to_nan', UnknownToNaN(unknown_label='unknown')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Ordinal branch: same cleaning steps, then integer codes per category.
# With its default settings OrdinalEncoder raises on a category that was
# absent from the fitting data (possible for rare categories inside a CV
# fold or at prediction time). Mapping such values to -1 keeps this branch
# consistent with the one-hot branch above, and leaves the codes of known
# categories unchanged.
categorical_transformer_oe = Pipeline(steps=[
    ('unknown_to_nan', UnknownToNaN(unknown_label='unknown')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1))])


In [68]:
# Route each group of columns to its own preprocessing branch. Each entry is
# (name, transformer, columns); the names become part of the nested parameter
# paths, so they are kept exactly as they were.
preprocessor = ColumnTransformer(
    transformers=[
        ('dog', numeric_transformer, numeric_features),  # numeric branch
        ('cat_ohe', categorical_transformer_ohe, ohe_features),
        ('cat_oe', categorical_transformer_oe, oe_features),
    ],
)

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier


# Level-0 learners of the stack, each seeded for repeatability.
base_models = [
    (
        'logistic_regression',
        LogisticRegression(random_state=1, max_iter=1000),
    ),
    (
        'random_forest',
        RandomForestClassifier(random_state=1, n_jobs=-1),
    ),
    (
        'gradient_boosting',
        GradientBoostingClassifier(random_state=1),
    ),
]

# A logistic regression combines the base models' predictions, which are
# produced with 5-fold cross-validation.
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
)

# End-to-end model: preprocessing followed by the stacked classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', stacking_classifier),
    ],
)


In [70]:
from sklearn.model_selection import cross_val_score

# Fit preprocessing and the stacked classifier on the training split.
pipeline.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training split, using all CPU cores.
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)


In [71]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the held-out validation rows.
y_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

print(f"Accuracy on validation set: {accuracy}")

Accuracy on validation set: 0.9024034959941734
