In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load the competition data. test_data is read here but is not used
# anywhere in this cell.
train_data = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv')

# Separate the target ('Status') from the features; 'id' is dropped as well
# so it is not fed to the models as a feature.
X = train_data.drop(['Status', 'id'], axis=1)
y = train_data['Status']

# Inspect how many rows fall into each target class.
class_distribution = y.value_counts()
print(class_distribution)

# Encode the target labels as integers; label_encoder keeps the mapping so
# predictions can be translated back with inverse_transform / classes_.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Columns with object/bool dtype are treated as categorical, all remaining
# columns as numerical.
categorical_features = X.select_dtypes(include=['object', 'bool']).columns
numerical_features = X.select_dtypes(exclude=['object', 'bool']).columns

# Preprocessing step shared by every model pipeline: standardize the
# numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that was absent from the rows
        # the encoder was fitted on (possible on small CV folds, subsampled
        # search iterations, or the test set) is encoded as all zeros instead
        # of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

Status
C     4965
D     2665
CL     275
Name: count, dtype: int64


In [2]:
# Single seed for the train/validation split and for every estimator that
# accepts one, so that re-running the notebook reproduces the same scores.
SEED = 42

# Base models. GaussianNB and KNeighborsClassifier take no seed;
# LogisticRegression is left unseeded because its default solver is
# deterministic.
models = [
    ('lr', LogisticRegression(max_iter=5000)),
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('gb', GradientBoostingClassifier(random_state=SEED)),
    ('dt', DecisionTreeClassifier(random_state=SEED)),
    # probability=True is required for predict_proba; the internal
    # calibration it triggers is randomized, hence the seed.
    ('svc', SVC(probability=True, random_state=SEED)),
    ('gnb', GaussianNB()),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier(random_state=SEED)),
    ('lgb', LGBMClassifier(random_state=SEED)),
    ('cat', CatBoostClassifier(verbose=0, random_seed=SEED))
]

# Wrap every base model in a pipeline that applies the preprocessing first.
# Note that all pipelines reference the same `preprocessor` object.
pipeline_models = [
    (name, Pipeline([('preprocessor', preprocessor), ('model', model)])) for name, model in models
]

# Stacked ensemble built on the pipeline models, with a logistic regression
# as the final (meta) estimator.
stacked_model = StackingClassifier(estimators=pipeline_models, final_estimator=LogisticRegression(max_iter=5000))

# Stratified 80/20 train/validation split on the encoded target.
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    X, y_encoded, test_size=0.2, random_state=SEED, stratify=y_encoded
)


In [3]:
from sklearn.metrics import log_loss

# Validation log loss of every evaluated estimator, keyed by its short name.
model_results = {}

# Fit each base pipeline on the training split and score its predicted
# class probabilities on the validation split.
for name, model_pipeline in pipeline_models:
    val_predictions = model_pipeline.fit(X_train, y_train_encoded).predict_proba(X_val)
    model_results[name] = val_score = log_loss(y_val_encoded, val_predictions)
    print(f"{name} Validation Log Loss: {val_score}")

# Same procedure for the stacked ensemble; its score is stored under 'stacked'.
stacked_val_predictions = stacked_model.fit(X_train, y_train_encoded).predict_proba(X_val)
model_results['stacked'] = stacked_val_score = log_loss(y_val_encoded, stacked_val_predictions)
print(f"Stacked Model Validation Log Loss: {stacked_val_score}")

# Recap of all collected scores in insertion order.
print("\nModel Performance Comparison:")
for name, score in model_results.items():
    print(f"{name}: Log Loss = {score}")


lr Validation Log Loss: 0.5457120876046133
rf Validation Log Loss: 0.5299432273009381
gb Validation Log Loss: 0.46201733559663405
dt Validation Log Loss: 9.483972049255367
svc Validation Log Loss: 0.5123949453254711
gnb Validation Log Loss: 4.282593356816437
knn Validation Log Loss: 2.4415062712369227
xgb Validation Log Loss: 0.5350469316932817
lgb Validation Log Loss: 0.482749500893966
cat Validation Log Loss: 0.47681603496743985



KeyboardInterrupt



In [4]:
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import HalvingGridSearchCV

In [None]:

# Models selected for tuning. Estimators with a stochastic component are
# seeded (42, the same value as the train/validation split) so the search
# picks the same winner on every run.
selected_models = {
    'lr': LogisticRegression(max_iter=5000),
    'rf': RandomForestClassifier(random_state=42),
    'gb': GradientBoostingClassifier(random_state=42),
    'xgb': XGBClassifier(random_state=42),
    'cat': CatBoostClassifier(verbose=0, random_seed=42),
    # probability=True is needed because the search scores with neg_log_loss.
    'svc': SVC(probability=True, random_state=42)
}

# Parameter grids, keyed like selected_models. The 'model__' prefix routes
# each parameter to the 'model' step of the pipeline built below.
param_grids = {
    'lr': {'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'rf': {'model__n_estimators': [100, 200, 500], 'model__max_depth': [10, 20, 30, None]},
    'gb': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.001, 0.01, 0.1, 0.2], 'model__max_depth': [3, 5, 10]},
    'xgb': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2, 0.3], 'model__max_depth': [3, 6, 10]},
    'cat': {'model__iterations': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2], 'model__depth': [4, 6, 10]},
    'svc': {'model__C': [0.1, 1, 10], 'model__gamma': [0.001, 0.01, 0.1, 1]}
}

# 5-fold stratified cross-validation used by every search.
stratified_kfold = StratifiedKFold(n_splits=5)

# Run one successive-halving grid search per model and keep the winning
# parameters, with the 'model__' prefix stripped so they can be passed
# straight to the estimator constructors.
best_params = {}
for name, model in selected_models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    search = HalvingGridSearchCV(
        pipeline,
        param_grids[name],
        cv=stratified_kfold,
        scoring='neg_log_loss',
        verbose=1,
        # The early halving iterations train on a random subsample of the
        # rows; seeding makes that subsample reproducible.
        random_state=42,
    )
    search.fit(X_train, y_train_encoded)
    best_params[name] = {k.replace('model__', ''): v for k, v in search.best_params_.items()}
    print(f"{name} Best Params: {best_params[name]}")


n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 2108
max_resources_: 6324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6
n_resources: 2108
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 2
n_resources: 6324
Fitting 5 folds for each of 2 candidates, totalling 10 fits
lr Best Params: {'C': 0.1}
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 702
max_resources_: 6324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 702
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 1
n_candidates: 4
n_resources: 2106
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 6318
Fitting 5 folds for each of 2 candidates, totalling 10 fits
rf Best Params: {'max_depth': 10, 'n_estimators': 200}
n_iterations: 4
n_required_iterations: 4
n_possible_itera

In [None]:
from sklearn.ensemble import StackingClassifier

# Estimators rebuilt with the best parameters found by the search, keyed by
# the same short names as best_params. Stochastic estimators are seeded
# (42, the same value as the train/validation split) for reproducibility.
tuned_estimators = {
    'lr': LogisticRegression(**best_params['lr'], max_iter=5000),
    'rf': RandomForestClassifier(**best_params['rf'], random_state=42),
    'gb': GradientBoostingClassifier(**best_params['gb'], random_state=42),
    'xgb': XGBClassifier(**best_params['xgb'], random_state=42),
    'cat': CatBoostClassifier(**best_params['cat'], verbose=0, random_seed=42),
    # probability=True is required so the stack can use predict_proba.
    'svc': SVC(**best_params['svc'], probability=True, random_state=42),
}

# Wrap each tuned estimator in a preprocessing pipeline; the list keeps the
# insertion order of tuned_estimators.
optimized_models = [
    (name, Pipeline([('preprocessor', preprocessor), ('model', estimator)]))
    for name, estimator in tuned_estimators.items()
]

# Stacked ensemble over the tuned pipelines. The meta-learner uses the same
# max_iter=5000 as every other LogisticRegression in this notebook, so it is
# not cut off at the default iteration limit.
final_estimator = LogisticRegression(max_iter=5000)
stacked_model = StackingClassifier(
    estimators=optimized_models,
    final_estimator=final_estimator
)

# Fit on the training split.
stacked_model.fit(X_train, y_train_encoded)

# Evaluate on the validation split with log loss on predicted probabilities.
stacked_val_predictions = stacked_model.predict_proba(X_val)
stacked_val_score = log_loss(y_val_encoded, stacked_val_predictions)
print(f"Optimized Stacked Model Validation Log Loss: {stacked_val_score}")
