In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Adatok betöltése
train_data = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv')

# Célváltozó és jellemzők elkülönítése
X = train_data.drop(['Status', 'id'], axis=1)
y = train_data['Status']

# Célváltozó címkéinek konvertálása
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Kategorikus és numerikus változók azonosítása
categorical_features = X.select_dtypes(include=['object', 'bool']).columns
numerical_features = X.select_dtypes(exclude=['object', 'bool']).columns

# Előkészítő folyamat létrehozása
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Alapmodellek
models = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC(probability=True)),
    ('gnb', GaussianNB()),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier()),
    ('lgb', LGBMClassifier()),
    ('cat', CatBoostClassifier(verbose=0))
]

# Alapmodellek és előkészítő folyamat kombinálása
pipeline_models = [
    (name, Pipeline([('preprocessor', preprocessor), ('model', model)])) for name, model in models
]

# Stacked Ensemble Modell a pipeline modellekkel
stacked_model = StackingClassifier(estimators=pipeline_models, final_estimator=LogisticRegression())

# Adatkészlet felosztása
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Modell illesztése
stacked_model.fit(X_train, y_train_encoded)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
from sklearn.metrics import log_loss

# Eredmények tárolása
model_results = {}

# Külön-külön modellek értékelése
for name, model_pipeline in pipeline_models:
    model_pipeline.fit(X_train, y_train_encoded)
    val_predictions = model_pipeline.predict_proba(X_val)
    val_score = log_loss(y_val_encoded, val_predictions)
    model_results[name] = val_score
    print(f"{name} Validation Log Loss: {val_score}")

# Stacked modell illesztése és értékelése
stacked_model.fit(X_train, y_train_encoded)
stacked_val_predictions = stacked_model.predict_proba(X_val)
stacked_val_score = log_loss(y_val_encoded, stacked_val_predictions)
model_results['stacked'] = stacked_val_score
print(f"Stacked Model Validation Log Loss: {stacked_val_score}")

# Eredmények kiírása
print("\nModel Performance Comparison:")
for name, score in model_results.items():
    print(f"{name}: Log Loss = {score}")


lr Validation Log Loss: 0.5278518182151819
rf Validation Log Loss: 0.5240022525313943
gb Validation Log Loss: 0.4416982079962455
dt Validation Log Loss: 9.711952146593237
svc Validation Log Loss: 0.5094189886380727
gnb Validation Log Loss: 2.7253543424365874
knn Validation Log Loss: 2.9240187230819292
xgb Validation Log Loss: 0.5107226968469903
lgb Validation Log Loss: 0.462115188825359
cat Validation Log Loss: 0.456879616053268


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacked Model Validation Log Loss: 0.44831909367672096

Model Performance Comparison:
lr: Log Loss = 0.5278518182151819
rf: Log Loss = 0.5240022525313943
gb: Log Loss = 0.4416982079962455
dt: Log Loss = 9.711952146593237
svc: Log Loss = 0.5094189886380727
gnb: Log Loss = 2.7253543424365874
knn: Log Loss = 2.9240187230819292
xgb: Log Loss = 0.5107226968469903
lgb: Log Loss = 0.462115188825359
cat: Log Loss = 0.456879616053268
stacked: Log Loss = 0.44831909367672096


In [27]:
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import HalvingGridSearchCV

In [31]:


# Frissített alapmodellek
models = [
    ('lr', LogisticRegression(max_iter=5000)),  # Növelt iterációk
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('xgb', XGBClassifier()),
    ('lgb', LGBMClassifier()),
    ('cat', CatBoostClassifier(verbose=0))
]

# Paraméterrácsok a HalvingGridSearch-hez
param_grids = {
    'lr': {'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'rf': {'model__n_estimators': [100, 200, 500], 'model__max_depth': [10, 20, 30, None]},
    'gb': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.001, 0.01, 0.1, 0.2], 'model__max_depth': [3, 5, 10]},
    'xgb': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2, 0.3], 'model__max_depth': [3, 6, 10]},
    'lgb': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2], 'model__num_leaves': [31, 50, 100]},
    'cat': {'model__iterations': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2], 'model__depth': [4, 6, 10]}
}


In [34]:
from sklearn.model_selection import StratifiedKFold

stratified_kfold = StratifiedKFold(n_splits=5)

# HalvingGridSearch beállítása minden modellhez a paraméterkereséshez
best_params = {}
for name, model in models:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    search = HalvingGridSearchCV(pipeline, param_grids[name], cv=stratified_kfold, scoring='neg_log_loss', verbose=1)
    search.fit(X_train, y_train_encoded)
    best_params[name] = {k.replace('model__', ''): v for k, v in search.best_params_.items()}
    print(f"{name} Best Params: {best_params[name]}")

# Az alapmodellek újbóli létrehozása a megtalált legjobb paraméterekkel
optimized_models = [
    (name, Pipeline([
        ('preprocessor', preprocessor),
        ('model', model(**best_params[name]))
    ])) for name, model in models
]

# Stacked Ensemble Modell a legjobb paraméterekkel
optimized_stacked_model = StackingClassifier(estimators=optimized_models, final_estimator=LogisticRegression())

# Modell illesztése
optimized_stacked_model.fit(X_train, y_train_encoded)

# Kiértékelés
optimized_val_predictions = optimized_stacked_model.predict_proba(X_val)
optimized_val_score = log_loss(y_val_encoded, optimized_val_predictions)
print(f"Optimized Stacked Model Validation Log Loss: {optimized_val_score}")



n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 2108
max_resources_: 6324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6
n_resources: 2108
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 2
n_resources: 6324
Fitting 5 folds for each of 2 candidates, totalling 10 fits
lr Best Params: {'C': 0.1}
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 702
max_resources_: 6324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 702
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 1
n_candidates: 4
n_resources: 2106
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 6318
Fitting 5 folds for each of 2 candidates, totalling 10 fits
rf Best Params: {'max_depth': 20, 'n_estimators': 500}
n_iterations: 4
n_required_iterations: 4
n_possible_itera

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 2635, in log_loss
    raise ValueError(
ValueError: y_true and y_pred contain different number of classes 2, 3. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 2]

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/py

----------
iter: 1
n_candidates: 9
n_resources: 702
Fitting 5 folds for each of 9 candidates, totalling 45 fits


         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -0.52772747 -0.52772747 -0.51924733
 -0.51924733 -0.51924733 -0.54315769 -0.54315769 -1.145084   -1.41776408]


----------
iter: 2
n_candidates: 3
n_resources: 2106
Fitting 5 folds for each of 3 candidates, totalling 15 fits


         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -0.52772747 -0.52772747 -0.51924733
 -0.51924733 -0.51924733 -0.54315769 -0.54315769 -1.145084   -1.41776408
 -0.4917224  -0.49155505 -0.49156864]


----------
iter: 3
n_candidates: 1
n_resources: 6318
Fitting 5 folds for each of 1 candidates, totalling 5 fits


         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -0.52772747 -0.52772747 -0.51924733
 -0.51924733 -0.51924733 -0.54315769 -0.54315769 -1.145084   -1.41776408
 -0.4917224  -0.49155505 -0.49156864 -0.47046936]


lgb Best Params: {'learning_rate': 0.01, 'n_estimators': 200, 'num_leaves': 50}
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 234
max_resources_: 6324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 27
n_resources: 234
Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 