# Космический корабль Титаник

---


#### Boosting models

In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.exceptions import DataConversionWarning
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings('ignore')

pip install catboost

In [3]:
def check_missing_values(df):
    """Print a per-column summary of missing values and report whether any exist.

    Args:
        df: DataFrame to inspect.

    Returns:
        True when at least one null cell is present (the columns that contain
        nulls are printed together with their null counts), otherwise False.
    """
    null_counts = df.isnull().sum()
    total_nulls = null_counts.sum()

    # Guard clause: nothing is missing, so say so and leave early.
    if not total_nulls > 0:
        print("Пропущенные значения не обнаружены.")
        return False

    # Keep only the columns that actually contain nulls for the report.
    affected_columns = null_counts[null_counts > 0]
    print("Обнаружены пропущенные значения:")
    print(affected_columns)
    return True

In [4]:
def plot_confusion_matrix(y_true, y_pred, model_name="Model"):
    """Show the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name inserted into the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred)

    # Explicit figure/axes pair instead of the implicit pyplot "current axes".
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

In [5]:
def plot_roc_curve(y_true, y_proba, model_name="Model"):
    """Plot the ROC curve of a classifier together with the chance diagonal.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Scores passed to ``roc_curve`` as the prediction scores.
        model_name: Name inserted into the figure title.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_proba)
    area = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (area = {area:.2f})')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic (ROC) - {model_name}')
    ax.legend(loc="lower right")
    plt.show()

In [9]:
# Optional gradient-boosting back-ends. Each library is imported if present;
# otherwise a hint is printed and the matching *_AVAILABLE flag is set to
# False so that later code can skip that model instead of failing.

try:
    import xgboost as xgb
    from xgboost import XGBClassifier
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost is not installed.  Install it with: pip install xgboost")
else:
    XGB_AVAILABLE = True

try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except ImportError:
    LGBM_AVAILABLE = False
    print("LightGBM is not installed. Install it with: pip install lightgbm")
else:
    LGBM_AVAILABLE = True

try:
    import catboost as cb
    from catboost import CatBoostClassifier
except ImportError:
    CATBOOST_AVAILABLE = False
    print("CatBoost is not installed. Install it with: pip install catboost")
else:
    CATBOOST_AVAILABLE = True

In [11]:
def random_forest_pipeline_with_imputation(train_data, target_column, imputation_strategy='median', knn_neighbors=5, random_state=42):
    """Grid-search a preprocessing + RandomForest pipeline on an 80% training split.

    Numeric columns are imputed (mean, median or KNN) and standardised;
    all other columns are imputed with the most frequent value and one-hot
    encoded. The hyper-parameter grid is searched with 3-fold cross-validation
    scored by accuracy.

    Args:
        train_data: DataFrame containing the features and the target column.
        target_column: Name of the target column; its values are cast to int.
        imputation_strategy: 'mean', 'median' or 'knn'. 'knn' falls back to
            'median' (with a printed warning) when there are no numeric columns.
        knn_neighbors: Number of neighbours for the KNN imputer.
        random_state: Seed for the train/validation split and the classifier.

    Returns:
        ``(best_estimator, best_params)`` from the grid search, or
        ``(None, None)`` if fitting the grid search raised an exception
        (the error is printed).

    Raises:
        ValueError: If ``imputation_strategy`` is not one of the supported
            values, or if NaNs remain after preprocessing the training split.
    """
    X = train_data.drop(target_column, axis=1)
    y = train_data[target_column].astype(int)
    # Only the training part is used here; the held-out 20% is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=random_state)

    numerical_features = X_train.select_dtypes(include=['number']).columns
    categorical_features = X_train.select_dtypes(exclude=['number']).columns

    if imputation_strategy not in ('mean', 'median', 'knn'):
        raise ValueError("Invalid imputation strategy. Must be 'mean', 'median', or 'knn'.")

    if imputation_strategy == 'knn' and len(numerical_features) == 0:
        # KNN imputation needs numeric columns to work on; fall back to median.
        imputation_strategy = 'median'
        print("Warning: No numerical features for KNN imputation. Switching to median imputation.")

    if imputation_strategy == 'knn':
        numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
    else:
        numerical_imputer = SimpleImputer(strategy=imputation_strategy)

    numerical_transformer = Pipeline(steps=[
        ('imputer', numerical_imputer),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier(random_state=random_state))])

    param_grid = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Sanity check: the preprocessing must not leave NaNs behind.
    X_train_processed = preprocessor.fit_transform(X_train)
    if hasattr(X_train_processed, 'toarray'):
        # Sparse output: look only at the explicitly stored values. Implicit
        # zeros cannot be NaN, and densifying a wide one-hot matrix with
        # .toarray() can exhaust memory.
        stored_values = X_train_processed.tocsr().data
    else:
        stored_values = X_train_processed
    if np.isnan(stored_values).any():
        raise ValueError("NaN values present in X_train_processed after preprocessing.")

    # Best-effort fit: a failure is reported and signalled with (None, None)
    # so that a caller looping over many configurations can continue.
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error during GridSearchCV fitting: {e}")
        return None, None

    return grid_search.best_estimator_, grid_search.best_params_

In [12]:
def boosting_pipeline_with_imputation(train_data, target_column, boosting_type='gradient_boosting', imputation_strategy='median', knn_neighbors=5, random_state=42):
    """Grid-search a preprocessing + classifier pipeline on an 80% training split.

    Numeric columns are imputed (mean, median or KNN) and standardised;
    all other columns are imputed with the most frequent value and one-hot
    encoded. The classifier and its hyper-parameter grid are chosen by
    ``boosting_type`` and searched with 3-fold cross-validation scored by
    accuracy.

    Args:
        train_data: DataFrame containing the features and the target column.
        target_column: Name of the target column; its values are cast to int.
        boosting_type: One of 'gradient_boosting', 'adaboost', 'xgboost',
            'lightgbm', 'catboost' or 'logistic_regression'.
        imputation_strategy: 'mean', 'median' or 'knn'. 'knn' falls back to
            'median' (with a printed warning) when there are no numeric columns.
        knn_neighbors: Number of neighbours for the KNN imputer.
        random_state: Seed for the train/validation split and the classifier.

    Returns:
        ``(best_estimator, best_params)`` from the grid search, or
        ``(None, None)`` if the requested optional library is not available
        or fitting the grid search raised an exception (a message is printed
        in both cases).

    Raises:
        ValueError: If ``imputation_strategy`` or ``boosting_type`` is not one
            of the supported values, or if NaNs remain after preprocessing
            the training split.
    """
    X = train_data.drop(target_column, axis=1)
    y = train_data[target_column].astype(int)
    # Only the training part is used here; the held-out 20% is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=random_state)

    numerical_features = X_train.select_dtypes(include=['number']).columns
    categorical_features = X_train.select_dtypes(exclude=['number']).columns

    if imputation_strategy not in ('mean', 'median', 'knn'):
        raise ValueError("Invalid imputation strategy. Must be 'mean', 'median', or 'knn'.")

    if imputation_strategy == 'knn' and len(numerical_features) == 0:
        # KNN imputation needs numeric columns to work on; fall back to median.
        imputation_strategy = 'median'
        print("Warning: No numerical features for KNN imputation. Switching to median imputation.")

    if imputation_strategy == 'knn':
        numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
    else:
        numerical_imputer = SimpleImputer(strategy=imputation_strategy)

    numerical_transformer = Pipeline(steps=[
        ('imputer', numerical_imputer),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Pick the classifier and the grid to search. Keys carry the
    # 'classifier__' prefix because the estimator is the pipeline step
    # named 'classifier'.
    if boosting_type == 'gradient_boosting':
        classifier = GradientBoostingClassifier(random_state=random_state)
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7],
            'classifier__min_samples_split': [2, 4],
            'classifier__min_samples_leaf': [1, 2]
        }
    elif boosting_type == 'adaboost':
        classifier = AdaBoostClassifier(random_state=random_state)
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2]
        }
    elif boosting_type == 'xgboost':
        if not XGB_AVAILABLE:
            print("XGBoost is not available. Skipping.")
            return None, None
        classifier = XGBClassifier(random_state=random_state, use_label_encoder=False, eval_metric='logloss')
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7],
            'classifier__subsample': [0.8, 1.0],
            'classifier__colsample_bytree': [0.8, 1.0]
        }
    elif boosting_type == 'lightgbm':
        if not LGBM_AVAILABLE:
            print("LightGBM is not available. Skipping.")
            return None, None
        classifier = LGBMClassifier(random_state=random_state)
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7],
            'classifier__num_leaves': [20, 31, 40],
            'classifier__reg_alpha': [0.0, 0.1],    # L1 regularization
            'classifier__reg_lambda': [0.0, 0.1]   # L2 regularization
        }
    elif boosting_type == 'catboost':
        if not CATBOOST_AVAILABLE:
            print("CatBoost is not available. Skipping.")
            return None, None

        classifier = CatBoostClassifier(random_state=random_state, verbose=0)  # verbose=0 to suppress training output
        param_grid = {
            'classifier__iterations': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__depth': [3, 5, 7],
            'classifier__l2_leaf_reg': [1, 3, 5]  # L2 regularization
        }
    elif boosting_type == 'logistic_regression':
        # Not a boosting model, but tuned through the same pipeline and grid search.
        classifier = LogisticRegression(random_state=random_state, solver='liblinear')
        param_grid = {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.1, 1.0, 10.0]
        }
    else:
        raise ValueError("Invalid boosting type. Must be 'gradient_boosting', 'adaboost', 'xgboost', 'lightgbm', 'catboost', or 'logistic_regression'.")

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])

    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Sanity check: the preprocessing must not leave NaNs behind.
    X_train_processed = preprocessor.fit_transform(X_train)
    if hasattr(X_train_processed, 'toarray'):
        # Sparse output: look only at the explicitly stored values. Implicit
        # zeros cannot be NaN, and densifying a wide one-hot matrix with
        # .toarray() can exhaust memory.
        stored_values = X_train_processed.tocsr().data
    else:
        stored_values = X_train_processed
    if np.isnan(stored_values).any():
        raise ValueError("NaN values present in X_train_processed after preprocessing.")

    # Best-effort fit: a failure is reported and signalled with (None, None)
    # so that a caller looping over many configurations can continue.
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error during GridSearchCV fitting: {e}")
        return None, None

    return grid_search.best_estimator_, grid_search.best_params_

In [14]:
if __name__ == '__main__':
    # Driver: load the data, tune every (model, imputation) combination,
    # then compare the tuned models on a held-out validation split.
    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')

    # Encode the target as 1/0 by going through its string form ('True'/'False').
    train_data['Transported'] = train_data['Transported'].astype(str).map({'True': 1, 'False': 0})
    target_column = 'Transported'

    # One seed shared by the training functions and the validation split
    # below, so that the validation rows here are exactly the rows the
    # training functions held out (both use test_size=0.2).
    random_state = 42

    print("Checking missing values in training data:")
    has_missing_train = check_missing_values(train_data)
    print("\nChecking missing values in test data:")
    has_missing_test = check_missing_values(test_data)

    imputation_strategies = ['mean', 'median', 'knn']
    boosting_types = ['gradient_boosting', 'adaboost', 'xgboost', 'lightgbm', 'catboost']

    all_models = []

    for imputation_strategy in imputation_strategies:
        print(f"\nTraining Random Forest with imputation strategy: {imputation_strategy}")
        pipeline, params = random_forest_pipeline_with_imputation(
            train_data.copy(), target_column, imputation_strategy=imputation_strategy,
            random_state=random_state
        )
        # The training functions return (None, None) on failure; test for
        # None explicitly rather than relying on the pipeline's truthiness.
        if pipeline is not None:
            print(f"Best parameters for Random Forest ({imputation_strategy}): {params}")
            all_models.append((f"RandomForest_{imputation_strategy}", pipeline))

        for boosting_type in boosting_types:
            print(f"\nTraining {boosting_type} with imputation strategy: {imputation_strategy}")
            pipeline, params = boosting_pipeline_with_imputation(
                train_data.copy(), target_column, boosting_type=boosting_type,
                imputation_strategy=imputation_strategy, random_state=random_state
            )
            if pipeline is not None:
                print(f"Best parameters for {boosting_type} ({imputation_strategy}): {params}")
                all_models.append((f"{boosting_type}_{imputation_strategy}", pipeline))


    print("\nTraining complete. Trained the following models:")
    for model_name, model in all_models:
        print(f"- {model_name}")

    if all_models:
        X = train_data.drop(target_column, axis=1)
        y = train_data[target_column].astype(int)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)

        best_model_name = None
        best_accuracy = float('-inf')

        print("\nEvaluating models on validation set:")
        for model_name, model in all_models:
            try:
                y_pred = model.predict(X_val)
                accuracy = accuracy_score(y_val, y_pred)
                print(f"{model_name} accuracy: {accuracy:.4f}")
            except Exception as e:
                # Best-effort: one failing model must not stop the comparison.
                print(f"Error during prediction with {model_name}: {e}")
            else:
                # Strict '>' keeps the first model on ties.
                if accuracy > best_accuracy:
                    best_model_name, best_accuracy = model_name, accuracy

        if best_model_name is not None:
            print(f"\nThe best model on validation set is {best_model_name} with accuracy {best_accuracy:.4f}")

Checking missing values in training data:
Обнаружены пропущенные значения:
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64

Checking missing values in test data:
Обнаружены пропущенные значения:
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

Training Random Forest with imputation strategy: mean
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters for Random Forest (mean): {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}

Training gradient_boosting with imputation strategy

The best model on validation set is gradient_boosting_median with accuracy 0.7901