In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# Silence library warnings so the cell output stays readable (this does not speed up computation)
import warnings
warnings.filterwarnings('ignore')


In [12]:
def load_and_preprocess():
    """Read the training features and target from ``./departure-delayed``.

    The ``Year`` column is skipped while parsing the features file, and every
    missing value in the features is replaced with 0.

    Returns:
        tuple: ``(train, y, real_columns)`` where ``train`` is the feature
        DataFrame, ``y`` is the target DataFrame indexed by ``id``, and
        ``real_columns`` is the list of feature column names whose dtype is
        not ``object``.
    """
    def _keep_column(column_name):
        # Drop the unused 'Year' column at parse time instead of after loading.
        return column_name != 'Year'

    # NOTE(review): the features are read without index_col while the target
    # uses index_col='id'. If the features file also carries an 'id' column it
    # ends up in real_columns as a regular feature -- confirm this is intended.
    features = pd.read_csv(
        './departure-delayed/train_features.csv',
        usecols=_keep_column,
    )
    features = features.fillna(0)

    target = pd.read_csv('./departure-delayed/train_target.csv', index_col='id')

    non_object_frame = features.select_dtypes(exclude=['object'])
    numeric_names = non_object_frame.columns.tolist()

    return features, target, numeric_names

# Load the data once; `train`, `y` and `real_columns` are notebook-level names reused by later cells.
train, y, real_columns = load_and_preprocess()


In [13]:
def visual_clfs_mae(min_samples_leaf_values, mae_values, mae_train, output_path='mae_plot.png'):
    """Plot train and validation MAE against ``min_samples_leaf`` and save the figure.

    The figure is written to disk and then closed, so nothing is left open in
    pyplot's figure registry (even if plotting or saving raises).

    Args:
        min_samples_leaf_values: x-axis values; drawn on a logarithmic scale.
        mae_values: validation MAE per x value (blue line, labelled 'Valid').
        mae_train: training MAE per x value (red line, labelled 'Train').
        output_path: where the image is saved. Defaults to 'mae_plot.png',
            which is the file name this function has always written.

    Returns:
        None
    """
    # Work on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", so exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(min_samples_leaf_values, mae_train, 'r-o', label='Train')
        ax.plot(min_samples_leaf_values, mae_values, 'b-o', label='Valid')
        ax.set_title('MAE vs min_samples_leaf')
        ax.legend()
        ax.set_xlabel('min_samples_leaf')
        ax.set_ylabel('Mean Absolute Error')
        ax.grid(True)
        ax.set_xscale('log')
        fig.savefig(output_path)
    finally:
        # Always release the figure; otherwise a failed save leaks it.
        plt.close(fig)


In [14]:
def train_clfs(train, y, real_columns, min_samples_leaf_values=(1, 5, 20, 100, 500, 1000)):
    """Fit one decision tree per ``min_samples_leaf`` value and collect MAE scores.

    The data is split 80/20 into train and validation parts with a fixed seed,
    then a ``DecisionTreeRegressor`` is trained for every candidate value.

    Args:
        train: feature DataFrame.
        y: target; its ``.values`` are passed to the estimator as-is.
        real_columns: names of the columns of ``train`` used as features.
        min_samples_leaf_values: iterable of ``min_samples_leaf`` candidates.
            The default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        tuple: ``(mae_values, mae_train)`` -- validation MAE first, then
        training MAE, each a list aligned with ``min_samples_leaf_values``.
    """
    # Plain numpy arrays are handed to scikit-learn instead of pandas objects.
    X = train[real_columns].values
    y_values = y.values

    x_train, x_val, y_train, y_val = train_test_split(
        X, y_values,
        train_size=0.8,
        random_state=42
    )

    mae_values = []
    mae_train = []

    for leaf in min_samples_leaf_values:
        model = DecisionTreeRegressor(
            min_samples_leaf=leaf,
            random_state=42
        )
        model.fit(x_train, y_train)

        # Score on both parts so over- and under-fitting can be compared.
        mae_train.append(mean_absolute_error(y_train, model.predict(x_train)))
        mae_values.append(mean_absolute_error(y_val, model.predict(x_val)))

    return mae_values, mae_train

In [15]:
# [3] Fit the tuned ("best") decision tree
def get_optimized_model(train, y, real_columns, max_depth=21, min_samples_leaf=1000):
    """Train a single decision tree with the given hyper-parameters.

    The data is split 80/20 into train and validation parts with a fixed seed;
    the tree is fitted on the train part and scored on the validation part.

    Args:
        train: feature DataFrame.
        y: target; its ``.values`` are passed to the estimator as-is.
        real_columns: names of the columns of ``train`` used as features.
        max_depth: maximum depth of the tree.
        min_samples_leaf: minimum number of samples per leaf.

    Returns:
        tuple: ``(x_train, x_val, y_train, y_val, model, val_mae)`` -- the
        four split arrays, the fitted tree, and its MAE on the validation part.
    """
    feature_matrix = train[real_columns].values
    target_array = y.values

    split_parts = train_test_split(
        feature_matrix, target_array,
        train_size=0.8,
        random_state=42,
    )
    x_train, x_val, y_train, y_val = split_parts

    tree_settings = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'random_state': 42,
    }
    model = DecisionTreeRegressor(**tree_settings)
    model.fit(x_train, y_train)

    val_predictions = model.predict(x_val)
    val_mae = mean_absolute_error(y_val, val_predictions)

    return x_train, x_val, y_train, y_val, model, val_mae

# Fit the tuned tree and keep the split arrays; these notebook-level names are reused by the grid-search cells.
# The printed value is the MAE on the hold-out validation part (x_val / y_val).
x_train, x_val, y_train, y_val, best_model, mae = get_optimized_model(train, y, real_columns)
print(f'Best model MAE: {mae}')

Best model MAE: 33.02198247215921


In [16]:
# Hyper-parameter grid for bagging on top of the tuned decision tree.
param_grid = {
    'n_estimators': [4, 8],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0]
}

# `best_model` only serves as the base-estimator template here.
bagging_regressor = BaggingRegressor(
    estimator=best_model, 
    random_state=42, 
    n_jobs=-1
)

# scoring is the *negated* MAE, so best_score_ has to be negated when reported.
grid_search = GridSearchCV(
    bagging_regressor,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1
)

# ravel() flattens the target to 1-D (y_train comes from DataFrame.values,
# so it is presumably a single-column 2-D array -- confirm).
grid_search.fit(x_train, y_train.ravel())
print(f'Best Bagging parameters: {grid_search.best_params_}')
print(f'Best Bagging MAE: {-grid_search.best_score_}')

# The MAE above is the mean 3-fold cross-validation score on the training
# split, whereas the single-tree MAE was measured on the hold-out split.
# Score the refitted best estimator on that same hold-out split so the two
# numbers can be compared directly.
bagging_val_mae = mean_absolute_error(y_val.ravel(), grid_search.predict(x_val))
print(f'Best Bagging validation MAE: {bagging_val_mae}')


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Bagging parameters: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 8}
Best Bagging MAE: 33.12221202879778


In [17]:
# Hyper-parameter grid for the random forest.
param_grid_rf = {
    'n_estimators': [4, 8],
    'max_depth': [6, 8],
    'max_features': ['sqrt']
}

# verbose=1 makes the forest print joblib progress lines while it fits/predicts.
forest_regressor = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# scoring is the *negated* MAE, so best_score_ has to be negated when reported.
grid_search_rf = GridSearchCV(
    forest_regressor,
    param_grid_rf,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1
)

# ravel() flattens the target to 1-D before fitting.
grid_search_rf.fit(x_train, y_train.ravel())
print(f'Best RF parameters: {grid_search_rf.best_params_}')
print(f'Best RF MAE: {-grid_search_rf.best_score_}')

# The MAE above is the mean 3-fold cross-validation score on the training
# split, whereas the single-tree MAE was measured on the hold-out split.
# Score the refitted best estimator on that same hold-out split so the two
# numbers can be compared directly.
rf_val_mae = mean_absolute_error(y_val.ravel(), grid_search_rf.predict(x_val))
print(f'Best RF validation MAE: {rf_val_mae}')


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


Best RF parameters: {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 8}
Best RF MAE: 33.46334314311063


[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.5s finished
