## 08: Эксперименты с признаками (оригинальные названия колонок)

Здесь собраны функции:
- run_experiment(df_filtered, columns, experiment_name, test_size=0.2, random_state=42) — обучает модели (Ridge и RandomForest) на выбранных колонках заранее отфильтрованного DataFrame; выводит таблицу MAE и R².
- compute_target_correlations(df, target='corrosion_rate', feature_cols=None, top_k=30, method='pearson', return_best_features=True) — считает корреляции числовых признаков с целью (Пирсон или Спирмен) и выводит топ по абсолютной величине.

Колонки передаются под оригинальными именами из таблицы (без маппинга).


In [80]:
# Imports and loader setup
import os, sys
import numpy as np
import pandas as pd

# Add '../src' to the module search path once (guarded so re-running the cell
# does not append it again); presumably this is where `database` lives — confirm.
if '../src' not in sys.path:
    sys.path.append('../src')
from database import load_corrosion_data


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Printed only if every import above succeeded
print('ОК')


ОК


In [2]:
# Load the dataset and fail fast if the target column is missing
TARGET = 'corrosion_rate'
DF = load_corrosion_data()
assert TARGET in DF.columns, 'В данных отсутствует corrosion_rate'

# Report the size of the data, then the alphabetically sorted column names
print(f"Данные загружены: {len(DF):,} строк, {len(DF.columns)} колонок")
print('Список колонок:', sorted(DF.columns), sep='\n')

Данные загружены: 442,052 строк, 64 колонок
Список колонок:
['acetic_acid_content', 'ammonia_content', 'ammonium_content', 'butane_content', 'butylene_content', 'chlorine_content', 'co2_content', 'component', 'component_type_id', 'component_type_name', 'contour', 'corrosion_inhibitor_content', 'corrosion_rate', 'cross_sectional_area', 'diameter_to_thickness_ratio', 'diesel_content', 'equipment', 'ethane_content', 'ethylene_content', 'gasoline_c6_c8_content', 'h2s_content', 'heavy_naphtha_content', 'hexane_content', 'hydrochloric_acid_content', 'hydrogen_content', 'hydrogen_fluoride_content', 'id', 'inner_diameter', 'installation', 'is_replaced', 'isobutane_content', 'isopentane_content', 'kerosene_content', 'material_code', 'material_resistance_score', 'material_type', 'measurement', 'measurement_date', 'methane_content', 'mms', 'naphthenic_acid_content', 'nitrogen_content', 'nominal_thickness_mmc', 'operating_pressure', 'operating_temperature', 'outer_diameter', 'oxygen_content', 'pen

In [16]:
# Show the distinct installation names, but only when the column exists
if 'installation' in DF:
    print("Доступные установки:", sorted(DF['installation'].unique()))

Доступные установки: ['KK-2', 'АВТ-1', 'АВТ-2', 'АВТ-5', 'АВТ-6', 'КК']


In [42]:
def compute_target_correlations(df: pd.DataFrame, target: str = TARGET,
                                feature_cols: list | None = None,
                                top_k: int = 30, method: str = 'pearson',
                                return_best_features: bool = True) -> tuple[pd.DataFrame, list] | pd.DataFrame:
    """
    Rank numeric features by the absolute value of their correlation with the target.

    Rows with a missing target are dropped before anything is computed.
    Features whose correlation is undefined (constant or all-NaN columns)
    are left out of the ranking. The ranking is also printed as a table.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    target : str
        Name of the target column.
    feature_cols : list, optional
        Features to analyse. Names that are not numeric columns of ``df``
        (or that equal ``target``) are silently ignored. If None or empty,
        every numeric column except the target is used.
    top_k : int
        Number of top features to keep; a falsy value (0 / None) keeps all.
    method : str
        Correlation method: 'pearson' or 'spearman'.
    return_best_features : bool
        If True, return ``(DataFrame, list)``; otherwise only the DataFrame.

    Returns
    -------
    tuple[pd.DataFrame, list] or pd.DataFrame
        DataFrame with columns 'feature' and 'corr', ordered by descending
        absolute correlation, plus (optionally) the feature names in the
        same order.

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``df`` or ``method`` is unsupported.
    """
    if target not in df.columns:
        raise ValueError(f"Целевая колонка '{target}' отсутствует")

    # Only numeric columns can be correlated; select_dtypes already returns
    # a new frame, so no extra copy is needed.
    numeric = df.select_dtypes(include=[np.number])
    if feature_cols is not None and len(feature_cols) > 0:
        feature_cols = [c for c in feature_cols if c in numeric.columns and c != target]
    else:
        feature_cols = [c for c in numeric.columns if c != target]

    # Drop rows without a target value
    numeric = numeric.loc[numeric[target].notna()]
    y = numeric[target]

    if method not in ('pearson', 'spearman'):
        raise ValueError("method должен быть 'pearson' или 'spearman'")

    # Columns with fewer than two distinct values have zero variance: their
    # correlation is NaN (dropped below anyway) and computing it only emits
    # numpy divide warnings, so they are skipped up front. The mask is
    # positional so duplicated column names do not break the selection.
    features = numeric[feature_cols]
    varying = (features.nunique(dropna=True) > 1).to_numpy()
    features = features.loc[:, varying]

    def _spearman(col: pd.Series) -> float:
        # Spearman is Pearson on ranks of the *paired* observations, so both
        # sides are ranked only over rows where the feature is present
        # (the target has no NaN at this point).
        present = col.notna()
        return col[present].rank().corr(y[present].rank())

    if features.shape[1] == 0:
        corr_series = pd.Series(dtype=float)
    elif method == 'pearson':
        corr_series = features.corrwith(y)
    else:
        corr_series = features.apply(_spearman)

    # Order by absolute correlation, strongest first
    res = corr_series.dropna().sort_values(key=lambda s: s.abs(), ascending=False)
    if top_k:
        res = res.head(top_k)

    out_df = pd.DataFrame({'feature': res.index, 'corr': res.values})
    best_features = res.index.tolist()

    # The header reports how many features are actually listed, which may be
    # fewer than the requested top_k (or all of them when top_k is falsy).
    print("ТОП-{} ПРИЗНАКОВ ПО КОРРЕЛЯЦИИ С {}:".format(len(best_features), target))
    print("=" * 60)
    for i, (feature, corr) in enumerate(zip(out_df['feature'], out_df['corr']), 1):
        # Stars mark the magnitude of the correlation, not statistical significance
        magnitude = abs(corr)
        if magnitude > 0.3:
            marker = "***"
        elif magnitude > 0.2:
            marker = "** "
        elif magnitude > 0.1:
            marker = "*  "
        else:
            marker = "   "
        direction = "↑" if corr > 0 else "↓"
        print(f"{i:2d}. {marker} {feature:30} : {corr:+.4f} {direction}")

    print(f"\nВсего проанализировано признаков: {len(feature_cols)}")
    print(f"Возвращено топ-признаков: {len(best_features)}")

    if return_best_features:
        return out_df, best_features
    return out_df

In [95]:
# Функция запуска эксперимента
def run_experiment(df_filtered: pd.DataFrame, columns: list, experiment_name: str,
                   test_size: float = 0.2, random_state: int = 42) -> pd.DataFrame:
    """
    Fit Ridge and RandomForest regressors on the chosen columns and report hold-out MAE / R2.

    Parameters
    ----------
    df_filtered : pd.DataFrame
        Already filtered data to train on.
    columns : list
        Non-empty list (or tuple) of column names to use as features.
    experiment_name : str
        Label copied into the 'experiment' column of the result.
    test_size : float
        Share of rows held out for evaluation (default 0.2).
    random_state : int
        Seed for the split and the models (default 42).

    Returns
    -------
    pd.DataFrame
        One row per model with columns: experiment, context, model, MAE, R2,
        n_samples, n_features. The table is also printed.

    Raises
    ------
    ValueError
        If ``columns`` is not a non-empty list/tuple or ``df_filtered`` has
        fewer than 10 rows.
    """
    if not (isinstance(columns, (list, tuple)) and len(columns) > 0):
        raise ValueError('columns должен быть непустым списком колонок')

    if len(df_filtered) < 10:
        raise ValueError(f"Слишком мало данных для обучения: {len(df_filtered)} строк")

    # Work on a copy so the caller's frame is never touched
    df_use = df_filtered.copy()
    # 'context' records the number of input rows, before feature/target preparation
    context = f"n_samples={len(df_use)}"

    # NOTE(review): build_xy_from_columns is not defined in the visible part of
    # the notebook; presumably it returns the feature matrix and target vector
    # and may drop rows — confirm against its definition.
    X, y = build_xy_from_columns(df_use, columns, target=TARGET)

    if len(X) < 50:
        print(f"Предупреждение: мало данных для обучения: {len(X)} строк")

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Models are evaluated in this order; Ridge is scaled without centering
    # (with_mean=False), presumably to stay compatible with sparse input — confirm.
    estimators = (
        ('Ridge', Pipeline([
            ('scaler', StandardScaler(with_mean=False)),
            ('model', Ridge(alpha=1.0, random_state=random_state)),
        ])),
        ('RandomForest', RandomForestRegressor(n_estimators=30, n_jobs=-1,
                                               random_state=random_state)),
    )

    rows = []
    for label, estimator in estimators:
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_valid)
        rows.append({
            'experiment': experiment_name,
            'context': context,
            'model': label,
            'MAE': mean_absolute_error(y_valid, predicted),
            'R2': r2_score(y_valid, predicted),
            'n_samples': len(X),
            'n_features': len(columns),
        })

    res = pd.DataFrame(rows)
    print(res.to_string(index=False))
    return res

In [21]:
# Keep only the rows of a single installation
# NOTE(review): the result is stored as df_kk_1 although the filter value is 'KK-2' — confirm the intended installation.
INSTALLATION_FILTER = 'KK-2'
df_kk_1 = DF.loc[DF['installation'].eq(INSTALLATION_FILTER)]

print(f"После фильтрации по '{INSTALLATION_FILTER}': {len(df_kk_1):,} строк")

После фильтрации по 'KK-2': 75,510 строк


In [120]:
# Candidate feature columns.
# NOTE(review): `cols` is not passed to compute_target_correlations below (no
# feature_cols argument), so the ranking covers every numeric column of
# df_kk_1 — confirm whether feature_cols=cols was intended.
cols = ['acetic_acid_content', 'ammonia_content', 'ammonium_content', 'butane_content', 'butylene_content', 'chlorine_content', 'co2_content', 'component_type_id',
        'corrosion_inhibitor_content', 'cross_sectional_area', 'diameter_to_thickness_ratio', 'diesel_content', 'ethane_content', 'ethylene_content', 'gasoline_c6_c8_content', 'h2s_content', 
        'heavy_naphtha_content', 'hexane_content', 'hydrochloric_acid_content', 'hydrogen_content',
        'hydrogen_fluoride_content', 'inner_diameter', 'installation', 'is_replaced', 'isobutane_content', 'isopentane_content', 'kerosene_content', 'material_code', 
        'material_resistance_score', 'material_type',  'methane_content', 'naphthenic_acid_content', 'nitrogen_content', 'nominal_thickness_mmc',
        'operating_pressure', 'operating_temperature', 'outer_diameter', 'oxygen_content', 'pentane_content', 'propane_content', 'propylene_content', 'radius', 'residues_content',
        'sodium_hydroxide_content', 'sulfur_content', 'sulfuric_acid_content', 'total_acids', 'total_chlorine_compounds', 'total_components', 
        'total_composition', 'total_sulfur_compounds', 'wall_thickness', 'water_content']



# Rank the 20 features most correlated (by absolute value) with the target
corr_df, best_features = compute_target_correlations(df_kk_1, TARGET, top_k=20)
print(f"Лучшие признаки: {best_features}")

ТОП-20 ПРИЗНАКОВ ПО КОРРЕЛЯЦИИ С corrosion_rate:
 1. *   residues_content               : +0.1061 ↑
 2.     butane_content                 : +0.0864 ↑
 3.     isopentane_content             : +0.0702 ↑
 4.     chlorine_content               : +0.0685 ↑
 5.     butylene_content               : +0.0678 ↑
 6.     isobutane_content              : +0.0610 ↑
 7.     kerosene_content               : -0.0588 ↓
 8.     pentane_content                : +0.0529 ↑
 9.     tmin_mmc                       : +0.0513 ↑
10.     nominal_thickness_mmc          : +0.0512 ↑
11.     wall_thickness                 : +0.0489 ↑
12.     nitrogen_content               : -0.0452 ↓
13.     h2s_content                    : +0.0451 ↑
14.     heavy_naphtha_content          : -0.0447 ↓
15.     ethane_content                 : -0.0429 ↓
16.     diesel_content                 : -0.0421 ↓
17.     radius                         : +0.0389 ↑
18.     outer_diameter                 : +0.0389 ↑
19.     cross_sectional_area     

  c /= stddev[:, None]
  c /= stddev[None, :]


In [54]:
# Experiment on df_kk_1 with two feature columns.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id']
# Author's marker: 'cross_sectional_area' rated '+' (presumably: keep the feature)
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047467 0.012349      37814           2
    1_KK-1 n_samples=75510 RandomForest 0.043952 0.246160      37814           2


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047467,0.012349,37814,2
1,1_KK-1,n_samples=75510,RandomForest,0.043952,0.24616,37814,2


In [57]:
# Experiment on df_kk_1: 'nominal_thickness_mmc' added to the feature list.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc']
# Author's marker: 'nominal_thickness_mmc' rated '+' (presumably: keep the feature)
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047712 0.030116      37814           3
    1_KK-1 n_samples=75510 RandomForest 0.043945 0.246158      37814           3


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047712,0.030116,37814,3
1,1_KK-1,n_samples=75510,RandomForest,0.043945,0.246158,37814,3


In [61]:
# Experiment on df_kk_1: 'h2s_content' added to the feature list.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content']
# Author's marker: 'h2s_content' rated '+' (presumably: keep the feature)
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047703 0.034404      37814           4
    1_KK-1 n_samples=75510 RandomForest 0.042822 0.327708      37814           4


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047703,0.034404,37814,4
1,1_KK-1,n_samples=75510,RandomForest,0.042822,0.327708,37814,4


In [64]:
# Experiment on df_kk_1: 'operating_pressure' added to the feature list.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','operating_pressure']
# Author's marker: 'operating_pressure' rated '-' (presumably: drop the feature)
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047718 0.033840      37814           5
    1_KK-1 n_samples=75510 RandomForest 0.043078 0.313552      37814           5


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047718,0.03384,37814,5
1,1_KK-1,n_samples=75510,RandomForest,0.043078,0.313552,37814,5


In [69]:
# Experiment on df_kk_1: 'operating_temperature' used as the fifth feature.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','operating_temperature']
# Author's marker: 'operating_temperature' rated '-' (presumably: drop the feature)
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047621 0.038263      37814           5
    1_KK-1 n_samples=75510 RandomForest 0.042900 0.317462      37814           5


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047621,0.038263,37814,5
1,1_KK-1,n_samples=75510,RandomForest,0.0429,0.317462,37814,5


In [130]:
# Experiment on df_kk_1: 'residues_content' used as the fifth feature.
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','residues_content']
# NOTE(review): the original marker here read "'co2_content' -", but cols contains 'residues_content', not 'co2_content' — the marker looks stale.
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047687 0.034984      37814           5
    1_KK-1 n_samples=75510 RandomForest 0.042727 0.333729      37814           5


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047687,0.034984,37814,5
1,1_KK-1,n_samples=75510,RandomForest,0.042727,0.333729,37814,5


In [128]:

# Experiment on df_kk_1 with 'residues_content' as the fifth feature
# (same feature list and call as the preceding cell).
# NOTE(review): the label says 'KK-1' while df_kk_1 is filtered by 'KK-2' — confirm the experiment name.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','residues_content']
# NOTE(review): the original marker here read "'co2_content' -", but cols contains 'residues_content', not 'co2_content' — the marker looks stale.
run_experiment(df_kk_1, cols, experiment_name='1_KK-1')

experiment         context        model      MAE       R2  n_samples  n_features
    1_KK-1 n_samples=75510        Ridge 0.047687 0.034984      37814           5
    1_KK-1 n_samples=75510 RandomForest 0.042727 0.333729      37814           5


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_KK-1,n_samples=75510,Ridge,0.047687,0.034984,37814,5
1,1_KK-1,n_samples=75510,RandomForest,0.042727,0.333729,37814,5


In [122]:
# Keep only the rows of a single installation
INSTALLATION_FILTER = 'АВТ-5'
df_avt_5 = DF.loc[DF['installation'].eq(INSTALLATION_FILTER)]

print(f"После фильтрации по '{INSTALLATION_FILTER}': {len(df_avt_5):,} строк")

# Candidate feature columns.
# NOTE(review): `cols` is not passed to compute_target_correlations below (no
# feature_cols argument), so the ranking covers every numeric column of
# df_avt_5 — confirm whether feature_cols=cols was intended.
cols = ['acetic_acid_content', 'ammonia_content', 'ammonium_content', 'butane_content', 'butylene_content', 'chlorine_content', 'co2_content', 'component_type_id',
        'corrosion_inhibitor_content', 'cross_sectional_area', 'diameter_to_thickness_ratio', 'diesel_content', 'ethane_content', 'ethylene_content', 'gasoline_c6_c8_content', 'h2s_content', 
        'heavy_naphtha_content', 'hexane_content', 'hydrochloric_acid_content', 'hydrogen_content',
        'hydrogen_fluoride_content', 'inner_diameter', 'installation', 'is_replaced', 'isobutane_content', 'isopentane_content', 'kerosene_content', 'material_code', 
        'material_resistance_score', 'material_type',  'methane_content', 'naphthenic_acid_content', 'nitrogen_content', 'nominal_thickness_mmc',
        'operating_pressure', 'operating_temperature', 'outer_diameter', 'oxygen_content', 'pentane_content', 'propane_content', 'propylene_content', 'radius', 'residues_content',
        'sodium_hydroxide_content', 'sulfur_content', 'sulfuric_acid_content', 'total_acids', 'total_chlorine_compounds', 'total_components', 
        'total_composition', 'total_sulfur_compounds', 'wall_thickness', 'water_content']



# Rank the 20 features most correlated (by absolute value) with the target
corr_df, best_features = compute_target_correlations(df_avt_5, TARGET, top_k=20)
print(f"Лучшие признаки: {best_features}")

После фильтрации по 'АВТ-5': 76,014 строк
ТОП-20 ПРИЗНАКОВ ПО КОРРЕЛЯЦИИ С corrosion_rate:
 1.     pentane_content                : +0.0658 ↑
 2.     methane_content                : +0.0571 ↑
 3.     butane_content                 : -0.0495 ↓
 4.     ethane_content                 : +0.0450 ↑
 5.     operating_temperature          : +0.0293 ↑
 6.     nominal_thickness_mmc          : +0.0218 ↑
 7.     wall_thickness                 : +0.0218 ↑
 8.     co2_content                    : +0.0210 ↑
 9.     residues_content               : +0.0185 ↑
10.     chlorine_content               : +0.0144 ↑
11.     operating_pressure             : -0.0139 ↓
12.     cross_sectional_area           : +0.0121 ↑
13.     tmin_mmc                       : +0.0090 ↑
14.     outer_diameter                 : +0.0084 ↑
15.     radius                         : +0.0084 ↑
16.     heavy_naphtha_content          : -0.0081 ↓
17.     kerosene_content               : -0.0080 ↓
18.     inner_diameter                 : +

  c /= stddev[:, None]
  c /= stddev[None, :]


In [117]:
# Experiment on df_avt_5 with six feature columns.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','operating_temperature','operating_pressure']
# Author's marker: 'operating_temperature' rated '-'.
# NOTE(review): cols still contains 'operating_temperature' (and 'operating_pressure') — confirm the marker is current.
run_experiment(df_avt_5, cols, experiment_name='1_АВТ-5')

experiment         context        model      MAE       R2  n_samples  n_features
   1_АВТ-5 n_samples=76014        Ridge 0.046765 0.052685      59667           6
   1_АВТ-5 n_samples=76014 RandomForest 0.041357 0.253063      59667           6


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_АВТ-5,n_samples=76014,Ridge,0.046765,0.052685,59667,6
1,1_АВТ-5,n_samples=76014,RandomForest,0.041357,0.253063,59667,6


In [124]:
# Experiment on df_avt_5: 'pentane_content' added as the seventh feature.
cols = ['cross_sectional_area','component_type_id','nominal_thickness_mmc','h2s_content','operating_temperature','operating_pressure','pentane_content']
# NOTE(review): the original marker here read "'operating_temperature' -", but the last column in cols is 'pentane_content' — the marker looks stale.
run_experiment(df_avt_5, cols, experiment_name='1_АВТ-5')

experiment         context        model      MAE       R2  n_samples  n_features
   1_АВТ-5 n_samples=76014        Ridge 0.046818 0.053213      59667           7
   1_АВТ-5 n_samples=76014 RandomForest 0.041347 0.253288      59667           7


Unnamed: 0,experiment,context,model,MAE,R2,n_samples,n_features
0,1_АВТ-5,n_samples=76014,Ridge,0.046818,0.053213,59667,7
1,1_АВТ-5,n_samples=76014,RandomForest,0.041347,0.253288,59667,7
