In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from pathlib import Path
from scipy.stats import loguniform, uniform

In [2]:
# Folder holding the preprocessed splits, relative to the notebook location.
data_dir = Path("../../data/processed")

# Feature matrices are loaded as-is (DataFrames).
X_train, X_val, X_test = (
    pd.read_parquet(data_dir / file_name)
    for file_name in ("X_train.parquet", "X_val.parquet", "X_test.parquet")
)

# Targets are stored as parquet tables; squeeze() collapses length-1 axes
# (presumably turning a single-column table into a Series — verify the files).
y_train, y_val = (
    pd.read_parquet(data_dir / file_name).squeeze()
    for file_name in ("y_train.parquet", "y_val.parquet")
)

# Only the 'id' column of the test identifier table is needed for the submission.
test_ids = pd.read_parquet(data_dir / "test_ids.parquet")['id']

In [3]:
# Stack the train and validation splits row-wise (original index labels are kept)
# so that features and targets cover all labelled rows in the same order.
X_full_train, y_full_train = (
    pd.concat(parts)
    for parts in ([X_train, X_val], [y_train, y_val])
)

In [4]:
# Search space sampled by the randomized search.
#   - alpha:    log-uniform over [1e-3, 1e-1]
#   - l1_ratio: scipy's uniform takes (loc, scale), so this samples from
#               [0.001, 0.001 + 0.1] = [0.001, 0.101]
#   - interaction_only: whether the polynomial step drops pure power terms
param_distributions = {
    'elastic__alpha': loguniform(1e-3, 1e-1),
    'elastic__l1_ratio': uniform(loc=0.001, scale=0.1),
    'poly__interaction_only': [True, False],
}

# Model: degree-2 polynomial expansion -> standardisation -> ElasticNet.
# Step names must match the prefixes used in param_distributions above.
pipeline = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', StandardScaler()),
        ('elastic', ElasticNet(max_iter=5000)),
    ]
)

# 5 sampled candidates x 2 CV folds, scored by R2, run on all available cores.
search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=5,
    scoring='r2',
    cv=2,
    n_jobs=-1,
    verbose=1,
    random_state=69,  # fixed seed so the sampled candidates are reproducible
)

# Fit on the combined train+validation data; as the last expression of the
# cell this also renders the fitted search object.
search.fit(X_full_train, y_full_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


0,1,2
,estimator,Pipeline(step..._iter=5000))])
,param_distributions,"{'elastic__alpha': <scipy.stats....001FAFEC308D0>, 'elastic__l1_ratio': <scipy.stats....001FA9B2F0D90>, 'poly__interaction_only': [True, False]}"
,n_iter,5
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,2
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,69

0,1,2
,degree,2
,interaction_only,True
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0....9772821857747)
,l1_ratio,np.float64(0....7708368510905)
,fit_intercept,True
,precompute,False
,max_iter,5000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


In [5]:
# Keep the pipeline exposed by the search as its best estimator
# (refit=True is the default, so it is refit on the full data passed to fit()).
best_model = search.best_estimator_

# Report the winning hyperparameters and their mean cross-validated R2.
print("Лучшие найденные параметры:", search.best_params_, sep="\n")
print(f"Лучший R2 на кросс-валидации: {search.best_score_:.5f}")

Лучшие найденные параметры:
{'elastic__alpha': np.float64(0.0032149772821857747), 'elastic__l1_ratio': np.float64(0.011497708368510905), 'poly__interaction_only': True}
Лучший R2 на кросс-валидации: 0.84558


In [6]:
# Predict on the test features and clamp the outputs to the [0, 1] interval
# (the linear model itself is not bounded).
y_test_pred = np.clip(best_model.predict(X_test), 0, 1)

# Two-column submission table: test ids next to the clipped predictions.
# .values drops the ids' pandas index so rows are matched purely by position.
submission = pd.DataFrame(
    {'id': test_ids.values, 'FloodProbability': y_test_pred}
)

# Write the CSV next to the processed data, without the DataFrame index column.
submission.to_csv(
    data_dir / "submission_elastic_net_rs.csv",
    index=False,
)
print("Файл submission_elastic_net_rs.csv успешно сохранен!")

Файл submission_elastic_net_rs.csv успешно сохранен!
