In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, HuberRegressor, ElasticNet, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# from the regressors fitted further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test tables; the `id` column is removed from both.
df_train = pd.read_csv('train.csv').drop(columns='id')
df_test = pd.read_csv('test.csv').drop(columns='id')

# Submission table, loaded unchanged.
subm = pd.read_csv('submission.csv')

In [3]:
def add_engineered_features(frame, sex_encoder, fit_encoder=False):
    """Return a copy of ``frame`` with 'Sex' one-hot encoded and derived columns added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing 'Sex', 'Length', 'Diameter', 'Weight',
        'Shucked Weight', 'Viscera Weight' and 'Shell Weight'.
    sex_encoder : OneHotEncoder
        Encoder used for the 'Sex' column.
    fit_encoder : bool, default False
        When True the encoder is fitted on ``frame`` before encoding (use this
        for the training table only). When False the already fitted encoder is
        reused, so the encoded columns match the training table in name and order.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified.
    """
    if fit_encoder:
        sex_matrix = sex_encoder.fit_transform(frame[['Sex']])
    else:
        sex_matrix = sex_encoder.transform(frame[['Sex']])

    sex_df = pd.DataFrame(
        sex_matrix,
        columns=sex_encoder.get_feature_names_out(['Sex']),
        index=frame.index,  # keep rows aligned even if the index is not 0..n-1
    )
    out = pd.concat([frame, sex_df], axis=1)

    # Same drop list for train and test. 'Sex_Diameter' is not created anywhere
    # in this notebook, so errors='ignore' avoids a KeyError when it is absent.
    out = out.drop(columns=['Sex', 'Sex_Diameter'], errors='ignore')

    # Boolean threshold flags on the weight columns.
    out['Weight 2'] = out['Weight'] < 10
    out['Shucked Weight 2'] = out['Shucked Weight'] < 5
    out['Viscera Weight 2'] = out['Viscera Weight'] < 2
    out['Shell Weight 2'] = out['Shell Weight'] < 4

    # Product features.
    out['Size'] = out['Length'] * out['Diameter'] * out['Weight']
    out['Weight 3'] = out['Shucked Weight'] * out['Viscera Weight'] * out['Shell Weight']
    return out


# handle_unknown='ignore' encodes a category unseen during fitting as all zeros
# instead of raising, so the test table always gets the training columns.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training table only, then reuse it for the test table.
df_train = add_engineered_features(df_train, encoder, fit_encoder=True)
df_test = add_engineered_features(df_test, encoder)


In [4]:
# Target column and feature table (every column except 'Age').
y = df_train['Age']
X = df_train.drop(columns='Age')

# Expand the features with polynomial and interaction terms up to degree 2.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Hold out 19% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.19,
    random_state=42,
)

In [5]:
def _scaled_pipeline(regressor):
    """Wrap ``regressor`` in a pipeline that standardizes the features first."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', regressor),
    ])


# One standardize-then-fit pipeline per linear model.
huber_pipeline = _scaled_pipeline(HuberRegressor(epsilon=1.15, alpha=0.015))
ridge_pipeline = _scaled_pipeline(Ridge(alpha=0.1))
lasso_pipeline = _scaled_pipeline(Lasso(alpha=0.1))
elasticnet_pipeline = _scaled_pipeline(ElasticNet(alpha=0.05, l1_ratio=0.7))

In [11]:
# Hard-coded boolean column mask for the polynomial feature matrix.
# NOTE(review): presumably copied from `selector.support_` of the RFECV run
# kept below for reference — confirm before reusing with other features.
#
# selector = RFECV(estimator=HuberRegressor(epsilon=1.15, alpha=0.015),
#                  step=1,
#                  min_features_to_select=130,
#                  cv=5,
#                  scoring='neg_mean_absolute_error'
# )
# selector.fit(X_train, y_train)

# print(selector.support_)
# print(selector.ranking_)

# Positions that are False in the 153-entry mask; every other position is True.
excluded_columns = {109, 110, 111, 112, 113, 114, 118, 128}
selected_features = [position not in excluded_columns for position in range(153)]

# Rebuild the data with the selected features
X_train_rfecv = X_train[:, selected_features]
X_test_rfecv = X_test[:, selected_features]

KeyboardInterrupt: 

In [10]:
# Base learners for the stack: the four scaled linear pipelines, each labelled.
base_estimators = [
    ('huber', huber_pipeline),
    ('ridge', ridge_pipeline),
    ('lasso', lasso_pipeline),
    ('elasticnet', elasticnet_pipeline),
]

# A Huber regressor is the final estimator on top of the base learners.
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=HuberRegressor(epsilon=1.15, alpha=0.015),
)

# Full model: standardize the inputs, then run the stack.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('stacking_model', stacking_model),
])

# Fit on the masked training split and predict the held-out split.
# Recorded score for this variant: 1.282978 - 145
# (presumably 145 selected columns — TODO confirm).
model.fit(X_train_rfecv, y_train)
y_pred = model.predict(X_test_rfecv)

# Alternative without the feature mask, recorded score 1.282994:
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

MAE = mean_absolute_error(y_test, y_pred)
print(f"MAE: {MAE:.6f}")

MAE: 1.282978


## Submission

In [None]:
# Submission step, currently disabled: expand the test table with `poly`, keep the
# masked columns, predict with `model`, and write the predictions into `subm`.
# NOTE(review): `poly` is already fitted on the training features in an earlier
# cell, so `poly.transform(df_test)` is presumably the intended call — confirm.
# NOTE(review): the output path is the same 'submission.csv' that `subm` was read
# from, so running this overwrites that file.
# t = poly.fit_transform(df_test)[:, selected_features]
# y_pred = model.predict(t)
# subm['Age'] = y_pred
# subm.to_csv('submission.csv', index=False)