In [None]:
import numpy as np
import pandas as pd

In [None]:
# new sklearn packages
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# already  known
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import TransformedTargetRegressor

In [None]:
# Business goal
# Train a regression model that predicts a penguin's body mass.

# Get data: the CSV is semicolon-separated.
penguins = pd.read_csv("penguins_simple.csv", sep=";")
# Preview the first rows instead of rendering the whole frame as cell output.
penguins.head()

In [None]:
# Feature Engineering
# Baseline transformer: forward the flipper length unchanged and use it as the
# only feature. The column name must match the DataFrame header exactly; the
# previous spelling 'Flipper Lengthn(mm)' did not match the
# 'Flipper Length (mm)' column used elsewhere in this notebook.
fe_baseline = ColumnTransformer(
    [
        ("do-nothing", "passthrough", ["Flipper Length (mm)"])
    ]
)

In [None]:
# Richer transformer: categorical columns are one-hot encoded and the flipper
# length is discretised into bins.
# It is bound to `fe_all_feature` only. The earlier chained assignment
# (`fe_all_feature = fe_baseline = ...`) also rebound `fe_baseline`, which
# silently replaced the flipper-length-only baseline transformer.
# NOTE(review): "species" is lower-case while the other column names used in
# this notebook are capitalised -- confirm it matches the CSV header.
fe_all_feature = ColumnTransformer(
    [
        # One-hot encode the categorical columns as a dense array;
        # drop="first" omits the first category of each column.
        ("ohe", OneHotEncoder(drop="first", sparse_output=False), ["species", "Sex"]),
        # Bin the flipper length (default binning settings) and emit one dense
        # indicator column per bin.
        ("binning_ohe", KBinsDiscretizer(encode="onehot-dense"), ["Flipper Length (mm)"]),
    ]
)

In [None]:
# Feature pipeline: the all-features transformer followed by interaction terms.
#
# PolynomialFeatures(interaction_only=True) adds products of distinct features
# and skips pure powers such as A^2. An interaction term captures the combined
# effect of several features, e.g. if Sex_MALE = 1 and Flipper_bin_2 = 1 then
# Sex_MALE x Flipper_bin_2 = 1, which lets a linear model pick up relationships
# that the individual columns cannot express on their own.
fe_all_feature_poly = make_pipeline(
    fe_all_feature,
    PolynomialFeatures(interaction_only=True),
)

In [None]:
 # Train Models
# Baseline model: ordinary least squares on top of the baseline transformer.
lin_baseline = make_pipeline(
    fe_baseline,
    LinearRegression(),
)

In [None]:
# Linear regression on the encoded categorical and binned features.
lin_all = make_pipeline(
    fe_all_feature,
    LinearRegression(),
)

In [None]:
# Linear regression on the encoded features plus their interaction terms.
lin_all_poly = make_pipeline(
    fe_all_feature_poly,
    LinearRegression(),
)
# Show the pipeline as the cell output.
lin_all_poly

In [None]:
# Same interaction-term features as above, fitted with a Ridge regressor
# (default settings) instead of plain linear regression.
Ridge_all_poly = make_pipeline(
    fe_all_feature_poly,
    Ridge(),
)
# Show the pipeline as the cell output.
Ridge_all_poly

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        The RMSLE as computed by ``sklearn.metrics.root_mean_squared_log_error``.
    """
    # The previous body called `mean_squared_log_error(..., squared=False)`:
    # that function is never imported in this notebook, and the `squared`
    # argument was deprecated in scikit-learn 1.4 and later removed. The
    # dedicated root_* metric, which the notebook already imports, replaces it.
    return root_mean_squared_log_error(y_true, y_pred)

# Wrap `rmsle` so it can be passed as a `scoring=` argument. It is an error
# metric (lower is better), hence greater_is_better=False; scikit-learn then
# reports the negated value as the score.
rmsle_scorer = make_scorer(
    rmsle,
    greater_is_better=False,
)

In [None]:
# Separate the target from the predictors, then hold out 20 % of the rows as a
# test set. The fixed random_state keeps the split reproducible.
y = penguins['Body Mass (g)']  # target variable
X = penguins.drop(columns=[y.name])  # every remaining column is a predictor

xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline: 5-fold cross-validation of the baseline pipeline on the training
# split only. Train scores are returned alongside the validation scores so the
# two can be compared. Because the scorer is created with
# greater_is_better=False, the reported scores are negated RMSLE values.
cross_val_inbaseline = cross_validate(
    lin_baseline,
    xtrain,
    ytrain,
    scoring=rmsle_scorer,
    cv=5,
    return_train_score=True,
    n_jobs=-1,  # use all available cores
    verbose=1,
)