In [4]:
from category_encoders import OrdinalEncoder
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import BayesianRidge, ElasticNet, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler, OneHotEncoder, FunctionTransformer, PowerTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder as SklearnOrdinalEncoder
from sklearn.svm import SVR

def label_encode(x):
    """Encode the values of a single column as integer codes.

    A fresh ``LabelEncoder`` is fitted on every call, so the codes are derived
    only from the values present in *this* input.

    NOTE(review): nothing is remembered between calls, so two calls on inputs
    that contain different sets of values can assign different codes to the
    same value -- confirm that is acceptable wherever this is applied to
    separately transformed train and test data.

    Args:
        x: Array-like holding the values to encode. It is flattened to one
            dimension before encoding.

    Returns:
        Array of codes with shape ``(n_values, 1)``.
    """
    # Go through np.asarray so DataFrame / Series / list input is accepted
    # too; calling ``.ravel()`` directly only works on ndarrays.
    flat = np.asarray(x).ravel()
    return LabelEncoder().fit_transform(flat).reshape(-1, 1)

def transform_x8(x, reference_year=2024):
    """Return the distance of *x* from a reference value.

    Presumably X8 holds a calendar year and the result is an age in years --
    TODO confirm against the dataset description.

    Args:
        x: Scalar or array-like that supports subtraction from a number.
        reference_year: Value that *x* is subtracted from. Defaults to 2024,
            the constant that used to be hard-coded here, so existing
            single-argument callers get the same result as before.

    Returns:
        ``reference_year - x``, with whatever type/shape the subtraction
        yields for *x*.
    """
    return reference_year - x

# Read the training table; the path is relative to the working directory.
data = pd.read_csv("train.csv")

# Separate the target (Y) from the features; the listed columns are left out of X.
y = data["Y"]
X = data.drop(columns=["Y", "X1", "X2", "X3", "X4", "X5", "X9", "X10"])

# Derive a cluster id from X6 alone (7 clusters, fixed seed).
# NOTE(review): the clustering is fitted on every row before the train/test
# split below -- confirm that is intended.
kmeans = KMeans(n_clusters=7, random_state=42)
X["X6_cluster"] = kmeans.fit_predict(X[["X6"]])

# Bucket X6 into equal-width intervals: 24 evenly spaced edges between its
# minimum and maximum give 23 bins, stored as bin indices rather than labels.
x6_values = X["X6"]
x6_bin_edges = np.linspace(x6_values.min(), x6_values.max(), 24)
X["X6_cat"] = pd.cut(x6_values, bins=x6_bin_edges, labels=False, include_lowest=True)

# Hold out 20% of the rows, stratified on the X6 buckets. The original author
# chose this because X6 correlates very strongly with the label y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X["X6_cat"],
)


def _log1p_floored(x):
    """Return ``log1p`` of *x* after raising every value below 0.1 up to 0.1."""
    return np.log1p(np.maximum(x, 0.1))


# X6: floor at 0.1, log-transform, then standardise.
# A named function is used instead of a lambda because a FunctionTransformer
# wrapping a lambda cannot be pickled, which would block saving the pipeline.
x6_transformer = Pipeline(steps=[
    ("log_transform", FunctionTransformer(_log1p_floored)),
    ("scaler", StandardScaler())
])

# X7: fill missing values with the most frequent one, then map each category
# to an integer code.
# The encoder must learn the category -> code mapping once, during fit, and
# reuse it at transform time. Refitting a LabelEncoder inside a stateless
# FunctionTransformer on every call can give the same category different
# codes in train and test data. Categories never seen during fit map to -1.
# The step keeps the name 'label' so existing parameter paths still resolve.
x7_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('label', SklearnOrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# X8: apply transform_x8, then standardise.
x8_transformer = Pipeline(steps=[
    ('custom', FunctionTransformer(transform_x8)),
    ('standard', StandardScaler())
])

# X11: one-hot encode; categories unseen during fit become an all-zero row.
x11_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column to its transformer. Only the four columns listed here are
# forwarded; with the default ``remainder`` setting every other column of the
# input frame is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("X6", x6_transformer, ["X6"]),
        ("X7", x7_transformer, ["X7"]),
        ("X8", x8_transformer, ["X8"]),
        ("X11", x11_transformer, ["X11"])
    ]
)


# RBF-kernel support vector regressor.
model = SVR(kernel="rbf", C=1.0, epsilon=0.1, gamma="scale")

# Chain preprocessing and the regressor so both are fitted together on
# whatever data the pipeline is given.
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# ---- 4. Cross-Validation ----
# 5-fold shuffled CV on the training split. The scorer yields negated MAE,
# so the sign is flipped before reporting.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=kf,
    scoring="neg_mean_absolute_error",
)
cv_mae = -np.mean(cv_scores)
print(f"Cross-Validation MAE: {cv_mae:.4f}")

# ---- 5. Train the Model on Full Training Data ----
pipeline.fit(X_train, y_train)

# ---- 6. Evaluate on Test Set ----
# Predict the held-out rows and report their mean absolute error.
y_pred = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.4f}")

Cross-Validation MAE: 0.4010
Test MAE: 0.3876
