In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


warnings.filterwarnings("ignore")


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:

# Read the training split, the test split and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/train.csv",
        "./dataset/test.csv",
        "./dataset/sample_solution.csv",
    )
)

# Dimensions of each split as (rows, columns).
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# First rows of the training split.
print("\nTrain data info:")
print(train.head())

# Total number of missing cells in each split.
print("\nMissing values in train:", train.isna().sum().sum())
print("Missing values in test:", test.isna().sum().sum())


Train shape: (2000, 65)
Test shape: (500, 56)

Train data info:
   Component1_fraction  Component2_fraction  Component3_fraction  \
0                 0.21                 0.00                 0.42   
1                 0.02                 0.33                 0.19   
2                 0.08                 0.08                 0.18   
3                 0.25                 0.42                 0.00   
4                 0.26                 0.16                 0.08   

   Component4_fraction  Component5_fraction  Component1_Property1  \
0                 0.25                 0.12             -0.021782   
1                 0.46                 0.00             -0.224339   
2                 0.50                 0.16              0.457763   
3                 0.07                 0.26             -0.577734   
4                 0.50                 0.00              0.120415   

   Component2_Property1  Component3_Property1  Component4_Property1  \
0              1.981251              0.02

In [None]:
# Identify the raw per-component property columns (e.g. "Component3_Property7").
# An exact pattern match is used instead of a "_Property" substring test so that
# re-running this cell after the feature-engineering cells further down does not
# also pick up derived columns such as "Component3_Property7_rank" or
# "Component3_Property7_zscore".
property_cols = train.columns[
    train.columns.str.fullmatch(r"Component\d+_Property\d+")
].tolist()

def add_aggregate_features(df, cols=None):
    """Add row-wise summary statistics computed over the property columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in ``cols``. It is modified in place.
    cols : list of str, optional
        Columns to summarise. When omitted, the notebook-level
        ``property_cols`` list is used, which is what the original
        single-argument call did.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``prop_mean``, ``prop_std``, ``prop_min``,
        ``prop_max``, ``prop_range`` and ``prop_median`` added.
    """
    if cols is None:
        cols = property_cols

    # Select once; none of the columns written below are part of `cols`.
    values = df[cols]

    df["prop_mean"] = values.mean(axis=1)
    df["prop_std"] = values.std(axis=1)
    df["prop_min"] = values.min(axis=1)
    df["prop_max"] = values.max(axis=1)
    df["prop_range"] = df["prop_max"] - df["prop_min"]
    df["prop_median"] = values.median(axis=1)
    return df

# Run the aggregation on both splits so they end up with the same new columns.
train, test = (add_aggregate_features(split) for split in (train, test))


In [None]:
def add_cross_component_stats(df):
    """Summarise each property across the five components, row by row.

    For every property index 1-10 the columns ``Component1_Property{p}`` to
    ``Component5_Property{p}`` are reduced along each row into five new
    columns: ``Property{p}_mean``, ``_std``, ``_min``, ``_max`` and ``_range``
    (max minus min). The frame is modified in place and also returned.
    """
    for prop_idx in range(1, 11):
        component_values = df[[f"Component{i}_Property{prop_idx}" for i in range(1, 6)]]
        lowest = component_values.min(axis=1)
        highest = component_values.max(axis=1)

        # Columns are written in a fixed order: mean, std, min, max, range.
        df[f"Property{prop_idx}_mean"] = component_values.mean(axis=1)
        df[f"Property{prop_idx}_std"] = component_values.std(axis=1)
        df[f"Property{prop_idx}_min"] = lowest
        df[f"Property{prop_idx}_max"] = highest
        df[f"Property{prop_idx}_range"] = highest - lowest
    return df

# Same per-property statistics for the training and the test split.
train, test = add_cross_component_stats(train), add_cross_component_stats(test)


In [None]:
# Drop weighted columns left behind by an earlier execution of this cell; the
# pd.concat at the bottom would otherwise append a second set of
# Weighted_Property columns when the cell is re-run.
for stale_pattern in (
    "^Weighted_Property",
    "^C[1-5]_P[1-9]_weighted|^C[1-5]_P10_weighted",
):
    train = train.loc[:, ~train.columns.str.contains(stale_pattern)]
    test = test.loc[:, ~test.columns.str.contains(stale_pattern)]

# Fraction-weighted value of every (component, property) pair.
for i in range(1, 6):
    fraction_col = f"Component{i}_fraction"
    for j in range(1, 11):
        property_col = f"Component{i}_Property{j}"
        weighted_col = f"C{i}_P{j}_weighted"
        train[weighted_col] = train[fraction_col] * train[property_col]
        test[weighted_col] = test[fraction_col] * test[property_col]

# Sum the weighted pairs over the five components: one total per property.
weighted_train_props, weighted_test_props = {}, {}
for j in range(1, 11):
    cols = [f"C{i}_P{j}_weighted" for i in range(1, 6)]
    total_col = f"Weighted_Property{j}"
    weighted_train_props[total_col] = train[cols].sum(axis=1)
    weighted_test_props[total_col] = test[cols].sum(axis=1)

# Attach all ten totals in one concat per split.
train = pd.concat([train, pd.DataFrame(weighted_train_props)], axis=1)
test = pd.concat([test, pd.DataFrame(weighted_test_props)], axis=1)


In [None]:
# Per-property "diversity": largest minus smallest value among the five components.
for j in range(1, 11):
    cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
    for split in (train, test):
        component_values = split[cols]
        split[f"Property{j}_diversity"] = component_values.max(axis=1) - component_values.min(axis=1)


In [None]:
# Rank features
def add_rank_features(df):
    """Rank the five components against each other for every property.

    For each property index 1-10 the values of ``Component1_Property{j}`` to
    ``Component5_Property{j}`` are ranked within each row (ties share the
    lowest rank, ``method='min'``) and stored as ``<column>_rank``.
    The frame is modified in place and also returned.
    """
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        row_ranks = df[cols].rank(axis=1, method='min')
        # Write one rank column per source column, in component order.
        for col in cols:
            df[f"{col}_rank"] = row_ranks[col]
    return df

# Rank columns are added to both splits.
train, test = (add_rank_features(split) for split in (train, test))

# Entropy features
from scipy.stats import entropy

def add_entropy_features(df):
    """Add the Shannon entropy of each property's absolute values across components.

    For every property index 1-10 the absolute values of
    ``Component1_Property{j}`` to ``Component5_Property{j}`` are treated as
    unnormalised weights, and their entropy (natural log) is stored in
    ``Property{j}_entropy``. The frame is modified in place and also returned.

    ``scipy.stats.entropy`` normalises its input to sum to one, so no manual
    normalisation is needed, and its ``axis`` argument processes all rows in
    one call instead of one Python call per row. A row whose five values are
    all zero has no defined distribution and comes out as NaN.
    """
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        magnitudes = np.abs(df[cols].to_numpy(dtype=float))
        # One entropy per row; the result is assigned positionally.
        df[f"Property{j}_entropy"] = entropy(magnitudes, axis=1)
    return df

# Entropy columns are added to both splits.
train, test = add_entropy_features(train), add_entropy_features(test)


In [None]:
def add_spread_features(df):
    """Add range, standard deviation and coefficient of variation per property.

    For every property index 1-10 the five ``Component{i}_Property{j}`` columns
    are reduced along each row into ``Property{j}_range`` (max minus min),
    ``Property{j}_std`` and ``Property{j}_cv`` (std divided by the mean).
    The frame is modified in place and also returned.

    The mean is nudged away from zero by ``1e-6`` before dividing. The nudge
    follows the sign of the mean: adding a positive constant to a negative
    mean would move it toward zero and could produce a zero or near-zero
    denominator. For means >= 0 the result is the same as ``std / (mean + 1e-6)``.
    """
    eps = 1e-6
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        component_values = df[cols]
        row_mean = component_values.mean(axis=1)
        row_std = component_values.std(axis=1)

        df[f"Property{j}_range"] = component_values.max(axis=1) - component_values.min(axis=1)
        df[f"Property{j}_std"] = row_std

        # Push the denominator away from zero in the direction it already points.
        signed_eps = np.where(row_mean < 0, -eps, eps)
        df[f"Property{j}_cv"] = row_std / (row_mean + signed_eps)
    return df
# Spread columns are added to both splits.
train, test = (add_spread_features(split) for split in (train, test))

In [None]:
def add_zscore_features(df):
    """Standardise each component's property value within its row.

    For every property index 1-10, each ``Component{i}_Property{j}`` value has
    the row mean of the five components subtracted and is divided by the row
    standard deviation plus ``1e-6`` (so rows with five equal values do not
    divide by zero). Results are stored as ``<column>_zscore``.
    The frame is modified in place and also returned.
    """
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        component_values = df[cols]
        centre = component_values.mean(axis=1)
        scale = component_values.std(axis=1) + 1e-6

        # Standardise all five columns at once, then store them one by one.
        standardised = component_values.sub(centre, axis=0).div(scale, axis=0)
        for col in cols:
            df[f"{col}_zscore"] = standardised[col]
    return df
# Z-score columns are added to both splits.
train, test = add_zscore_features(train), add_zscore_features(test)

In [None]:
def add_linear_features(df):
    """Add per-property row statistics and each component's offset from them.

    For every property index 1-10 the five ``Component{i}_Property{j}`` columns
    are summarised row by row. Each component column gets four derived
    columns: ``_zscore``, ``_diff_mean``, ``_diff_min`` and ``_diff_max``.
    The group-level columns ``Property{j}_mean``, ``_std``, ``_min``, ``_max``,
    ``_range`` and ``_sum`` are written too, replacing any existing columns of
    the same name. The frame is modified in place and also returned.

    ``Property{j}_std`` holds the exact row standard deviation. The ``1e-6``
    guard is applied only to the z-score denominator, so the stored value
    matches what the other feature functions in this notebook write under
    the same column name.
    """
    eps = 1e-6  # keeps the z-score denominator non-zero when all five values are equal
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        component_values = df[cols]

        row_mean = component_values.mean(axis=1)
        row_std = component_values.std(axis=1)
        row_min = component_values.min(axis=1)
        row_max = component_values.max(axis=1)
        row_range = row_max - row_min
        row_sum = component_values.sum(axis=1)
        zscore_denominator = row_std + eps

        # Per-component offsets from the row statistics.
        for col in cols:
            df[f"{col}_zscore"] = (df[col] - row_mean) / zscore_denominator
            df[f"{col}_diff_mean"] = df[col] - row_mean
            df[f"{col}_diff_min"] = df[col] - row_min
            df[f"{col}_diff_max"] = df[col] - row_max

        # Group-level statistics for this property.
        df[f"Property{j}_mean"] = row_mean
        df[f"Property{j}_std"] = row_std
        df[f"Property{j}_min"] = row_min
        df[f"Property{j}_max"] = row_max
        df[f"Property{j}_range"] = row_range
        df[f"Property{j}_sum"] = row_sum

    return df

# Offset and group-statistic columns are added to both splits.
train, test = (add_linear_features(split) for split in (train, test))


In [None]:
# The ten blend targets; every other training column is used as a model input.
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

y = train.loc[:, target_cols]
X = train.drop(columns=target_cols)
# Select the test columns by the training feature names so both matrices share
# the same columns in the same order.
X_test = test.loc[:, X.columns]


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaling statistics are learned from
# the training matrix only and then re-used unchanged on the test matrix.
scale_all_columns = ("num", StandardScaler(), X.columns)
transformers = ColumnTransformer(
    transformers=[scale_all_columns],
    remainder="passthrough",
)
X = transformers.fit_transform(X)
X_test = transformers.transform(X_test)

In [None]:
!pip install catboost --quiet
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

cat_model = MultiOutputRegressor(CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    random_seed=42,
    verbose=100,
    l2_leaf_reg=3,
    task_type='CPU',
))
cat_model.fit(X, y)
cat_preds = cat_model.predict(X_test)

0:	learn: 0.9756385	total: 197ms	remaining: 6m 34s
100:	learn: 0.2704710	total: 11.9s	remaining: 3m 44s
200:	learn: 0.1359504	total: 23.5s	remaining: 3m 30s
300:	learn: 0.0844065	total: 34.9s	remaining: 3m 17s
400:	learn: 0.0606714	total: 46.4s	remaining: 3m 5s
500:	learn: 0.0490316	total: 58s	remaining: 2m 53s
600:	learn: 0.0416095	total: 1m 9s	remaining: 2m 40s
700:	learn: 0.0358545	total: 1m 22s	remaining: 2m 32s
800:	learn: 0.0313410	total: 1m 32s	remaining: 2m 18s
900:	learn: 0.0275133	total: 1m 44s	remaining: 2m 7s
1000:	learn: 0.0243619	total: 1m 55s	remaining: 1m 55s
1100:	learn: 0.0213523	total: 2m 7s	remaining: 1m 44s
1200:	learn: 0.0189954	total: 2m 18s	remaining: 1m 32s
1300:	learn: 0.0166999	total: 2m 30s	remaining: 1m 20s
1400:	learn: 0.0147914	total: 2m 42s	remaining: 1m 9s
1500:	learn: 0.0130562	total: 2m 53s	remaining: 57.6s
1600:	learn: 0.0115649	total: 3m 3s	remaining: 45.8s
1700:	learn: 0.0102636	total: 3m 19s	remaining: 35.1s
1800:	learn: 0.0090736	total: 3m 33s	re

In [None]:
# Final prediction: the CatBoost output with weight 1.0.
final_preds = 1.0*cat_preds

# One row per test sample: the ID first, then the ten predicted targets.
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"].values)
submission.to_csv("submission.csv", index=False)

# The browser download helper only exists on Google Colab. Anywhere else the
# import fails, so it is guarded: the CSV has already been written and the
# notebook should still run to completion.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; submission.csv was written to the working directory.")
else:
    files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>