In [1]:
import polars as pl
from catboost import CatBoostRegressor

In [2]:
# smiles_df = pl.DataFrame({f'smiles_{i}': [row[i] for row in df['smiles']] for i in range(len(df['smiles'][0]))})
# proteins_df = pl.DataFrame({f'proteins_{i}': [row[i] for row in df['proteins']] for i in range(len(df['proteins'][0]))})
# df_final = pl.concat([df,smiles_df, proteins_df],how="horizontal")

In [3]:
df_final = pl.read_parquet('../data/train/production_ready_train.parquet')

In [4]:
# split into train and test
X = df_final.drop("IC50 (nM)")
y = df_final["IC50 (nM)"]

In [5]:
def train_test_split_df(df, seed=42, test_size=0.2):
    return df.with_columns(
        pl.int_range(pl.len(), dtype=pl.UInt32)
        .shuffle(seed=seed)
        .gt(pl.len() * test_size)
        .alias("split")
    ).partition_by("split", include_key=False)


def train_test_split(X, y, seed=42, test_size=0.2):
    # Convert Series to DataFrame
    if isinstance(y, pl.Series):
        y = y.to_frame()

    (X_train, X_test) = train_test_split_df(X, seed=seed, test_size=test_size)
    (y_train, y_test) = train_test_split_df(y, seed=seed, test_size=test_size)

    # Convert back to Series if y was originally a Series
    if isinstance(y, pl.DataFrame) and y.shape[1] == 1:
        y_train = y_train.to_series()
        y_test = y_test.to_series()

    return (X_train, X_test, y_train, y_test)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
X_train = X_train.to_pandas()
X_test = X_test.to_pandas()
y_train = y_train.to_pandas()
y_test = y_test.to_pandas()

In [37]:
y_train

0          29000.0
1            190.0
2            970.0
3          11000.0
4            780.0
            ...   
1350231       90.0
1350232      118.0
1350233     1600.0
1350234     2600.0
1350235     1000.0
Name: IC50 (nM), Length: 1350236, dtype: float64

In [40]:
params_grid = {
    'depth': 10, 
    'n_estimators': 1000,  
    'l2_leaf_reg': 7,
    'eta': 0.1,
    'task_type': 'GPU',
}

In [41]:
# Initialize CatBoost regressor with GPU support and snapshot settings
baseline_model = CatBoostRegressor(**params_grid)

In [42]:
baseline_model.fit(X_train_scaled, y_train)



0:	learn: 225070816023.1753235	total: 554ms	remaining: 9m 13s
1:	learn: 221039945629.0165405	total: 1.11s	remaining: 9m 13s
2:	learn: 216433108566.6161194	total: 1.66s	remaining: 9m 11s
3:	learn: 212960083061.2776489	total: 2.2s	remaining: 9m 8s
4:	learn: 212577996485.0564270	total: 2.75s	remaining: 9m 6s
5:	learn: 212169673008.2386475	total: 3.3s	remaining: 9m 6s
6:	learn: 209539853230.1597595	total: 3.85s	remaining: 9m 5s
7:	learn: 207882765813.7933960	total: 4.39s	remaining: 9m 4s
8:	learn: 206451272285.7713318	total: 4.94s	remaining: 9m 4s
9:	learn: 204400660533.4352722	total: 5.49s	remaining: 9m 3s
10:	learn: 203019567380.0791626	total: 5.96s	remaining: 8m 55s
11:	learn: 200505407563.2885132	total: 6.08s	remaining: 8m 20s
12:	learn: 200167371677.3717651	total: 6.38s	remaining: 8m 4s
13:	learn: 196601565713.3500366	total: 6.93s	remaining: 8m 8s
14:	learn: 196275222753.8362427	total: 7.47s	remaining: 8m 10s
15:	learn: 194521270575.6303406	total: 8.03s	remaining: 8m 13s
16:	learn: 19

<catboost.core.CatBoostRegressor at 0x7effe95e3390>

In [43]:
preds = baseline_model.predict(X_test)

In [None]:
preds

In [44]:
from sklearn.metrics import mean_squared_error, r2_score
print("Mean Squared Error:", mean_squared_error(y_test, preds))
print("R2 Score:", r2_score(y_test, preds))

Mean Squared Error: 8.902090598134902e+22
R2 Score: -6.961219929157991e-06
