In [1]:
import polars as pl
from catboost import CatBoostRegressor

In [2]:
# smiles_df = pl.DataFrame({f'smiles_{i}': [row[i] for row in df['smiles']] for i in range(len(df['smiles'][0]))})
# proteins_df = pl.DataFrame({f'proteins_{i}': [row[i] for row in df['proteins']] for i in range(len(df['proteins'][0]))})
# df_final = pl.concat([df,smiles_df, proteins_df],how="horizontal")

In [3]:
df_final = pl.read_parquet('../data/train/production_ready_train.parquet')

In [4]:
import numpy as np

df_final = df_final.with_columns(
    (np.log1p(pl.col('IC50 (nM)'))).alias('IC50 (nM)')
)

In [5]:
# split into train and test
X = df_final.drop("IC50 (nM)")
y = df_final["IC50 (nM)"]

In [6]:
def train_test_split_df(df, seed=42, test_size=0.2):
    return df.with_columns(
        pl.int_range(pl.len(), dtype=pl.UInt32)
        .shuffle(seed=seed)
        .gt(pl.len() * test_size)
        .alias("split")
    ).partition_by("split", include_key=False)


def train_test_split(X, y, seed=42, test_size=0.2):
    # Convert Series to DataFrame
    if isinstance(y, pl.Series):
        y = y.to_frame()

    (X_train, X_test) = train_test_split_df(X, seed=seed, test_size=test_size)
    (y_train, y_test) = train_test_split_df(y, seed=seed, test_size=test_size)

    # Convert back to Series if y was originally a Series
    if isinstance(y, pl.DataFrame) and y.shape[1] == 1:
        y_train = y_train.to_series()
        y_test = y_test.to_series()

    return (X_train, X_test, y_train, y_test)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
X_train = X_train.to_pandas()
X_test = X_test.to_pandas()
y_train = y_train.to_pandas()
y_test = y_test.to_pandas()

In [10]:
y_train

0          10.275086
1           5.252273
2           6.878326
3           9.305741
4           6.660575
             ...    
1350231     4.510860
1350232     4.779123
1350233     7.378384
1350234     7.863651
1350235     6.908755
Name: IC50 (nM), Length: 1350236, dtype: float64

In [11]:
params_grid = {
    'depth': 10, 
    'n_estimators': 10000,  
    'l2_leaf_reg': 7,
    'eta': 0.01,
    'task_type': 'GPU',
}

In [12]:
# Initialize CatBoost regressor with GPU support and snapshot settings
baseline_model = CatBoostRegressor(**params_grid)

In [13]:
baseline_model.fit(X_train_scaled, y_train)

0:	learn: 3.3420781	total: 209ms	remaining: 34m 46s
1:	learn: 3.3358410	total: 367ms	remaining: 30m 32s
2:	learn: 3.3296980	total: 525ms	remaining: 29m 8s
3:	learn: 3.3237640	total: 679ms	remaining: 28m 17s
4:	learn: 3.3177009	total: 838ms	remaining: 27m 54s
5:	learn: 3.3116999	total: 995ms	remaining: 27m 38s
6:	learn: 3.3060008	total: 1.15s	remaining: 27m 23s
7:	learn: 3.3002354	total: 1.31s	remaining: 27m 13s
8:	learn: 3.2946487	total: 1.47s	remaining: 27m 10s
9:	learn: 3.2889610	total: 1.62s	remaining: 27m 2s
10:	learn: 3.2834891	total: 1.78s	remaining: 26m 58s
11:	learn: 3.2781655	total: 1.94s	remaining: 26m 53s
12:	learn: 3.2729572	total: 2.1s	remaining: 26m 50s
13:	learn: 3.2677755	total: 2.25s	remaining: 26m 47s
14:	learn: 3.2625278	total: 2.41s	remaining: 26m 43s
15:	learn: 3.2572624	total: 2.57s	remaining: 26m 41s
16:	learn: 3.2520431	total: 2.73s	remaining: 26m 42s
17:	learn: 3.2470262	total: 2.88s	remaining: 26m 39s
18:	learn: 3.2421165	total: 3.04s	remaining: 26m 38s
19:	le

<catboost.core.CatBoostRegressor at 0x7f47d730c390>

In [20]:
preds = baseline_model.predict(X_test_scaled)

In [21]:
preds

array([4.93494047, 8.76894214, 6.76856622, ..., 3.84048481, 4.06671251,
       5.07833208])

In [23]:
preds_train = baseline_model.predict(X_train_scaled)

In [25]:
from sklearn.metrics import mean_squared_error, r2_score
print("Train metrics")
print("Mean Squared Error:", mean_squared_error(y_train, preds_train ))
print("R2 Score:", r2_score(y_train, preds_train ))

Train metrics
Mean Squared Error: 3.842598148688721
R2 Score: 0.6572766904635159


In [26]:
from sklearn.metrics import mean_squared_error, r2_score
print("Test metrics")
print("Mean Squared Error:", mean_squared_error(y_test, preds))
print("R2 Score:", r2_score(y_test, preds))

Test metrics
Mean Squared Error: 4.457687002780962
R2 Score: 0.6014266600794375


### 1. **Mean Squared Error (MSE): 4.4577**
   - **What it is**: MSE measures the average squared difference between the predicted and actual values. It’s calculated by squaring the residuals (errors) and averaging them.
   - **Interpretation**: An MSE of 4.4577 means that, on average, the squared error (or squared distance) between predictions and actual values is around 4.4577. A lower MSE indicates better model performance, as it suggests that predictions are closer to the actual values. However, MSE can be difficult to interpret directly since it is sensitive to outliers and squared, which makes it scale with the units of the output variable squared.

### 2. **R² Score: 0.6014**
   - **What it is**: The R² score (or coefficient of determination) measures the proportion of the variance in the target variable that is predictable from the independent variables. It ranges from 0 to 1, where 1 indicates a perfect fit and 0 indicates that the model fails to explain any variance.
   - **Interpretation**: An R² score of 0.6014 means that the model explains about 60.14% of the variability in the data. In other words, 60.14% of the differences between the actual values and the mean are explained by your model, while the remaining 39.86% is due to unexplained variance (potentially due to factors not included in the model or inherent noise).

### **Overall Assessment**
   - A **high MSE** and **R² below 1** indicate that the model has room for improvement. While it captures some of the patterns in the data (60% of variance explained), there is still a significant portion of the variance unaccounted for, suggesting that the model might be too simplistic or that additional features could help improve performance.