In [45]:
import polars as pl
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression

In [24]:
input_matrix = pl.read_excel(
    source="dane_leki.xlsx"
)

In [25]:
input_matrix.head()

__UNNAMED__0,Nazwa,logK HSA,logKCTAB,CATS3D_00_DD,CATS3D_09_AL,CATS3D_00_AA,Zbiór
i64,str,f64,f64,i64,i64,i64,str
1,"""acetaminophen""",-0.79,-0.63,2,0,2,"""t"""
2,"""acetylsalicylic acid""",-0.23,1.22,1,0,4,"""t"""
3,"""bromazepam""",0.38,0.57,1,0,3,"""t"""
4,"""carbamazepine""",0.69,0.68,0,0,3,"""t"""
5,"""chlorpromazine""",1.18,1.5,0,0,2,"""t"""


In [26]:
Y_obs = input_matrix.select(
    pl.nth(2)
)

In [27]:
Y_obs.head()

logK HSA
f64
-0.79
-0.23
0.38
0.69
1.18


In [28]:
descriptors = input_matrix.select(
    pl.nth([3,4,5,6])
)

In [29]:
descriptors.head()

logKCTAB,CATS3D_00_DD,CATS3D_09_AL,CATS3D_00_AA
f64,i64,i64,i64
-0.63,2,0,2
1.22,1,0,4
0.57,1,0,3
0.68,0,0,3
1.5,0,0,2


In [30]:
pca_model_first = PCA(n_components=4)

In [31]:
pca_model_first.fit(descriptors)

In [32]:
pca_model_first.explained_variance_

array([4.67581967, 3.27064572, 0.96644472, 0.27022295])

In [33]:
pca_model_first.components_

array([[ 0.05905699,  0.02081031,  0.64914805,  0.75808048],
       [ 0.30296835, -0.55414496,  0.58475165, -0.50911595],
       [-0.22498561,  0.74897495,  0.4718871 , -0.40711247],
       [ 0.92417743,  0.36266627, -0.11829997,  0.01934869]])

In [34]:
pca_model = PCA(n_components=2)

In [35]:
PC = pl.DataFrame(
    pca_model.fit_transform(descriptors),
    schema=["PC1", "PC2"])

In [36]:
PC

PC1,PC2
f64,f64
-2.300697,-0.649955
-0.696091,-0.553551
-1.492558,-0.241364
-1.506872,0.346107
-2.216526,1.103657
…,…
0.98374,3.79413
2.682795,1.400557
1.407812,-0.971157
0.557913,0.388726


In [37]:
pca_model.explained_variance_ratio_

array([0.50917477, 0.35615794])

In [38]:
X_training, X_validation, Y_training, Y_validation = train_test_split(
    PC,
    Y_obs,
    test_size=0.33,
    random_state=42
)

In [39]:
X_training

PC1,PC2
f64,f64
1.407812,-0.971157
-0.696091,-0.553551
-2.216526,1.103657
-3.006217,0.128802
-1.492558,-0.241364
…,…
2.739027,0.697231
-0.251225,2.288619
0.711546,-2.789428
0.783145,-3.083021


In [40]:
Y_training

logK HSA
f64
0.08
-0.23
1.18
-0.42
0.38
…
0.06
2.05
-1.25
-1.25


In [41]:
KFold_model = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0
)

In [44]:
validation_sets = []
for training_set, validation_set in KFold_model.split(X_training, Y_training):
    validation_sets.append(validation_set)
    print(f"Training: {training_set}\nValidation: {validation_set}")

Training: [ 0  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17]
Validation: [1 6]
Training: [ 0  1  2  3  4  5  6  7  9 11 12 13 14 15 16 17]
Validation: [ 8 10]
Training: [ 0  1  2  3  5  6  7  8  9 10 11 12 13 15 16 17]
Validation: [ 4 14]
Training: [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 17]
Validation: [ 2 16]
Training: [ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16]
Validation: [ 9 17]
Training: [ 0  1  2  3  4  5  6  8  9 10 11 12 14 15 16 17]
Validation: [ 7 13]
Training: [ 0  1  2  4  5  6  7  8  9 10 12 13 14 15 16 17]
Validation: [ 3 11]
Training: [ 1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17]
Validation: [0 5]
Training: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 17]
Validation: [15]
Training: [ 0  1  2  3  4  5  6  7  8  9 10 11 13 14 15 16 17]
Validation: [12]


In [46]:
descriptors

logKCTAB,CATS3D_00_DD,CATS3D_09_AL,CATS3D_00_AA
f64,i64,i64,i64
-0.63,2,0,2
1.22,1,0,4
0.57,1,0,3
0.68,0,0,3
1.5,0,0,2
…,…,…,…
0.73,0,5,2
1.63,1,4,5
1.32,3,2,5
0.47,1,2,4


In [None]:
for validation_set in validation_sets:
    x=descriptors.with_row_index().filter(~pl.col("index").is_in(validation_set)).drop(pl.col("index"))
    y=Y_obs.with_row_index().filter(~pl.col("index").is_in(validation_set)).drop(pl.col("index"))
    PCR_model = LinearRegression().fit(
        X=x,
        y=y
    )
    

0.8805905298394814
0.8681736783867922
0.8530049196738184
0.8697977279954625
0.8827892345735276
0.8956960364692089
0.8694536596810742
0.8821163774810632
0.8798020170260845
0.865975561626449
