<a href="https://colab.research.google.com/github/LorenzoTarricone/Compressed-Sensing-to-Pooling-and-Deconvolution-of-Pd-Catalyzed-Cross-Coupling-Reactions-Experiments/blob/main/Python%20Notebooks/Embeddings_to_D_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating matrix D from the embeddings

The detailed process is given for the Buchwald-Hartwig embeddings, but it applies in the same way for the Suzuki-Miyaura embedding below

## Buchwald-Hartwig
First we load the dataset with the embeddings

In [None]:
import numpy as np
import pandas as pd

# Path of the embedding table (catalyst name plus two embedding coordinates).
neutral_embedding_csv = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)/Copy of embedding_df_our_table.csv"
embedding_neutral = pd.read_csv(neutral_embedding_csv)

# Give the three columns the names the rest of the notebook relies on.
embedding_neutral.columns = ["catalyst_name", "dim_0", "dim_1"]

In [None]:
# Preview the loaded table: one row per catalyst with its two embedding coordinates.
embedding_neutral

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,520,88
1,RuPhos,628,90
2,VPhos,753,90
3,IPENT Cl,1305,117
4,DiMel-HeptCl,1423,127
5,Piperidinyl-amino-pincer,402,186
6,cPhos,586,193
7,SPhos,694,196
8,Jackie Phos,836,191
9,GPhos,974,208


**IMPORTANT**: the catalysts must appear here in the same order that the decoding algorithm will use.

Therefore here we use that ordered list of names to subset and reorder the embeddings from the df

In [None]:
# Catalyst names in the exact order expected by the decoding algorithm.
# The order of this list defines the row/column order of every matrix built below.
names = [
    'meCgPPh', 'TrixiePhos', 'P(oTol)_3', 'RuPhos', 'RockPhos',
    'VPhos', 'PPh_3', 'tBuBrettPhos', 'CyJohnPhos', 'CPhos',
    'DPEPhos', 'AlPhos', 'Triisobutyl-Phosphatrane', 'SPhos', 'XantPhos',
    'Me3(OMe)-tBuXPhos', 'Amphos', 'Jackie CPhos', 'dppp', 'Mordal-Phos',
    'Jackie Phos', 'dppf', 'P(tBu)_3', 'GPhos', 'BINAP',
    'BippyPhos', 'cataCXlum A', 'dppdtbpf', 'QPhos', 'cBridP',
    'dtbpf', 'JosiPhosSL J009-1', 'PCy_3', 'XPhos', 'tBuXPhos',
]

In [None]:
# Keep only the catalysts listed in `names`; rows with any other name are dropped.
is_selected_neutral = embedding_neutral['catalyst_name'].isin(names)
filtered_embedding_neutral = embedding_neutral.loc[is_selected_neutral].copy()

# An ordered categorical makes sort_values follow the order of `names`
# instead of alphabetical order.
ordered_names_neutral = pd.Categorical(
    filtered_embedding_neutral['catalyst_name'],
    categories=names,
    ordered=True,
)
filtered_embedding_neutral['catalyst_name'] = ordered_names_neutral
filtered_embedding_neutral = filtered_embedding_neutral.sort_values('catalyst_name')
filtered_embedding_neutral = filtered_embedding_neutral.reset_index(drop=True)
display(filtered_embedding_neutral)

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,520,88
1,TrixiePhos,537,319
2,P(oTol)_3,68,373
3,RuPhos,628,90
4,RockPhos,721,404
5,VPhos,753,90
6,PPh_3,142,274
7,tBuBrettPhos,844,400
8,CyJohnPhos,652,311
9,CPhos,580,196


In [None]:
# Sanity check: every name requested in `names` must exist in the embedding table,
# otherwise the filtered table (and every matrix derived from it) loses that catalyst.
missing_names = set(names) - set(embedding_neutral["catalyst_name"])
# The message names the dataframe that is actually checked (embedding_neutral).
print("Names in 'names' list but not in 'embedding_neutral[\"catalyst_name\"]':")
print(missing_names)

Names in 'names' list but not in 'embedding_bh["catalyst_name"]':
set()


Calculate the distance matrix between all the embeddings (here using L2 distance)

In [None]:
from scipy.spatial.distance import cdist

# Pairwise distances between the embedding points (cdist's default metric is Euclidean).
coords_neutral = filtered_embedding_neutral[['dim_0', 'dim_1']]
distance_matrix_neutral = cdist(coords_neutral, coords_neutral)

# Label both axes with the catalyst names, in the order fixed by `names`.
labels_neutral = filtered_embedding_neutral['catalyst_name']
distance_matrix_df_neutral = pd.DataFrame(
    distance_matrix_neutral,
    index=labels_neutral,
    columns=labels_neutral,
)
display(distance_matrix_df_neutral)

catalyst_name,meCgPPh,TrixiePhos,P(oTol)_3,RuPhos,RockPhos,VPhos,PPh_3,tBuBrettPhos,CyJohnPhos,CPhos,...,BippyPhos,cataCXlum A,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,PCy_3,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,231.624696,534.349137,108.018517,374.509012,233.008584,421.283752,449.799956,259.138959,123.547562,...,575.003478,885.416286,516.860716,548.760421,891.650156,450.124427,537.023277,728.434623,339.560009,434.234959
TrixiePhos,231.624696,0.0,472.098507,246.418343,202.684484,314.79676,397.555028,317.505905,115.277925,130.299655,...,364.671085,771.621669,322.342985,320.705472,749.112809,231.624696,306.235204,555.0,122.918672,202.805325
P(oTol)_3,534.349137,472.098507,0.0,627.446412,653.73542,741.157203,123.600162,776.469574,587.281874,541.731483,...,723.662905,1216.99014,333.208643,618.650952,1171.693219,410.079261,521.874506,954.677956,548.534411,517.32485
RuPhos,108.018517,246.418343,627.446412,0.0,327.482824,125.0,519.665277,377.83065,222.299348,116.361506,...,538.988868,794.693652,562.694411,535.023364,810.355478,478.004184,544.824742,663.666332,324.260698,434.736702
RockPhos,374.509012,202.684484,653.73542,327.482824,0.0,315.626361,593.414695,123.065023,115.801554,251.286689,...,212.983567,570.884402,419.659386,237.876018,546.648882,303.329853,295.392959,359.310451,106.470653,195.20502
VPhos,233.008584,314.79676,741.157203,125.0,315.626361,0.0,638.104223,323.080485,242.985596,202.891597,...,525.008571,697.433151,636.592491,548.292805,726.843174,538.15704,581.557392,606.488252,352.164734,469.703098
PPh_3,421.283752,397.555028,123.600162,519.665277,593.414695,638.104223,0.0,713.218059,511.340395,444.890998,...,697.097554,1163.928692,354.746388,603.557785,1129.973451,398.722961,519.509384,920.220082,493.283894,488.46699
tBuBrettPhos,449.799956,317.505905,776.469574,377.83065,123.065023,323.080485,713.218059,0.0,211.624668,333.634531,...,234.650804,454.948349,536.008395,308.457452,444.072066,419.318495,390.366494,286.832704,229.427548,304.630924
CyJohnPhos,259.138959,115.277925,587.281874,222.299348,115.801554,242.985596,511.340395,211.624668,0.0,135.67977,...,319.405698,666.090835,409.213881,314.574316,653.073503,302.120837,338.710791,473.837525,109.44405,226.770809
CPhos,123.547562,130.299655,541.731483,116.361506,251.286689,202.891597,444.890998,333.634531,135.67977,0.0,...,452.173639,779.616572,447.393563,432.261495,777.606584,361.734709,432.511272,607.237186,220.791757,324.394821


Transform the distances into similarity scores: if $d_{i,j}$ is the distance between catalysts $i$ and $j$, the similarity is defined as $s_{i,j} = \lambda e^{\frac{-d_{i,j}^2}{2\sigma^2}}$

$\lambda$ and $\sigma$ are chosen arbitrarily so that the entries of the matrix show some variability and have a magnitude comparable to the scores that the algorithm will give for every catalyst. Choosing them requires a bit of trial and error.

In [None]:
# Scale (lambda) and width (sigma) of the Gaussian similarity kernel
lambdaa = 5.0
sigma = 300

# s_ij = lambda * exp(-d_ij^2 / (2 * sigma^2)).
# Computed on a fresh NumPy array so the diagonal can be edited in place:
# writing through DataFrame.values only works while it is a writable view,
# which is not the case when pandas Copy-on-Write is enabled.
similarity_neutral = lambdaa * np.exp(-distance_matrix_df_neutral.to_numpy() ** 2 / (2 * sigma**2))

# Put zeroes on the main diagonal (the kernel would otherwise give lambda there)
np.fill_diagonal(similarity_neutral, 0)

# Re-attach the catalyst labels of the distance matrix.
D_prime_bh = pd.DataFrame(
    similarity_neutral,
    index=distance_matrix_df_neutral.index,
    columns=distance_matrix_df_neutral.columns,
)

display(D_prime_bh)

catalyst_name,meCgPPh,TrixiePhos,P(oTol)_3,RuPhos,RockPhos,VPhos,PPh_3,tBuBrettPhos,CyJohnPhos,CPhos,...,BippyPhos,cataCXlum A,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,PCy_3,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,3.711301,1.023436,4.68617,2.293852,3.698067,1.86533,1.624886,3.443077,4.59348,...,0.79661,0.06419,1.133487,0.938426,0.060359,1.622253,1.007275,0.262256,2.634983,1.753971
TrixiePhos,3.711301,0.0,1.44952,3.568315,3.979715,2.883195,2.077951,2.855887,4.644158,4.549947,...,2.388421,0.182987,2.807201,2.823671,0.221314,3.711301,2.969632,0.903199,4.597437,3.978631
P(oTol)_3,1.023436,1.44952,0.0,0.561175,0.465408,0.236382,4.593148,0.175514,0.735891,0.979251,...,0.272548,0.001335,2.698283,0.596406,0.002436,1.964402,1.101162,0.03162,0.93972,1.130469
RuPhos,4.68617,3.568315,0.561175,0.0,2.755592,4.584277,1.115329,2.262225,3.799612,4.637687,...,0.995509,0.149702,0.861061,1.019345,0.13019,1.405032,0.961136,0.432781,2.787929,1.749728
RockPhos,2.293852,3.979715,0.465408,2.755592,0.0,2.87483,0.706875,4.596518,4.641037,3.520603,...,3.886181,0.817775,1.87954,3.651276,0.950563,2.998994,3.079221,2.440477,4.694822,4.04606
VPhos,3.698067,2.883195,0.236382,4.584277,2.87483,0.0,0.520661,2.799787,3.601775,3.977858,...,1.081272,0.335257,0.526265,0.941105,0.265652,1.000477,0.763761,0.647879,2.510389,1.467801
PPh_3,1.86533,2.077951,4.593148,1.115329,0.706875,0.520661,0.0,0.296247,1.169799,1.665021,...,0.33613,0.002694,2.485066,0.660769,0.004152,2.067243,1.116333,0.045273,1.293834,1.328277
tBuBrettPhos,1.624886,2.855887,0.175514,2.262225,4.596518,2.799787,0.296247,0.0,3.898658,2.694029,...,3.682322,1.583377,1.013388,2.947181,1.671769,1.882528,2.144377,3.165673,3.732246,2.985844
CyJohnPhos,3.443077,4.644158,0.735891,3.799612,4.641037,3.601775,1.169799,3.898658,0.0,4.513919,...,2.836753,0.425099,1.972155,2.885438,0.46765,3.011214,2.643429,1.436333,4.678107,3.757461
CPhos,4.59348,4.549947,0.979251,4.637687,3.520603,3.977858,1.665021,2.694029,4.513919,0.0,...,1.605674,0.170804,1.644493,1.770713,0.1738,2.416892,1.76859,0.644616,3.813739,2.786582


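Create the matrix $D$ that is passed to the lasso algorithm. Each row corresponds to one catalyst pair $(i, j)$ with $i < j$ and contains $-s_{i,j}$ in column $i$ and $+s_{i,j}$ in column $j$ (all other entries are zero).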

In [None]:


# Build D_bh from the similarity matrix D_prime_bh computed in this section.
# Without this, D_bh is undefined on a fresh run (or silently reuses a value
# left in the kernel by another cell).
# One row per unordered catalyst pair (i, j) with i < j:
# -s_ij in column i, +s_ij in column j, zeros elsewhere.
N = D_prime_bh.shape[0]
num_rows = N * (N - 1) // 2
D_bh = np.zeros((num_rows, N))

row_index = 0
for i in range(N):
    for j in range(i + 1, N):
        d_ij = D_prime_bh.iloc[i, j]
        D_bh[row_index, i] = -d_ij
        D_bh[row_index, j] = d_ij
        row_index += 1

display(D_bh)

array([[-1.94820001,  1.94820001,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.59820959,  0.        ,  1.59820959, ...,  0.        ,
         0.        ,  0.        ],
       [-2.75338564,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -1.33342755,
         1.33342755,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -2.3072357 ,
         0.        ,  2.3072357 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -4.54268584,  4.54268584]])

In [None]:
# Persist the matrix as a NumPy .npy file.
base_path = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)"
d_bh_output_path = f"{base_path}/D_bh.npy"
# NOTE(review): a later cell of this notebook saves to the same D_bh.npy path,
# so this file is overwritten when the whole notebook runs - confirm the intended file name.
np.save(d_bh_output_path, D_bh)


## Suzuki-Miyaura

In [None]:
import numpy as np
import pandas as pd

# Path of the Suzuki-Miyaura embedding table.
sm_embedding_csv = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)/suzuki_miyaura_binary_precomputed_epoch_10 (1).csv"
embedding_sm = pd.read_csv(sm_embedding_csv)

In [None]:
# Preview the loaded Suzuki-Miyaura table: one row per catalyst with its two embedding coordinates.
embedding_sm

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,-0.600561,2.245974
1,RuPhos,-0.33836,1.235898
2,VPhos,0.004458,1.735893
3,IPENT Cl,1.332169,1.119553
4,DiMeIHept Cl,1.680322,1.053596
5,Piperidinyl-amino-pincer,-1.183701,1.628299
6,cPhos,-0.456465,1.139444
7,SPhos,-0.000301,0.64898
8,Jackie Phos,0.23354,1.151026
9,GPhos,0.614424,1.052583


**IMPORTANT**: the catalysts must appear here in the same order that the decoding algorithm will use.

Therefore here we use that ordered list of names to subset and reorder the embeddings from the df

In [None]:
# Catalyst names for Suzuki-Miyaura, in the exact order expected by the decoding algorithm.
# The order of this list defines the row/column order of the matrices built below.
#
# Four entries were previously spelled 'PPh3', 'P(tBu)3', 'cataCXlumA' and 'P(Cy3)'.
# The missing-names check reported exactly those four as absent from the table, so they
# were silently dropped and the resulting matrix had 31 instead of 35 catalysts.
# They now use the spellings found in the other embedding tables of this notebook.
# NOTE(review): re-run the missing-names check to confirm these match the SM CSV.
names = [
    'meCgPPh', 'TrixiePhos', 'IPENT Cl', 'RuPhos', 'RockPhos',
    'VPhos', 'PPh_3', 'tBuBrettPhos', 'SIPr', 'CPhos',
    'DPEPhos', 'AlPhos', 'Triisobutyl-Phosphatrane', 'SPhos', 'XantPhos',
    'Me3(OMe)-tBuXPhos', 'Amphos', 'Jackie CPhos', 'dppp', 'Mordal-Phos',
    'Jackie Phos', 'dppf', 'P(tBu)_3', 'GPhos', 'BINAP',
    'BippyPhos', 'cataCXlum A', 'dppdtbpf', 'QPhos', 'cBridP',
    'dtbpf', 'JosiPhosSL J009-1', 'PCy_3', 'XPhos', 'tBuXPhos',
]

In [None]:
# Keep only the catalysts listed in `names`; a name with no matching row simply yields no row.
is_selected_sm = embedding_sm['catalyst_name'].isin(names)
filtered_embedding_sm = embedding_sm.loc[is_selected_sm].copy()

# An ordered categorical makes sort_values follow the order of `names`
# instead of alphabetical order.
ordered_names_sm = pd.Categorical(
    filtered_embedding_sm['catalyst_name'],
    categories=names,
    ordered=True,
)
filtered_embedding_sm['catalyst_name'] = ordered_names_sm
filtered_embedding_sm = filtered_embedding_sm.sort_values('catalyst_name')
filtered_embedding_sm = filtered_embedding_sm.reset_index(drop=True)
display(filtered_embedding_sm)

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,-0.600561,2.245974
1,TrixiePhos,-0.83291,0.847774
2,IPENT Cl,1.332169,1.119553
3,RuPhos,-0.33836,1.235898
4,RockPhos,0.207945,0.320605
5,VPhos,0.004458,1.735893
6,tBuBrettPhos,0.113176,0.419853
7,SIPr,1.352926,0.660473
8,CPhos,-0.52379,0.624841
9,DPEPhos,-1.306655,0.594654


In [None]:
# Report every requested catalyst that has no row in the Suzuki-Miyaura table.
requested_sm = set(names)
available_sm = set(embedding_sm["catalyst_name"])
missing_names = requested_sm - available_sm
print("Names in 'names' list but not in 'embedding_sm[\"catalyst_name\"]':")
print(missing_names)

Names in 'names' list but not in 'embedding_sm["catalyst_name"]':
{'P(tBu)3', 'cataCXlumA', 'PPh3', 'P(Cy3)'}


Calculate the distance matrix between all the embeddings (here using L2 distance)

In [None]:
from scipy.spatial.distance import cdist

# Pairwise distances between the embedding points (cdist's default metric is Euclidean).
coords_sm = filtered_embedding_sm[['dim_0', 'dim_1']]
distance_matrix_sm = cdist(coords_sm, coords_sm)

# Label both axes with the catalyst names, in the order fixed by `names`.
labels_sm = filtered_embedding_sm['catalyst_name']
distance_matrix_df_sm = pd.DataFrame(
    distance_matrix_sm,
    index=labels_sm,
    columns=labels_sm,
)
display(distance_matrix_df_sm)

catalyst_name,meCgPPh,TrixiePhos,IPENT Cl,RuPhos,RockPhos,VPhos,tBuBrettPhos,SIPr,CPhos,DPEPhos,...,GPhos,BINAP,BippyPhos,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,1.417374,2.237023,1.043552,2.088236,0.791348,1.960647,2.515934,1.622949,1.795946,...,1.703047,2.703506,3.108121,3.031388,3.120867,3.760131,2.825881,4.107444,2.786351,3.001268
TrixiePhos,1.417374,0.0,2.18207,0.628665,1.166742,1.220632,1.038361,2.193846,0.381122,0.537125,...,1.461753,1.32885,1.847899,1.634362,1.785337,2.6872,1.482837,2.71892,1.411367,1.632726
IPENT Cl,2.237023,2.18207,0.0,1.674576,1.379202,1.463793,1.405533,0.459549,1.920761,2.690522,...,0.720863,2.481367,2.343766,2.800911,2.580015,2.354879,2.429294,3.554456,2.530154,2.632645
RuPhos,1.043552,0.628665,1.674576,0.0,1.065932,0.606234,0.932638,1.786495,0.638573,1.161373,...,0.970258,1.711043,2.067533,2.055225,2.098219,2.734858,1.81079,3.108714,1.792198,1.998053
RockPhos,2.088236,1.166742,1.379202,1.065932,0.0,1.429842,0.137228,1.194358,0.792462,1.539193,...,0.837267,1.109933,1.145467,1.444064,1.295232,1.672492,1.086842,2.328142,1.165317,1.296488
VPhos,0.791348,1.220632,1.463793,0.606234,1.429842,0.0,1.320523,1.724788,1.230237,1.73823,...,0.915953,2.270835,2.540641,2.621892,2.618287,3.069839,2.346263,3.64841,2.349379,2.544083
tBuBrettPhos,1.960647,1.038361,1.405533,0.932638,0.137228,1.320523,0.0,1.262884,0.669138,1.430551,...,0.807215,1.120359,1.230355,1.465149,1.353227,1.802909,1.122678,2.391312,1.182716,1.332519
SIPr,2.515934,2.193846,0.459549,1.786495,1.194358,1.724788,1.262884,0.0,1.877054,2.660395,...,0.836143,2.236052,1.998293,2.528804,2.261999,1.909703,2.151861,3.191967,2.273355,2.344874
CPhos,1.622949,0.381122,1.920761,0.638573,0.792462,1.230237,0.669138,1.877054,0.0,0.783446,...,1.215933,1.083972,1.520491,1.421854,1.501006,2.318781,1.203707,2.487595,1.166357,1.378993
DPEPhos,1.795946,0.537125,2.690522,1.161373,1.539193,1.73823,1.430551,2.660395,0.783446,0.0,...,1.974903,1.276636,1.910569,1.495929,1.759668,2.850615,1.472592,2.554733,1.349008,1.562849


Transform the distances into similarity scores: if $d_{i,j}$ is the distance between catalysts $i$ and $j$, the similarity is defined as $s_{i,j} = \lambda e^{\frac{-d_{i,j}^2}{2\sigma^2}}$

$\lambda$ and $\sigma$ are chosen arbitrarily so that the entries of the matrix show some variability and have a magnitude comparable to the scores that the algorithm will give for every catalyst. Choosing them requires a bit of trial and error.

In [None]:
# Scale (lambda) and width (sigma) of the Gaussian similarity kernel
lambdaa = 5.0
sigma = np.sqrt(3.0)

# s_ij = lambda * exp(-d_ij^2 / (2 * sigma^2)).
# Computed on a fresh NumPy array so the diagonal can be edited in place:
# writing through DataFrame.values only works while it is a writable view,
# which is not the case when pandas Copy-on-Write is enabled.
similarity_sm = lambdaa * np.exp(-distance_matrix_df_sm.to_numpy() ** 2 / (2 * sigma**2))

# Put zeroes on the main diagonal (the kernel would otherwise give lambda there)
np.fill_diagonal(similarity_sm, 0)

# Re-attach the catalyst labels of the distance matrix.
D_prime_sm = pd.DataFrame(
    similarity_sm,
    index=distance_matrix_df_sm.index,
    columns=distance_matrix_df_sm.columns,
)

display(D_prime_sm)

catalyst_name,meCgPPh,TrixiePhos,IPENT Cl,RuPhos,RockPhos,VPhos,tBuBrettPhos,SIPr,CPhos,DPEPhos,...,GPhos,BINAP,BippyPhos,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,3.577317,2.171445,4.17009,2.417297,4.504451,2.634645,1.740984,3.223423,2.920829,...,3.083431,1.478873,0.999369,1.081,0.986231,0.473789,1.321151,0.300469,1.370917,1.114237
TrixiePhos,3.577317,0.0,2.261134,4.681263,3.985078,3.900537,4.177608,2.241798,4.880408,4.765269,...,3.501942,3.725238,2.830112,3.203513,2.939383,1.500698,3.465893,1.458415,3.587462,3.206367
IPENT Cl,2.171445,2.261134,0.0,3.13325,3.641534,3.49846,3.597302,4.827073,2.70351,1.496237,...,4.585186,1.791836,2.001502,1.352455,1.648762,1.984159,1.869853,0.608806,1.720288,1.575072
RuPhos,4.17009,4.681263,3.13325,0.0,4.137407,4.702925,4.325245,2.937359,4.671477,3.993388,...,4.273947,3.069434,2.452209,2.473036,2.400517,1.43744,2.894882,0.998755,2.927383,2.570419
RockPhos,2.417297,3.985078,3.641534,4.137407,0.0,3.556214,4.984332,3.942005,4.503126,3.368897,...,4.448655,4.071912,4.017884,3.53207,3.780414,3.136894,4.106484,2.026,3.987286,3.778363
VPhos,4.504451,3.900537,3.49846,4.702925,3.556214,0.0,3.738962,3.04537,3.885263,3.021834,...,4.347537,2.116978,1.705108,1.589975,1.594989,1.039548,1.9976,0.54387,1.992734,1.700142
tBuBrettPhos,2.634645,4.177608,3.597302,4.325245,4.984332,3.738962,0.0,3.832913,4.64046,3.555013,...,4.485449,4.056162,3.885076,3.496146,3.684866,2.908656,4.052647,1.927793,3.960228,3.719181
SIPr,1.740984,2.241798,4.827073,2.937359,3.942005,3.04537,3.832913,0.0,2.779347,1.536983,...,4.45005,2.173016,2.570008,1.722247,2.131156,2.722663,2.311015,0.915147,2.11294,1.99977
CPhos,3.223423,4.880408,2.70351,4.671477,4.503126,3.885263,4.64046,2.779347,0.0,4.513802,...,3.907987,4.11075,3.401181,3.56974,3.434717,2.040742,3.927302,1.782618,3.985674,3.641884
DPEPhos,2.920829,4.765269,1.496237,3.993388,3.368897,3.021834,3.555013,1.536983,4.513802,0.0,...,2.610124,3.810668,2.721163,3.443438,2.984303,1.290595,3.483427,1.684824,3.691873,3.327946


In [None]:
# Quick check of the container type of D_prime_sm; the next cell indexes it with .iloc.
type(D_prime_sm)

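Create the matrix $D$ that is passed to the lasso algorithm. Each row corresponds to one catalyst pair $(i, j)$ with $i < j$ and contains $-s_{i,j}$ in column $i$ and $+s_{i,j}$ in column $j$ (all other entries are zero).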

In [None]:
# One row per unordered catalyst pair (i, j) with i < j, one column per catalyst.
N = D_prime_sm.shape[0]
num_rows = N * (N - 1) // 2
D_sm = np.zeros((num_rows, N))

# triu_indices(N, k=1) enumerates the pairs above the diagonal in row-major order:
# (0, 1), (0, 2), ..., (0, N-1), (1, 2), ...
similarity_values_sm = D_prime_sm.to_numpy()
first_idx, second_idx = np.triu_indices(N, k=1)
pair_rows = np.arange(num_rows)
pair_scores = similarity_values_sm[first_idx, second_idx]

# Row for pair (i, j): -s_ij in column i and +s_ij in column j.
D_sm[pair_rows, first_idx] = -pair_scores
D_sm[pair_rows, second_idx] = pair_scores

display(D_sm)

array([[-3.57731683,  3.57731683,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-2.17144518,  0.        ,  2.17144518, ...,  0.        ,
         0.        ,  0.        ],
       [-4.17009003,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -3.73743593,
         3.73743593,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -4.07074213,
         0.        ,  4.07074213],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -4.95903112,  4.95903112]])

In [None]:
# Save the Suzuki-Miyaura matrix as a NumPy .npy file.
base_path = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)"
# D_sm.npy must hold D_sm (the matrix built in this section); the previous code
# wrote D_bh into this file instead.
np.save(f"{base_path}/D_sm.npy", D_sm)


----------------
## Neutral embeddings

In [None]:
import numpy as np
import pandas as pd

# Path of the Buchwald-Hartwig embedding table.
bh_embedding_csv = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)/buchwald_hartwig_binary_precomputed_epoch_10.csv"
embedding_bh = pd.read_csv(bh_embedding_csv)

In [None]:
# Preview the loaded table: one row per catalyst with its two embedding coordinates.
embedding_bh

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,-1.203136,2.572347
1,RuPhos,-0.612443,0.774926
2,VPhos,-0.724382,2.419098
3,IPENT Cl,2.340083,2.163032
4,DiMeIHept Cl,2.624199,2.158684
5,Piperidinyl-amino-pincer,-0.885154,2.175207
6,cPhos,-0.456465,1.139444
7,SPhos,-0.567981,0.225638
8,Jackie Phos,-0.688213,1.348426
9,GPhos,0.911677,2.005251


**IMPORTANT**: the catalysts must appear here in the same order that the decoding algorithm will use.

Therefore here we use that ordered list of names to subset and reorder the embeddings from the df

In [None]:
# Catalyst names in the exact order expected by the decoding algorithm.
# The order of this list defines the row/column order of every matrix built below.
names = [
    'meCgPPh', 'TrixiePhos', 'P(oTol)_3', 'RuPhos', 'RockPhos',
    'VPhos', 'PPh_3', 'tBuBrettPhos', 'CyJohnPhos', 'CPhos',
    'DPEPhos', 'AlPhos', 'Triisobutyl-Phosphatrane', 'SPhos', 'XantPhos',
    'Me3(OMe)-tBuXPhos', 'Amphos', 'Jackie CPhos', 'dppp', 'Mordal-Phos',
    'Jackie Phos', 'dppf', 'P(tBu)_3', 'GPhos', 'BINAP',
    'BippyPhos', 'cataCXlum A', 'dppdtbpf', 'QPhos', 'cBridP',
    'dtbpf', 'JosiPhosSL J009-1', 'PCy_3', 'XPhos', 'tBuXPhos',
]

In [None]:
# Keep only the catalysts listed in `names`; rows with any other name are dropped.
is_selected_bh = embedding_bh['catalyst_name'].isin(names)
filtered_embedding_bh = embedding_bh.loc[is_selected_bh].copy()

# An ordered categorical makes sort_values follow the order of `names`
# instead of alphabetical order.
ordered_names_bh = pd.Categorical(
    filtered_embedding_bh['catalyst_name'],
    categories=names,
    ordered=True,
)
filtered_embedding_bh['catalyst_name'] = ordered_names_bh
filtered_embedding_bh = filtered_embedding_bh.sort_values('catalyst_name')
filtered_embedding_bh = filtered_embedding_bh.reset_index(drop=True)
display(filtered_embedding_bh)

Unnamed: 0,catalyst_name,dim_0,dim_1
0,meCgPPh,-1.203136,2.572347
1,TrixiePhos,-0.393513,0.336345
2,P(oTol)_3,-0.925923,-0.028898
3,RuPhos,-0.612443,0.774926
4,RockPhos,-0.479155,0.044804
5,VPhos,-0.724382,2.419098
6,PPh_3,-2.501466,1.242695
7,tBuBrettPhos,-0.351777,0.2721
8,CyJohnPhos,-0.524102,-0.044125
9,CPhos,-0.610103,2.105331


In [None]:
# Report every requested catalyst that has no row in the embedding_bh table.
requested_bh = set(names)
available_bh = set(embedding_bh["catalyst_name"])
missing_names = requested_bh - available_bh
print("Names in 'names' list but not in 'embedding_bh[\"catalyst_name\"]':")
print(missing_names)

Names in 'names' list but not in 'embedding_bh["catalyst_name"]':
set()


Calculate the distance matrix between all the embeddings (here using L2 distance)

In [None]:
from scipy.spatial.distance import cdist

# Pairwise distances between the embedding points (cdist's default metric is Euclidean).
coords_bh = filtered_embedding_bh[['dim_0', 'dim_1']]
distance_matrix_bh = cdist(coords_bh, coords_bh)

# Label both axes with the catalyst names, in the order fixed by `names`.
labels_bh = filtered_embedding_bh['catalyst_name']
distance_matrix_df_bh = pd.DataFrame(
    distance_matrix_bh,
    index=labels_bh,
    columns=labels_bh,
)
display(distance_matrix_df_bh)

catalyst_name,meCgPPh,TrixiePhos,P(oTol)_3,RuPhos,RockPhos,VPhos,PPh_3,tBuBrettPhos,CyJohnPhos,CPhos,...,BippyPhos,cataCXlum A,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,PCy_3,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,2.378065,2.615975,1.891994,2.629187,0.502683,1.858396,2.452743,2.703149,0.754845,...,3.420731,4.414989,4.59786,3.266837,5.808609,2.697387,3.054614,5.439724,2.65355,3.396471
TrixiePhos,2.378065,0.0,0.645649,0.490187,0.30386,2.10887,2.294544,0.076612,0.402258,1.782196,...,1.042959,2.245595,2.554152,0.896532,3.499779,0.332876,0.784316,3.062125,0.360957,1.097471
P(oTol)_3,2.615975,0.645649,0.0,0.862788,0.452806,2.456279,2.024669,0.648262,0.402109,2.15747,...,1.06275,2.540196,2.04307,1.05153,3.5962,0.547332,0.512622,2.983558,0.392306,0.832487
RuPhos,1.891994,0.490187,0.862788,0.0,0.742189,1.647978,1.946077,0.566375,0.823802,1.330407,...,1.52969,2.68844,2.898945,1.38648,3.9816,0.80556,1.196385,3.550631,0.774522,1.532026
RockPhos,2.629187,0.30386,0.452806,0.742189,0.0,2.386925,2.350465,0.260555,0.099642,2.064684,...,0.811576,2.165019,2.252302,0.715317,3.33122,0.107715,0.480603,2.828118,0.069015,0.799844
VPhos,0.502683,2.10887,2.456279,1.647978,2.386925,0.0,2.131186,2.17909,2.471353,0.33393,...,3.142745,4.016761,4.488629,2.96825,5.443266,2.439826,2.84334,5.141832,2.422228,3.179991
PPh_3,1.858396,2.294544,2.024669,1.946077,2.350465,2.131186,0.0,2.358647,2.359211,2.078796,...,3.086541,4.514199,3.454429,3.051505,5.619179,2.457115,2.529694,4.952042,2.323912,2.815084
tBuBrettPhos,2.452743,0.076612,0.648262,0.566375,0.260555,2.17909,2.358647,0.0,0.360131,1.851342,...,0.969063,2.17448,2.511631,0.820151,3.423446,0.269774,0.73524,2.987116,0.324507,1.039481
CyJohnPhos,2.703149,0.402258,0.402109,0.823802,0.099642,2.471353,2.359211,0.360131,0.0,2.151176,...,0.762491,2.165415,2.152845,0.692917,3.296658,0.152262,0.384037,2.76789,0.049625,0.710044
CPhos,0.754845,1.782196,2.15747,1.330407,2.064684,0.33393,2.078796,1.851342,2.151176,0.0,...,2.812384,3.691444,4.197675,2.635829,5.112038,2.114177,2.526718,4.808624,2.102389,2.860976


Transform the distances into similarity scores: if $d_{i,j}$ is the distance between catalysts $i$ and $j$, the similarity is defined as $s_{i,j} = \lambda e^{\frac{-d_{i,j}^2}{2\sigma^2}}$

$\lambda$ and $\sigma$ are chosen arbitrarily so that the entries of the matrix show some variability and have a magnitude comparable to the scores that the algorithm will give for every catalyst. Choosing them requires a bit of trial and error.

In [None]:
# Scale (lambda) and width (sigma) of the Gaussian similarity kernel
lambdaa = 5.0
sigma = np.sqrt(3.0)

# s_ij = lambda * exp(-d_ij^2 / (2 * sigma^2)).
# Computed on a fresh NumPy array so the diagonal can be edited in place:
# writing through DataFrame.values only works while it is a writable view,
# which is not the case when pandas Copy-on-Write is enabled.
similarity_bh = lambdaa * np.exp(-distance_matrix_df_bh.to_numpy() ** 2 / (2 * sigma**2))

# Put zeroes on the main diagonal (the kernel would otherwise give lambda there)
np.fill_diagonal(similarity_bh, 0)

# Re-attach the catalyst labels of the distance matrix.
D_prime_bh = pd.DataFrame(
    similarity_bh,
    index=distance_matrix_df_bh.index,
    columns=distance_matrix_df_bh.columns,
)

display(D_prime_bh)

catalyst_name,meCgPPh,TrixiePhos,P(oTol)_3,RuPhos,RockPhos,VPhos,PPh_3,tBuBrettPhos,CyJohnPhos,CPhos,...,BippyPhos,cataCXlum A,dppdtbpf,QPhos,cBridP,dtbpf,JosiPhosSL J009-1,PCy_3,XPhos,tBuXPhos
catalyst_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meCgPPh,0.0,1.9482,1.59821,2.753386,1.579857,4.793797,2.81182,1.834515,1.479349,4.547023,...,0.711204,0.194126,0.147497,0.844284,0.018063,1.487041,1.055829,0.036069,1.546328,0.731081
TrixiePhos,1.9482,0.0,4.664408,4.80372,4.923646,2.382655,2.079129,4.995111,4.866959,2.944878,...,4.17095,2.157582,1.685657,4.373119,0.649229,4.908509,4.512777,1.047776,4.892595,4.090623
P(oTol)_3,1.59821,4.664408,0.0,4.416603,4.832026,1.829215,2.524956,4.66178,4.867056,2.301724,...,4.142081,1.70575,2.493652,4.15849,0.57926,4.756486,4.785741,1.134095,4.873378,4.454577
RuPhos,2.753386,4.80372,4.416603,0.0,4.561404,3.179739,2.659759,4.739703,4.46527,3.722668,...,3.385312,1.499032,1.232187,3.629338,0.356026,4.487445,3.938822,0.61157,4.524274,3.381279
RockPhos,1.579857,4.923646,4.832026,4.561404,0.0,1.93454,1.99104,4.943745,4.991733,2.457025,...,4.480175,2.28924,2.146762,4.591277,0.786576,4.990341,4.811175,1.31837,4.996032,4.494313
VPhos,4.793797,2.382655,1.829215,3.179739,1.93454,0.0,2.345377,2.266036,1.80671,4.907934,...,0.963962,0.339724,0.17403,1.151447,0.035838,1.85394,1.299536,0.060995,1.880566,0.92686
PPh_3,2.81182,2.079129,2.524956,2.659759,1.99104,2.345377,0.0,1.978295,1.977418,2.433196,...,1.021885,0.16748,0.684266,1.059176,0.025911,1.827963,1.720954,0.083938,2.032657,1.334631
tBuBrettPhos,1.834515,4.995111,4.66178,4.739703,4.943745,2.266036,1.978295,0.0,4.893081,2.82411,...,4.275598,2.27363,1.747273,4.469739,0.709006,4.939718,4.569216,1.130087,4.913011,4.175989
CyJohnPhos,1.479349,4.866959,4.867056,4.46527,4.991733,1.80671,1.977418,4.893081,0.0,2.31215,...,4.53824,2.288586,2.309384,4.615479,0.817187,4.980718,4.878595,1.394545,4.997948,4.597032
CPhos,4.547023,2.944878,2.301724,3.722668,2.457025,4.907934,2.433196,2.82411,2.31215,0.0,...,1.338015,0.515979,0.26519,1.570675,0.064181,2.373773,1.725277,0.105994,2.393518,1.277929


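Create the matrix $D$ that is passed to the lasso algorithm. Each row corresponds to one catalyst pair $(i, j)$ with $i < j$ and contains $-s_{i,j}$ in column $i$ and $+s_{i,j}$ in column $j$ (all other entries are zero).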

In [None]:


# Build D_bh from the similarity matrix D_prime_bh computed in this section.
# Without this, D_bh is undefined on a fresh run (or silently reuses a value
# left in the kernel by another cell).
# One row per unordered catalyst pair (i, j) with i < j:
# -s_ij in column i, +s_ij in column j, zeros elsewhere.
N = D_prime_bh.shape[0]
num_rows = N * (N - 1) // 2
D_bh = np.zeros((num_rows, N))

row_index = 0
for i in range(N):
    for j in range(i + 1, N):
        d_ij = D_prime_bh.iloc[i, j]
        D_bh[row_index, i] = -d_ij
        D_bh[row_index, j] = d_ij
        row_index += 1

display(D_bh)

array([[-1.94820001,  1.94820001,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.59820959,  0.        ,  1.59820959, ...,  0.        ,
         0.        ,  0.        ],
       [-2.75338564,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -1.33342755,
         1.33342755,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -2.3072357 ,
         0.        ,  2.3072357 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -4.54268584,  4.54268584]])

In [None]:
# Persist the matrix as a NumPy .npy file.
base_path = "/content/drive/MyDrive/Catalyst Mixing/Lorenzo - Data (other)"
d_bh_output_path = f"{base_path}/D_bh.npy"
np.save(d_bh_output_path, D_bh)
