In [1]:
import duckdb
import pandas as pd
# import findspark
# findspark.init()
import pyspark
from pyspark.conf import SparkConf

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, FloatType, IntegerType, StructType, StructField, StringType, LongType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Word2Vec
from imblearn.over_sampling import SMOTE
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
# from sklearn.preprocessing import OneHotEncoder

import mlflow

import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

import sklearn
import time
# import h2o
# from h2o.estimators import H2OGradientBoostingEstimator
# from h2o.frame import H2OFrame
# from pysparkling import H2OContext
# import pysparkling

In [2]:
# schema = StructType([
#     StructField("id", LongType(), True),
#     StructField("buildingblock1_smiles", StringType(), True),
#     StructField("buildingblock2_smiles", StringType(), True),
#     StructField("buildingblock3_smiles", StringType(), True),
#     StructField("molecule_smiles", StringType(), True),
#     StructField("protein_name", StringType(), True),
#     StructField("binds", LongType(), True)
# ])
# train_1 = spark.read.parquet("train_1.parquet", schema=schema)
# train_0 = spark.read.parquet("train_0.parquet", schema=schema)

In [2]:
def vector_to_array(v):
    """Convert a vector object into a plain Python list.

    Args:
        v: Any object exposing ``toArray()`` whose result has ``tolist()``
            (presumably a Spark ML vector -- confirm against callers), or None.

    Returns:
        The vector's values as a list, or None when ``v`` is None so that a
        null cell does not raise AttributeError inside the UDF.
    """
    if v is None:
        return None
    return v.toArray().tolist()
# Wrap vector_to_array as a Spark UDF whose declared return type is array<float>.
vector_to_array_udf = udf(vector_to_array, ArrayType(FloatType()))

def smiles_to_fingerprint(smiles, radius=2, n_bits=15):
    """Encode a SMILES string as a Morgan fingerprint bit list.

    Args:
        smiles: SMILES string to encode; None is treated as unparseable.
        radius: Morgan fingerprint radius (default 2, as before).
        n_bits: Length of the returned bit list (default 15, as before).

    Returns:
        A list of ``n_bits`` integers (0/1). An all-zero list is returned when
        the input is None or RDKit cannot parse it.
    """
    # Guard nulls before touching RDKit so a null cell yields the fallback
    # vector instead of raising inside the UDF.
    if smiles is None:
        return [0] * n_bits

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return [0] * n_bits

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    return list(fp)

# Register the fingerprint encoder as a vectorized (pandas) UDF.
# The earlier row-at-a-time `udf(...)` registration under the same name was
# immediately shadowed by this definition, so it has been dropped. The UDF kind
# is inferred from the pd.Series -> pd.Series type hints, which replaces the
# explicit PandasUDFType.SCALAR argument.
@pandas_udf(ArrayType(IntegerType()))
def smiles_to_fingerprint_udf(smiles_series: pd.Series) -> pd.Series:
    """Apply smiles_to_fingerprint to every SMILES string in a batch.

    Args:
        smiles_series: Batch of SMILES strings.

    Returns:
        A Series of integer bit lists, one per input row.
    """
    return smiles_series.apply(smiles_to_fingerprint)



# ### Distinct Counts
# # buildingblock 1: 271
# # buildingblock 2: 693
# # buildingblock 3: 872
# # molecule_smiles: 29,656
# # 4 repeats for buildingblock triplets, but they are bound to different target proteins



In [3]:
# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder.appName('belka')
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "4g")
    .config("spark.cores.max", "16")
    .getOrCreate()
)
# Alternative settings tried previously:
    # .config("spark.ext.h2o.backend.cluster.mode", "internal")\
    # .config("spark.executor.instances", "1")\
    # .config("spark.executor.memory", "2g")\
    # .config("spark.driver.memory", "2g")\
train_data = spark.read.parquet("train.parquet") # train_data.count() # 295,246,830
# Fixed seed so the sample (and everything derived from it) is the same on
# every run; the original call drew a different sample each time.
sample_data = train_data.sample(fraction=.0001, seed=42)


# pandas_sample = sample_data.toPandas()
# train_rdd = train_data.rdd
# h2o.init()
# hc = H2OContext.getOrCreate()

# bind1_data = train_data.where(F.col("binds") == 1)
# bind0_data = train_data.where(F.col("binds") == 0)

24/06/15 17:27:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [12]:


import math

# Number of bits needed to give each of the 29,656 distinct molecule_smiles
# values (count noted in the "Distinct Counts" comment) its own code.
# math.log2 is used instead of math.log(x, 2) because it is the dedicated,
# more accurate base-2 logarithm.
# NOTE(review): presumably the basis for the nBits=15 fingerprint width -- confirm.
molecule_id_bits = math.log2(29656)
molecule_id_bits

14.856036399819157

In [6]:
# Output column -> SMILES source column, in the order the columns are appended.
fingerprint_columns = {
    "encoded_buildingblock1": "buildingblock1_smiles",
    "encoded_buildingblock2": "buildingblock2_smiles",
    "encoded_buildingblock3": "buildingblock3_smiles",
    "encoded_molecule_vector": "molecule_smiles",
}

encoded_data = sample_data

# Fingerprint every SMILES column with the pandas UDF.
for encoded_col, smiles_col in fingerprint_columns.items():
    encoded_data = encoded_data.withColumn(
        encoded_col, smiles_to_fingerprint_udf(F.col(smiles_col))
    )

# One 0/1 indicator column per target protein.
for protein in ("sEH", "HSA", "BRD4"):
    is_protein = F.col("protein_name") == protein
    encoded_data = encoded_data.withColumn(
        f"encoded_{protein}", F.when(is_protein, 1).otherwise(0)
    )

In [7]:
# Collect the encoded Spark sample into a pandas DataFrame.
pandas_encoded = encoded_data.toPandas()

                                                                                

In [16]:
# Count the distinct molecule SMILES strings in the collected sample.
pandas_encoded['molecule_smiles'].nunique()


29378

In [8]:
train_1 = spark.read.parquet("train_1.parquet")
#### Get a sample
# sample_data = train_data.sample(.0001) # sample_data.count(.0001) 29509

# Wrap each SMILES column in a one-element array column (Word2Vec expects an
# array-of-strings input column).
for smiles_col, array_col in (
    ("buildingblock1_smiles", "buildingblock1_array"),
    ("buildingblock2_smiles", "buildingblock2_array"),
    ("buildingblock3_smiles", "buildingblock3_array"),
    ("molecule_smiles", "molecule_array"),
):
    train_1 = train_1.withColumn(array_col, F.array(F.col(smiles_col)))

# One 0/1 indicator column per target protein.
for protein in ("sEH", "HSA", "BRD4"):
    train_1 = train_1.withColumn(
        f"encoded_{protein}",
        F.when(F.col("protein_name") == protein, 1).otherwise(0),
    )
    # .withColumn("encoded_protein_name", F.when(F.col("protein_name") == "sEH", F.array(F.lit(1),F.lit(0),F.lit(0)))\
    #                                             .when(F.col("protein_name") == "HSA", F.array(F.lit(0),F.lit(1),F.lit(0)))\
    #                                             .when(F.col("protein_name") == "BRD4", F.array(F.lit(0),F.lit(0),F.lit(1)))\
    #                                                  )

# Fixed seed so the 33% sample is reproducible across runs; the original call
# drew a different sample each time.
sample_train_1 = train_1.sample(fraction=.33, seed=42)

# pandas_train_1 = train_1.toPandas()


# print(bind1_data.count()) # 1_589_906
# print(bind0_data.count()) # 293_656_924

# 293656924 / 1589906 # 184.70080872705682

In [9]:
# Embedding width for the Word2Vec model fitted below.
vector_size = 20

# Fit Word2Vec on the "molecule_array" column and append "molecule_vector".
# NOTE(review): "molecule_array" is built as F.array(<single SMILES column>), so
# each input "sentence" appears to hold exactly one token. The recorded run of
# this cell failed with java.lang.OutOfMemoryError inside Word2Vec.learnVocab
# (RDD.collect); presumably the vocabulary is one entry per distinct molecule --
# confirm, and consider tokenising the SMILES strings instead.
word2Vec1 = Word2Vec(vectorSize=vector_size, minCount=0, inputCol="molecule_array", outputCol="molecule_vector")
model1 = word2Vec1.fit(sample_train_1)
result1 = model1.transform(sample_train_1)
print("Done")

24/06/15 17:26:01 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID 99)3]
java.lang.OutOfMemoryError: Java heap space
24/06/15 17:26:01 ERROR Executor: Exception in task 9.0 in stage 15.0 (TID 108)
java.lang.OutOfMemoryError: Java heap space
24/06/15 17:26:01 ERROR Executor: Exception in task 3.0 in stage 15.0 (TID 102)
java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1877)
	at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1786)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1189)
	at jav

Py4JJavaError: An error occurred while calling o323.fit.
: org.apache.spark.SparkException: Job 8 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1085)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1083)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1083)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2463)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2369)
	at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContext.scala:2069)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1419)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.$anonfun$new$37(SparkContext.scala:661)
	at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:214)
	at org.apache.spark.util.SparkShutdownHookManager.$anonfun$runAll$2(ShutdownHookManager.scala:188)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
	at org.apache.spark.util.SparkShutdownHookManager.$anonfun$runAll$1(ShutdownHookManager.scala:188)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.mllib.feature.Word2Vec.learnVocab(Word2Vec.scala:191)
	at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:312)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:183)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)


24/06/15 17:26:04 WARN BlockManager: Putting block taskresult_105 failed due to exception java.lang.IllegalStateException: Block broadcast_26 not found.
24/06/15 17:26:04 ERROR Executor: Exception in task 6.0 in stage 15.0 (TID 105): Block taskresult_105 does not exist
24/06/15 17:26:04 WARN BlockManager: Block taskresult_111 could not be removed as it was not found on disk or in memory
24/06/15 17:26:04 WARN BlockManager: Block taskresult_100 could not be removed as it was not found on disk or in memory
java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@3d85a01 rejected from java.util.concurrent.ThreadPoolExecutor@5a9d0173[Shutting down, pool size = 2, active threads = 2, queued tasks = 0, completed tasks = 222]
	at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2063)
	at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:830)
	at java.util.concurrent.ThreadPoolExecutor.execute

In [None]:
### Vectorize Inputs
vector_size = 20


def _fit_word2vec(frame, input_col, output_col):
    """Fit a Word2Vec model on one array column and append its vector column.

    Args:
        frame: DataFrame containing ``input_col``.
        input_col: Name of the array-of-strings column to embed.
        output_col: Name of the vector column to add.

    Returns:
        Tuple ``(estimator, fitted_model, transformed_frame)``.
    """
    estimator = Word2Vec(
        vectorSize=vector_size, minCount=0, inputCol=input_col, outputCol=output_col
    )
    fitted = estimator.fit(frame)
    return estimator, fitted, fitted.transform(frame)


# Each step fits on the previous step's output so the vector columns accumulate.
word2Vec1, model1, result1 = _fit_word2vec(
    sample_train_1, "buildingblock1_array", "buildingblock1_vector"
)
print("Done 1")

word2Vec2, model2, result2 = _fit_word2vec(
    result1, "buildingblock2_array", "buildingblock2_vector"
)
print("Done 2")

word2Vec3, model3, result3 = _fit_word2vec(
    result2, "buildingblock3_array", "buildingblock3_vector"
)
print("Done 3")

word2Vec4, model4, result4 = _fit_word2vec(
    result3, "molecule_array", "molecule_vector"
)
print("Done 4")


24/06/15 17:24:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/06/15 17:24:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Done 1
Done 2
Done 3


24/06/15 17:24:33 ERROR Executor: Exception in task 12.0 in stage 30.0 (TID 246)
java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1853)
	at java.io.ObjectOutputStream.write(ObjectOutputStream.java:709)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:233)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:53)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$1433/1830896828.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.U

In [None]:

# vectorized_samples = result4.select('buildingblock1_vector', 'buildingblock2_vector', 'buildingblock3_vector', 
#                                     'molecule_vector', "encoded_sEH", "encoded_HSA", "encoded_BRD4", 'binds')

# array_samples =(
#     vectorized_samples
#     .withColumn("buildingblock1_vector", vector_to_array_udf(vectorized_samples["buildingblock1_vector"]))
#     .withColumn("buildingblock2_vector", vector_to_array_udf(vectorized_samples["buildingblock2_vector"]))
#     .withColumn("buildingblock3_vector", vector_to_array_udf(vectorized_samples["buildingblock3_vector"]))
#     .withColumn("molecule_vector", vector_to_array_udf(vectorized_samples["molecule_vector"]))
# )

# pandas_vector_df = array_samples.toPandas()

In [None]:
### Unpacking arrays
def _unpack_vector_column(frame, column, prefix, size):
    """Replace one list-valued column with one scalar column per element.

    Args:
        frame: pandas DataFrame containing ``column``.
        column: Name of the column whose cells are equal-length lists.
        prefix: Prefix for the new columns, named ``{prefix}_feature_{i}``.
        size: Number of leading positions to rename (the embedding width).

    Returns:
        A new DataFrame without ``column`` and with the unpacked feature
        columns appended on the right.
    """
    expanded = pd.DataFrame(frame[column].tolist(), index=frame.index)
    widened = pd.concat([frame.drop(columns=[column]), expanded], axis=1)
    return widened.rename(columns={i: f"{prefix}_feature_{i}" for i in range(size)})


# NOTE(review): pandas_vector_df is only assigned in commented-out code in this
# view -- confirm where it is defined before running on a fresh kernel.
result_df = pandas_vector_df
for vector_column, feature_prefix in (
    ("buildingblock1_vector", "buildingblock1"),
    ("buildingblock2_vector", "buildingblock2"),
    ("buildingblock3_vector", "buildingblock3"),
    ("molecule_vector", "molecule"),
):
    result_df = _unpack_vector_column(result_df, vector_column, feature_prefix, vector_size)
result_df

In [36]:
# Split the unpacked frame into features and the `binds` label.
train_x = result_df.drop('binds', axis=1)
train_y = result_df['binds']

# Oversample with SMOTE. random_state is fixed so the synthetic rows are the
# same on every run; the original call was unseeded.
smote = SMOTE(random_state=42)
smote_x, smote_y = smote.fit_resample(train_x, train_y)

In [None]:
### Vector embedding molecule
# Parse train_sample's own SMILES column. The original read from
# pandas_train_1, which is not assigned in any live cell of this notebook.
train_sample['molecule'] = train_sample['molecule_smiles'].apply(Chem.MolFromSmiles)
train_sample['ecfp'] = train_sample['molecule'].apply(generate_ecfp)


# ### Vector Embedding / Encoding protein name

# The bare name OneHotEncoder is bound to pyspark.ml.feature.OneHotEncoder by
# the notebook's imports, so scikit-learn's encoder is imported under an alias.
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# Use the default sparse output and densify explicitly, which avoids the
# version-dependent `sparse` / `sparse_output` constructor argument.
oneHot = SklearnOneHotEncoder()
protein_oneHot = oneHot.fit_transform(train_sample['protein_name'].values.reshape(-1,1)).toarray()

train_sample['encoded_protein_name'] = protein_oneHot.tolist()

# X = [ecfp + protein for ecfp, protein in zip(train_sample['ecfp'], train_sample['encoded_protein_name'])]
# y = train_sample['binds'].tolist()

# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(x_train, y_train)

In [47]:
# Display array_samples.
# NOTE(review): in this view array_samples is only assigned in commented-out
# code -- confirm where it is defined before running on a fresh kernel.
array_samples

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o28.sessionState

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o28.sessionState

In [19]:
# Row count per `binds` label over the full training set.
# Imbalanced labels. Most of the attempts do not bind at all.
# 0: 293656924
# 1: 1589906
# (Kept as comments: the former trailing string literal was echoed as the
# cell's output value.)
binds_distribution = train_data.select('binds').groupby('binds').count()
binds_distribution.show()

[Stage 22:>                                                       (0 + 14) / 28]

+-----+---------+
|binds|    count|
+-----+---------+
|    0|293656924|
|    1|  1589906|
+-----+---------+





'\nImbalanced labels. Most of the attempts do not bind at all.\n'

In [16]:
# Row count per (protein_name, binds) pair over the full training set.
# sEH tends to be more likely to have a bind than the other two.
# (Kept as a comment: the former trailing string literal was echoed as the
# cell's output value.)
result_distribution = train_data.groupby(['protein_name', 'binds']).count()
result_distribution.show()



+------------+-----+--------+
|protein_name|binds|   count|
+------------+-----+--------+
|         HSA|    0|98007200|
|         sEH|    1|  724532|
|         sEH|    0|97691078|
|        BRD4|    1|  456964|
|        BRD4|    0|97958646|
|         HSA|    1|  408410|
+------------+-----+--------+





In [9]:
# Display the feature matrix returned by smote.fit_resample.
smote_x

Unnamed: 0,encoded_sEH,encoded_HSA,encoded_BRD4,buildingblock1_feature_0,buildingblock1_feature_1,buildingblock1_feature_2,buildingblock1_feature_3,buildingblock1_feature_4,buildingblock1_feature_5,buildingblock1_feature_6,...,molecule_feature_90,molecule_feature_91,molecule_feature_92,molecule_feature_93,molecule_feature_94,molecule_feature_95,molecule_feature_96,molecule_feature_97,molecule_feature_98,molecule_feature_99
0,0,0,1,0.004743,-0.003168,-0.001753,0.004073,-0.000642,-0.004946,-0.004860,...,0.003224,-0.002650,0.002614,0.002073,0.003384,-0.004905,0.003251,0.002515,0.000040,-0.003766
1,1,0,0,0.004743,-0.003168,-0.001753,0.004073,-0.000642,-0.004946,-0.004860,...,-0.003790,0.000773,-0.003055,-0.002940,-0.002983,0.001386,-0.000769,-0.000984,0.002133,0.003209
2,1,0,0,0.004743,-0.003168,-0.001753,0.004073,-0.000642,-0.004946,-0.004860,...,-0.002371,0.004300,-0.002916,0.001571,0.002015,-0.000846,0.002903,-0.003949,0.000774,-0.000572
3,0,0,1,0.004743,-0.003168,-0.001753,0.004073,-0.000642,-0.004946,-0.004860,...,-0.000575,-0.003165,-0.002085,-0.000168,-0.003081,0.003692,0.002740,-0.003338,0.001573,-0.000786
4,1,0,0,0.004743,-0.003168,-0.001753,0.004073,-0.000642,-0.004946,-0.004860,...,0.002413,-0.001162,-0.004356,0.004006,-0.004457,0.000437,-0.001919,0.001346,-0.000856,-0.003480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58473,0,0,1,-0.002027,-0.001044,0.002356,-0.000664,-0.002745,0.003259,-0.000844,...,-0.002477,-0.000542,-0.002662,-0.003487,0.002688,-0.001243,-0.004494,-0.001651,-0.004723,0.003575
58474,1,0,0,0.003593,-0.000719,0.002105,0.001571,0.002132,0.002886,0.002412,...,0.002405,-0.000999,0.003029,-0.000855,-0.002392,0.001951,-0.000102,0.001409,0.000165,0.001402
58475,1,0,0,0.003593,-0.000719,0.002105,0.001571,0.002132,0.002886,0.002412,...,-0.003349,0.002678,-0.000857,0.001163,-0.001608,-0.000616,0.001745,0.001060,-0.000906,-0.002312
58476,0,1,0,-0.002797,-0.001667,-0.004635,0.000070,0.002202,-0.003527,-0.002676,...,-0.000688,-0.001556,-0.000648,0.000579,-0.002238,0.003533,0.003189,0.001837,0.001058,-0.001380


In [10]:
# Display the labels returned by smote.fit_resample.
smote_y

0        0
1        0
2        0
3        0
4        0
        ..
58473    1
58474    1
58475    1
58476    1
58477    1
Name: binds, Length: 58478, dtype: int64

# Tutorial

In [7]:
# Source file and per-class sample size, named once instead of being repeated
# inside the SQL text.
TRAIN_PARQUET = '/home/jovyan/train.parquet'
SAMPLES_PER_CLASS = 30000

# Draw a balanced random sample: SAMPLES_PER_CLASS rows with binds = 0 followed
# by SAMPLES_PER_CLASS rows with binds = 1.
con = duckdb.connect()
try:
    train_sample = con.query(f"""
(
select * from parquet_scan('{TRAIN_PARQUET}')
where binds = 0
order by random()
limit {SAMPLES_PER_CLASS})
union all 
(select * from parquet_scan('{TRAIN_PARQUET}')
where binds = 1
order by random()
limit {SAMPLES_PER_CLASS}
)""").df()
finally:
    # Always release the connection, even if the query raises.
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,ecfp
0,203731153,O=C(Nc1ccc(I)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1noc2ccc(F)cc12,NCc1cccnc1OC(F)F,O=C(N[Dy])c1cc(I)ccc1Nc1nc(NCc2cccnc2OC(F)F)nc...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0xffff1461c3c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,105894800,O=C(N[C@@H](Cc1cccs1)C(=O)O)OCC1c2ccccc2-c2ccc...,COCC1(CN)CCCCC1,Nc1ncc(Cl)nc1Cl,COCC1(CNc2nc(Nc3ncc(Cl)nc3Cl)nc(N[C@@H](Cc3ccc...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0xffff1461c430>,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ..."
2,189346899,O=C(Nc1ccc(C(=O)O)cc1C(F)(F)F)OCC1c2ccccc2-c2c...,Nc1cc(F)c(F)cc1Br,CC(C)(C)c1cc(N)n[nH]1,CC(C)(C)c1cc(Nc2nc(Nc3cc(F)c(F)cc3Br)nc(Nc3ccc...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0xffff1461c5f0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,155901527,O=C(Nc1cc(-n2cccn2)ccc1C(=O)O)OCC1c2ccccc2-c2c...,CCOC(=O)c1cncnc1N,CCN1C(=O)C[C@H](CN)[C@H]1c1ccncc1,CCOC(=O)c1cncnc1Nc1nc(NC[C@H]2CC(=O)N(CC)[C@@H...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0xffff1461c4a0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,280634398,O=C(O)[C@@H]1CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Cl.NCc1cc(C(F)(F)F)co1,Nc1cc(CO)ccn1,O=C(N[Dy])[C@@H]1CCCN1c1nc(NCc2cc(C(F)(F)F)co2...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0xffff1461c660>,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...
59995,237392508,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Nc1cccc(-n2cncn2)c1,CNC(=O)c1ccc(N)cc1F,CNC(=O)c1ccc(Nc2nc(Nc3cccc(-n4cncn4)c3)nc(NC(C...,BRD4,1,<rdkit.Chem.rdchem.Mol object at 0xffff7855b450>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
59996,61032186,Cc1ccccc1[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2c...,CC(=O)SCCN.Cl,COC(=O)c1ccnc(N)c1,COC(=O)c1ccnc(Nc2nc(NCCSC(C)=O)nc(N[C@H](CC(=O...,BRD4,1,<rdkit.Chem.rdchem.Mol object at 0xffff7855b4c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
59997,63117670,Cn1cc(C[C@@H](NC(=O)OCC2c3ccccc3-c3ccccc32)C(=...,Nc1cnc(Br)cn1,Nc1cc2cccnc2c2ncccc12,Cn1cc(C[C@@H](Nc2nc(Nc3cnc(Br)cn3)nc(Nc3cc4ccc...,HSA,1,<rdkit.Chem.rdchem.Mol object at 0xffff7855b530>,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ..."
59998,112380474,O=C(N[C@H](C(=O)O)C1CCCC1)OCC1c2ccccc2-c2ccccc21,COC(=O)c1cc(Cl)ccc1N,Cn1cc(N)cn1,COC(=O)c1cc(Cl)ccc1Nc1nc(Nc2cnn(C)c2)nc(N[C@H]...,BRD4,1,<rdkit.Chem.rdchem.Mol object at 0xffff7855b5a0>,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan fingerprint of a molecule as a list of bits.

    Args:
        molecule: Molecule object to fingerprint, or None.
        radius: Morgan fingerprint radius.
        bits: Number of bits in the fingerprint.

    Returns:
        A list of ``bits`` bit values, or None when ``molecule`` is None.
    """
    if molecule is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(fingerprint)
    return None


# generate_ecfp returns a list of bit values (or None), so the UDF is declared
# as array<int>; the previous StringType declaration did not match the return value.
ecfp_udf = udf(generate_ecfp, ArrayType(IntegerType()))

# NOTE(review): this lambda returns whatever Chem.MolFromSmiles produces (a
# molecule object, not a string) under a StringType declaration. Presumably the
# object cannot be stored as a Spark column value -- confirm, and consider
# fingerprinting directly from the SMILES string as smiles_to_fingerprint does.
get_molecule_udf = udf(lambda x: Chem.MolFromSmiles(x), StringType())

In [32]:
# Add a "molecule" column by applying get_molecule_udf to molecule_smiles
# (cast to string first).
# NOTE(review): the recorded run that evaluated this frame failed in the Python
# worker with "TypeError: code() argument 13 must be str, not int" while
# unpickling the UDF -- presumably a driver/worker Python version mismatch; confirm.
test_sample_data = sample_data.withColumn("molecule", get_molecule_udf(sample_data["molecule_smiles"].astype(StringType())))
# test_sample_data = test_sample_data.withColumn('ecfp', ecfp_udf(test_sample_data["molecule"]))

In [33]:
# test_sample_data.show()

24/06/14 21:04:07 ERROR Executor: Exception in task 0.0 in stage 8.0 (TID 76)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 588, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 249, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 588, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 249, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: code() argument 13 must be str, not int


In [None]:
### Vector embedding molecule
# Parse train_sample's own SMILES column. The original read from
# pandas_train_1, which is not assigned in any live cell of this notebook.
train_sample['molecule'] = train_sample['molecule_smiles'].apply(Chem.MolFromSmiles)
train_sample['ecfp'] = train_sample['molecule'].apply(generate_ecfp)


# ### Vector Embedding / Encoding protein name

# The bare name OneHotEncoder is bound to pyspark.ml.feature.OneHotEncoder by
# the notebook's imports, so scikit-learn's encoder is imported under an alias.
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

# Use the default sparse output and densify explicitly, which avoids the
# version-dependent `sparse` / `sparse_output` constructor argument.
oneHot = SklearnOneHotEncoder()
protein_oneHot = oneHot.fit_transform(train_sample['protein_name'].values.reshape(-1,1)).toarray()

train_sample['encoded_protein_name'] = protein_oneHot.tolist()

# X = [ecfp + protein for ecfp, protein in zip(train_sample['ecfp'], train_sample['encoded_protein_name'])]
# y = train_sample['binds'].tolist()

# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(x_train, y_train)

In [14]:
# Score the fitted classifier on the held-out split with average precision,
# once from hard class predictions and once from positive-class probabilities.
# NOTE(review): rf, x_test and y_test are only assigned in commented-out code
# in this view -- confirm they are defined before running on a fresh kernel.
predictions = rf.predict(x_test)
prob_predictions = rf.predict_proba(x_test)[:, 1]
print("Rounded Predictions:", average_precision_score(y_test, predictions))
print("Probability Predictions:", average_precision_score(y_test, prob_predictions))

Rounded Predictions: 0.869963320497054
Probability Predictions: 0.9615514515765019


# Test

In [None]:
# Reuse the encoder fitted on the training sample: a freshly constructed
# encoder cannot transform, and reusing it keeps the protein column order
# identical to training.
oneHotEncoderTest = oneHot

# Average-precision score for each chunk that carries labels.
test_scores = []

for test_csv in pd.read_csv("test.csv", chunksize=100_000):
    ### Vectorize molecule_smiles
    test_csv['molecule'] = test_csv['molecule_smiles'].apply(Chem.MolFromSmiles)
    test_csv['ecfp'] = test_csv['molecule'].apply(generate_ecfp)

    ### Encode protein_name
    encoded_protein = oneHotEncoderTest.transform(test_csv['protein_name'].values.reshape(-1,1))
    # The encoder may return a sparse matrix or a dense array depending on how
    # it was constructed; densify only when needed.
    if hasattr(encoded_protein, "toarray"):
        encoded_protein = encoded_protein.toarray()
    test_csv['encoded_protein_name'] = encoded_protein.tolist()

    x_test = [ecfp + protein for ecfp, protein in zip(test_csv['ecfp'], test_csv['encoded_protein_name'])]

    # The classifier has no transform(); score with the positive-class
    # probability, as the validation cell does.
    test_pred = rf.predict_proba(x_test)[:, 1]

    # Only score chunks that actually include the label column.
    if 'binds' in test_csv.columns:
        y_test = test_csv['binds'].tolist()
        test_scores.append(average_precision_score(y_test, test_pred))

test_scores