In [84]:
import pandas as pd
# import findspark
# findspark.init()
import pyspark
from pyspark.conf import SparkConf

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

from pyspark.ml.feature import StringIndexer, OneHotEncoder, Word2Vec
from imblearn.over_sampling import SMOTE
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np

# import h2o
# from h2o.estimators import H2OGradientBoostingEstimator
# from h2o.frame import H2OFrame
# from pysparkling import H2OContext
# import pysparkling

In [114]:
def vector_to_array(v):
    return v.toArray().tolist()

vector_to_array_udf = udf(vector_to_array, ArrayType(FloatType()))


In [65]:
spark = SparkSession.builder.appName('belka').getOrCreate()
    # .config("spark.ext.h2o.backend.cluster.mode", "internal")\
    # .config("spark.executor.instances", "1")\
    # .config("spark.executor.memory", "2g")\
    # .config("spark.driver.memory", "2g")\
train_data = spark.read.parquet("train.parquet") # train_data.count() # 295,246,830
# h2o.init()
# hc = H2OContext.getOrCreate()

In [125]:
### Get a sample
sample_data = train_data.sample(.0001) # sample_data.count() 29509

sample_data = (
    sample_data
    .withColumn("buildingblock1_array", F.array(F.col("buildingblock1_smiles")))
    .withColumn("buildingblock2_array", F.array(F.col("buildingblock2_smiles")))
    .withColumn("buildingblock3_array", F.array(F.col("buildingblock3_smiles")))
    .withColumn("molecule_array", F.array(F.col("molecule_smiles")))
    .withColumn("encoded_sEH", F.when(F.col("protein_name") == "sEH", 1).otherwise(0))
    .withColumn("encoded_HSA", F.when(F.col("protein_name") == "HSA", 1).otherwise(0))
    .withColumn("encoded_BRD4", F.when(F.col("protein_name") == "BRD4", 1).otherwise(0))
)

### Vectorize Inputs
word2Vec1 = Word2Vec(vectorSize=100, minCount=0, inputCol="buildingblock1_array", outputCol="buildingblock1_vector")
model1 = word2Vec1.fit(sample_data)
result1 = model1.transform(sample_data)

word2Vec2 = Word2Vec(vectorSize=100, minCount=0, inputCol="buildingblock2_array", outputCol="buildingblock2_vector")
model2 = word2Vec2.fit(result1)
result2 = model2.transform(result1)

word2Vec3 = Word2Vec(vectorSize=100, minCount=0, inputCol="buildingblock3_array", outputCol="buildingblock3_vector")
model3 = word2Vec3.fit(result2)
result3 = model3.transform(result2)

word2Vec4 = Word2Vec(vectorSize=100, minCount=0, inputCol="molecule_array", outputCol="molecule_vector")
model4 = word2Vec4.fit(result3)
result4 = model4.transform(result3)

vectorized_samples = result4.select('buildingblock1_vector', 'buildingblock2_vector', 'buildingblock3_vector', 'molecule_vector', "encoded_sEH", "encoded_HSA", "encoded_BRD4", 'binds')

# df_with_array =(
#     vectorized_samples
#     .withColumn("buildingblock1_vector", vector_to_array_udf(vectorized_samples["buildingblock1_vector"]))
#     .withColumn("buildingblock2_vector", vector_to_array_udf(vectorized_samples["buildingblock2_vector"]))
#     .withColumn("buildingblock3_vector", vector_to_array_udf(vectorized_samples["buildingblock3_vector"]))
#     .withColumn("molecule_vector", vector_to_array_udf(vectorized_samples["molecule_vector"]))
# )

pandas_sample = vectorized_samples.toPandas()

### Split features and target
# pandas_X = pandas_sample.drop('binds', axis=1)
# pandas_Y = pandas_sample['binds']

### Distinct Counts
# buildingblock 1: 271
# buildingblock 2: 693
# buildingblock 3: 872
# molecule_smiles: 29,656
# 4 repeats for buildingblock triplets but they are binded to different target proteins

                                                                                

In [132]:
rename_block1 = {i:f"buildingblock1_feature_{i}" for i in range(100)}
values_df = pd.DataFrame(pandas_sample['buildingblock1_vector'].tolist(), index=pandas_sample.index)
result_df = pd.concat([pandas_sample.drop(columns=['buildingblock1_vector']), values_df], axis=1)
result_df = result_df.rename(columns=rename_block1)

rename_block2 = {i:f"buildingblock2_feature_{i}" for i in range(100)}
values_df = pd.DataFrame(result_df['buildingblock2_vector'].tolist(), index=result_df.index)
result_df = pd.concat([result_df.drop(columns=['buildingblock2_vector']), values_df], axis=1)
result_df = result_df.rename(columns=rename_block2)

rename_block3 = {i:f"buildingblock3_feature_{i}" for i in range(100)}
values_df = pd.DataFrame(result_df['buildingblock3_vector'].tolist(), index=result_df.index)
result_df = pd.concat([result_df.drop(columns=['buildingblock3_vector']), values_df], axis=1)
result_df = result_df.rename(columns=rename_block3)

rename_molecule = {i:f"molecule_feature_{i}" for i in range(100)}
values_df = pd.DataFrame(result_df['molecule_vector'].tolist(), index=result_df.index)
result_df = pd.concat([result_df.drop(columns=['molecule_vector']), values_df], axis=1)
result_df = result_df.rename(columns=rename_molecule)
result_df

Unnamed: 0,encoded_sEH,encoded_HSA,encoded_BRD4,binds,buildingblock1_feature_0,buildingblock1_feature_1,buildingblock1_feature_2,buildingblock1_feature_3,buildingblock1_feature_4,buildingblock1_feature_5,...,molecule_feature_90,molecule_feature_91,molecule_feature_92,molecule_feature_93,molecule_feature_94,molecule_feature_95,molecule_feature_96,molecule_feature_97,molecule_feature_98,molecule_feature_99
0,0,1,0,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,...,0.001769,0.001085,-0.004528,-0.004441,-0.001788,-0.001371,-0.002214,-0.001359,0.003159,0.002058
1,0,1,0,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,...,-0.003852,0.000531,-0.000092,-0.002495,0.003172,0.000950,0.002907,-0.003875,-0.004797,0.001894
2,0,1,0,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,...,-0.004736,0.004489,0.003640,-0.004358,-0.000925,0.000233,0.002575,-0.004697,0.002837,-0.001405
3,0,0,1,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,...,0.002806,-0.004771,-0.003787,-0.003892,-0.001408,0.001045,-0.000370,-0.004266,-0.002928,-0.002925
4,0,0,1,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,...,0.004610,0.003689,0.004344,0.002354,-0.003949,0.001182,-0.001326,0.002692,-0.002172,-0.002440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29543,0,1,0,0,0.002279,0.002328,-0.000433,-0.004854,0.004399,-0.000839,...,0.002543,0.004035,-0.001119,0.003560,0.004645,0.002707,0.000538,-0.000568,0.002093,-0.000608
29544,1,0,0,0,0.002279,0.002328,-0.000433,-0.004854,0.004399,-0.000839,...,-0.002761,0.001234,-0.004738,0.004724,-0.002045,0.004147,0.001958,0.002179,-0.003595,0.004718
29545,1,0,0,0,0.002279,0.002328,-0.000433,-0.004854,0.004399,-0.000839,...,-0.000340,-0.001008,-0.001829,0.004770,-0.003897,0.000213,0.000973,0.000519,-0.002310,0.004751
29546,0,0,1,0,0.002279,0.002328,-0.000433,-0.004854,0.004399,-0.000839,...,0.004844,-0.003415,0.002218,-0.003524,-0.003184,0.000521,0.002512,0.004322,-0.000791,-0.003969


In [140]:
# smote_y[smote_y['binds'] == 1] # 29371, 177
smote_y.value_counts()

binds
0    29371
1    29371
Name: count, dtype: int64

In [19]:
binds_distribution = train_data.select('binds').groupby('binds').count()
binds_distribution.show()
"""
Imbalanced labels. Most of the attempts do not bind at all.
0: 293656924
1: 1589906
"""

[Stage 22:>                                                       (0 + 14) / 28]

+-----+---------+
|binds|    count|
+-----+---------+
|    0|293656924|
|    1|  1589906|
+-----+---------+





'\nImbalanced labels. Most of the attempts do not bind at all.\n'

In [16]:
result_distribution = train_data.groupby(['protein_name', 'binds']).count()
result_distribution.show()
"""
sEH tends to be more likely to have a bind than the other two.
"""



+------------+-----+--------+
|protein_name|binds|   count|
+------------+-----+--------+
|         HSA|    0|98007200|
|         sEH|    1|  724532|
|         sEH|    0|97691078|
|        BRD4|    1|  456964|
|        BRD4|    0|97958646|
|         HSA|    1|  408410|
+------------+-----+--------+





In [133]:
train_x = result_df.drop('binds', axis=1)
# train_x = pandas_sample[['encoded_sEH', 'encoded_HSA', 'encoded_BRD4']]
train_y = result_df['binds']

# train_X = np.stack(train_X.values).reshape((train_X.shape[0], -1))
# train_y = train_y.values


smote = SMOTE()
smote_x, smote_y = smote.fit_resample(train_x, train_y)


In [134]:
smote_x

Unnamed: 0,encoded_sEH,encoded_HSA,encoded_BRD4,buildingblock1_feature_0,buildingblock1_feature_1,buildingblock1_feature_2,buildingblock1_feature_3,buildingblock1_feature_4,buildingblock1_feature_5,buildingblock1_feature_6,...,molecule_feature_90,molecule_feature_91,molecule_feature_92,molecule_feature_93,molecule_feature_94,molecule_feature_95,molecule_feature_96,molecule_feature_97,molecule_feature_98,molecule_feature_99
0,0,1,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,-0.002814,...,0.001769,0.001085,-0.004528,-0.004441,-0.001788,-0.001371,-0.002214,-0.001359,0.003159,0.002058
1,0,1,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,-0.002814,...,-0.003852,0.000531,-0.000092,-0.002495,0.003172,0.000950,0.002907,-0.003875,-0.004797,0.001894
2,0,1,0,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,-0.002814,...,-0.004736,0.004489,0.003640,-0.004358,-0.000925,0.000233,0.002575,-0.004697,0.002837,-0.001405
3,0,0,1,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,-0.002814,...,0.002806,-0.004771,-0.003787,-0.003892,-0.001408,0.001045,-0.000370,-0.004266,-0.002928,-0.002925
4,0,0,1,-0.002727,-0.000942,0.000521,-0.004563,-0.002576,0.002270,-0.002814,...,0.004610,0.003689,0.004344,0.002354,-0.003949,0.001182,-0.001326,0.002692,-0.002172,-0.002440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58737,1,0,0,-0.000037,0.004276,-0.002660,0.000089,-0.001366,-0.000111,0.000568,...,-0.001125,-0.004376,0.002188,-0.003733,0.003819,-0.002197,-0.002971,0.000558,-0.001117,-0.002731
58738,0,1,0,-0.000503,-0.001584,-0.000908,0.000382,0.002009,0.002393,-0.002321,...,-0.003277,0.002444,0.000266,0.003406,0.001919,0.001332,0.001070,0.003709,0.001200,-0.001888
58739,0,1,0,0.001867,0.001599,-0.004130,0.000021,-0.001769,0.001053,-0.000944,...,-0.004137,0.001341,0.002839,-0.000721,0.001808,0.004410,-0.000209,0.003784,0.004414,0.001611
58740,0,1,0,-0.004033,0.000812,-0.000025,-0.003410,-0.002216,-0.001047,0.003492,...,-0.003229,-0.001292,0.003629,0.000606,-0.002850,0.000578,-0.003317,0.004681,0.003407,-0.003105


In [14]:
h2o_df = hc.as_h2o_frame(sample_data, "h2o_df")

AttributeError: 'H2OContext' object has no attribute 'as_h2o_frame'

In [16]:
# # // Scala code for SMOTE implementation
# import org.apache.spark.sql.{SparkSession, DataFrame}
# import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
# import org.apache.spark.ml.linalg.Vectors
# import org.apache.spark.sql.functions._

# # def smote(df: DataFrame, targetColumn: String, k: Int = 5): DataFrame = {
# #   // Implement SMOTE logic in Scala here
# #   // Placeholder for the actual SMOTE logic

# #   df // Placeholder: return the original DataFrame for now
# # }

# # // PySpark code to call Scala SMOTE implementation
# # from pyspark.sql import SparkSession

# # spark = SparkSession.builder.appName("Scala SMOTE with PySpark DataFrame").getOrCreate()

# # # Load data into a PySpark DataFrame
# # df = spark.read.csv('path/to/your/data.csv', header=True, inferSchema=True)

# # # Call Scala SMOTE implementation
# # oversampled_df = spark._jvm.smote(df._jdf, "target_column")

# # # Convert back to PySpark DataFrame
# # oversampled_pyspark_df = DataFrame(oversampled_df, spark)


SyntaxError: invalid syntax (3923943021.py, line 2)

In [8]:
testData = spark.read.parquet("test.parquet")
pdtestData = testData.toPandas()
pdtestData

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,HSA
...,...,...,...,...,...,...
1674891,296921721,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,HSA
1674892,296921722,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,sEH
1674893,296921723,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,BRD4
1674894,296921724,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,HSA
