In [122]:

import torch.nn as nn
import torch
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [123]:
#Read the data into a pandas data frame 
df = pd.read_parquet("de_train.parquet")

In [124]:
from rdkit import Chem
from rdkit.Chem import rdFMCS

In [125]:


def count_substructure_occurrences(target_smiles, query_smiles):
    try:
        # Convert SMILES to RDKit molecules
        target_mol = Chem.MolFromSmiles(target_smiles)
        query_mol = Chem.MolFromSmiles(query_smiles)

        # Check if the molecules are valid
        if target_mol is None or query_mol is None:
            print("Error: Invalid SMILES.")
            return None

        # Use SubstructMatch to find occurrences
        occurrences = target_mol.GetSubstructMatches(query_mol)

        return len(occurrences)
    except Exception as e:
        print(f"Error: {e}")
        return None

# Example usage
target_smiles = "CCOCC"
query_smiles = "c1ccccc1"
occurrence_count = count_substructure_occurrences(target_smiles, query_smiles)

if occurrence_count is not None:
    print(f"The substructure '{query_smiles}' is found {occurrence_count} times in the molecule '{target_smiles}'.")

The substructure 'c1ccccc1' is found 0 times in the molecule 'CCOCC'.


In [126]:
functional_groups  = ["O", "C=O", "N", "C(=O)N", "C(=O)O", "N=O", "S", "P([O])([O])", "S(=O)([O])([O])", "C=C", "C#C", "c1ccccc1", "F", "Cl", "Br", "I", "C" ]
functional_column_names = [(i, 'int') for i in functional_groups]
SMILES = df["SMILES"].tolist()

functional_groups_block = [[count_substructure_occurrences(i,j) for j in functional_groups ] for i in SMILES]


In [127]:
print(f"Here a few entries from the SMILES column: \n{df['SMILES'].head(10)}.\n")
print(functional_groups_block[1])

Here a few entries from the SMILES column: 
0               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
1               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
3               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
4    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
5    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
6    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
7    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
8    CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...
9    CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...
Name: SMILES, dtype: object.

[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 22]


In [128]:
def create_dataframe(column_names, data):
    df = pd.DataFrame(data, columns=[name for name, _ in column_names])
    return df

functional_group_df = create_dataframe(functional_column_names, functional_groups_block)


In [129]:
#normalizes data between -1 and 1
#returns normalized data and the factors used to normalzie 
def normalize(df):
    min = df.min()
    max=df.max()
    df_normalized = (df - min) / (max-min)
    return df_normalized, min.reset_index(drop=True), max.reset_index(drop=True)


#unnormalize
def unnormalize(normalized_df, min, max):
    return min + normalized_df*(max-min)

In [130]:
############################DATA PREP###############################



####One hot incodes inputs####

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the data
cells = df["cell_type"].values.reshape(-1, 1)
hot_cells = encoder.fit_transform(cells)
cell_mapping = encoder.categories_[0]

compounds = df['sm_name'].values.reshape(-1, 1)
hot_compounds = encoder.fit_transform(compounds)
compound_mapping = encoder.categories_[0]

#Puts together inputs
hot_encoded = np.hstack((hot_cells, hot_compounds))
hot_encoded_df = pd.DataFrame(data = hot_encoded)
inputs_df = pd.concat([hot_encoded_df, functional_group_df], axis=1)

####Normalizes Outputs####

outputs = df.loc[:, 'A1BG':'ZZEF1']
outputs_norm_df, norm_min, norm_max = normalize(outputs)


####Puts Inputs and Outputs Together####
prepped_df = pd.concat([inputs_df, outputs_norm_df], axis=1)

print(type(hot_cells))


<class 'numpy.ndarray'>


In [131]:
from torch.utils.data import DataLoader, Dataset

#Break into training and validation and split inputs from outputs
val, train = train_test_split(prepped_df, train_size=.2, random_state=3)
trainIn_df = train.loc[:, 0:"C"]
trainOut_df = train.loc[:, 'A1BG':'ZZEF1']
valIn_df = val.loc[:,0:"C"]
valOut_df = val.loc[:, 'A1BG':'ZZEF1']

#Transforms data frames into tensors
trainIn_t = torch.tensor(trainIn_df.values).float()
trainOut_t = torch.tensor(trainOut_df.values).float()
valIn_t = torch.tensor(valIn_df.values).float()
valOut_t = torch.tensor(valOut_df.values).float()




In [132]:

import torch.nn.functional as F

class CustomNetwork(nn.Module):
    def __init__(self):
        super(CustomNetwork, self).__init__()
        # Define the layers
        self.layer1 = nn.Linear(169, 512)  # First hidden layer
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.3)

        self.layer2 = nn.Linear(512, 1024) # Second hidden layer
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.3)

        self.layer3 = nn.Linear(1024, 512) # Third hidden layer
        self.batch_norm3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.3)

        self.layer4 = nn.Linear(512, 256) # Fourth hidden layer
        self.batch_norm4 = nn.BatchNorm1d(256)
        self.dropout4 = nn.Dropout(0.3)

        self.output_layer = nn.Linear(256, 18211) # Output layer

    def forward(self, x):
        # Forward pass through the network
        x = F.relu(self.batch_norm1(self.layer1(x)))
        x = self.dropout1(x)

        x = F.relu(self.batch_norm2(self.layer2(x)))
        x = self.dropout2(x)

        x = F.relu(self.batch_norm3(self.layer3(x)))
        x = self.dropout3(x)

        x = F.relu(self.batch_norm4(self.layer4(x)))
        x = self.dropout4(x)

        x = self.output_layer(x)
        return x


In [133]:

import torch.optim as optim

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

def train_model(model, trainIn_t, trainOut_t, valIn_t, valOut_t, batch_size, num_epochs):
    criterion = nn.BCEWithLogitsLoss()  # Adjust according to your needs
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.3, verbose=True)

    num_samples = trainIn_t.shape[0]
    num_batches = int(np.ceil(num_samples / batch_size))

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        epoch_loss = 0.0

        # Shuffle the data at the beginning of each epoch
        permutation = torch.randperm(num_samples)

        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_samples)
            indices = permutation[start_idx:end_idx]

            batch_trainIn = trainIn_t[indices]
            batch_trainOut = trainOut_t[indices]

            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch_trainIn)

            # Compute loss
            loss = criterion(outputs, batch_trainOut)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() * len(indices)

        epoch_loss /= num_samples

        # Validation
        model.eval()
        with torch.no_grad():
            val_output_predictions = model(valIn_t)
            val_loss = criterion(val_output_predictions, valOut_t)

        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

    return model




In [134]:
model = CustomNetwork()
trained_model = train_model(model, trainIn_t, trainOut_t, valIn_t, valOut_t, batch_size=100, num_epochs=100)

Epoch 1/100, Training Loss: 0.6765, Validation Loss: 0.6876
Epoch 2/100, Training Loss: 0.5996, Validation Loss: 0.6132
Epoch 3/100, Training Loss: 0.5581, Validation Loss: 0.5614
Epoch 4/100, Training Loss: 0.5421, Validation Loss: 0.5390
Epoch 5/100, Training Loss: 0.5363, Validation Loss: 0.5328
Epoch 6/100, Training Loss: 0.5335, Validation Loss: 0.5313
Epoch 7/100, Training Loss: 0.5310, Validation Loss: 0.5297
Epoch 8/100, Training Loss: 0.5286, Validation Loss: 0.5293
Epoch 9/100, Training Loss: 0.5279, Validation Loss: 0.5289
Epoch 10/100, Training Loss: 0.5277, Validation Loss: 0.5286
Epoch 11/100, Training Loss: 0.5271, Validation Loss: 0.5286
Epoch 12/100, Training Loss: 0.5265, Validation Loss: 0.5280
Epoch 13/100, Training Loss: 0.5257, Validation Loss: 0.5284
Epoch 14/100, Training Loss: 0.5256, Validation Loss: 0.5285
Epoch 15/100, Training Loss: 0.5256, Validation Loss: 0.5284
Epoch 16/100, Training Loss: 0.5247, Validation Loss: 0.5286
Epoch 17/100, Training Loss: 0.52

In [135]:
def hot_encode(cell,compound):
    cell_vec = np.zeros(cell_mapping.size)
    cell_dict = {value: index for index, value in enumerate(cell_mapping)}

    compound_vec = np.zeros(compound_mapping.size)
    compound_dict = {value: index for index, value in enumerate(compound_mapping)}

    sm_names = df["sm_name"].tolist()
    functional_group_dict= dict(zip(sm_names, functional_groups_block))
    

    cell_vec[cell_dict[cell]]=1
    compound_vec[compound_dict[compound]]=1
    functional_group_vec = np.array(functional_group_dict[compound])
    vector = np.concatenate((cell_vec, compound_vec, functional_group_vec), axis = 0)
    tensor = torch.from_numpy(vector)
    return tensor.float()


In [136]:
a = trainIn_t[1].reshape(1,-1)
b = hot_encode("NK cells", "Clotrimazole").reshape(1,-1)
print(a.dtype)
print(b.dtype)
# print(type(a[0,1]))
# print(type(b[0,1]))
# print(a.shape)
# print(b.shape)
# print(a)
# print(b)


torch.float32
torch.float32


In [137]:
trained_model.eval()
trained_model(b)

tensor([[-1.9072, -1.2558,  0.0329,  ..., -0.2862, -0.1279,  0.2803]],
       grad_fn=<AddmmBackward0>)

In [141]:
print(norm_min)
print(norm_max)

0        -3.258689
1        -4.067518
2       -28.552191
3       -17.801989
4        -6.378904
           ...    
18206    -2.617712
18207    -8.310726
18208    -3.456731
18209    -7.189487
18210    -4.819105
Length: 18211, dtype: float64
0        22.085428
1        14.421991
2        27.557166
3        27.603715
4        35.278040
           ...    
18206    23.089832
18207    10.844942
18208     4.822008
18209     7.826692
18210     3.534737
Length: 18211, dtype: float64


In [144]:
def get_expression(cell_type, compound_name):
    tensor = model(hot_encode(cell_type,compound_name).reshape(1,-1))
    np_array = tensor.detach().numpy()
    df = pd.DataFrame(np_array)
    return unnormalize(df,norm_min,norm_max)


test = get_expression("NK cells", "Clotrimazole")
print(type(test))
print(test)




<class 'pandas.core.frame.DataFrame'>
       0          1          2          3          4          5         6       
0 -51.594232 -27.286203 -26.708463 -38.290813 -76.910906 -74.564577 -7.274192  \

       7          8         9      ...      18201      18202     18203   
0 -33.653929 -20.709475 -4.425873  ... -18.984896 -31.384534  1.034231  \

       18204      18205      18206      18207     18208    18209     18210  
0 -31.431096 -42.970734 -57.971333 -12.824981 -5.826235 -9.11075 -2.477523  

[1 rows x 18211 columns]


In [146]:
# Read the sample submission and test set ID map
sample_submission = pd.read_csv("sample_submission.csv")
testDf = pd.read_csv("id_map.csv")

# Initialize an empty list to collect the predicted values
predicted_values = []

# Loop through the test set to get the predicted values
for idx, row in testDf.iterrows():
    cell_type = row['cell_type']
    sm_name = row['sm_name']
    
    # Call your get_expression method here
    expression_values = get_expression(cell_type, sm_name)
    
    # Append the values to the list
    predicted_values.append(expression_values)

# Convert the list of predicted values to a numpy array
predicted_values_array = np.array(predicted_values)

# Replace the values in the sample submission DataFrame
sample_submission.iloc[:, 1:] = predicted_values_array

# Save the DataFrame to a new CSV file
sample_submission.to_csv("my_submission.csv", index=False)



ValueError: Must have equal len keys and value when setting with an iterable