In [2]:

import torch.nn as nn
import torch
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [3]:
#Read the data into a pandas data frame 
df = pd.read_parquet("de_train.parquet")

In [4]:
from rdkit import Chem
from rdkit.Chem import rdFMCS

In [38]:


def count_substructure_occurrences(target_smiles, query_smiles):
    try:
        # Convert SMILES to RDKit molecules
        target_mol = Chem.MolFromSmiles(target_smiles)
        query_mol = Chem.MolFromSmiles(query_smiles)

        # Check if the molecules are valid
        if target_mol is None or query_mol is None:
            print("Error: Invalid SMILES.")
            return None

        # Use SubstructMatch to find occurrences
        occurrences = target_mol.GetSubstructMatches(query_mol)

        return len(occurrences)
    except Exception as e:
        print(f"Error: {e}")
        return None

# Example usage
target_smiles = "CCOCC"
query_smiles = "c1ccccc1"
occurrence_count = count_substructure_occurrences(target_smiles, query_smiles)

if occurrence_count is not None:
    print(f"The substructure '{query_smiles}' is found {occurrence_count} times in the molecule '{target_smiles}'.")

The substructure 'c1ccccc1' is found 0 times in the molecule 'CCOCC'.


In [58]:
functional_groups  = ["O", "C=O", "N", "C(=O)N", "C(=O)O", "N=O", "S", "P([O])([O])", "S(=O)([O])([O])", "C=C", "C#C", "c1ccccc1", "F", "Cl", "Br", "I", "C" ]
functional_column_names = [(i, 'int') for i in functional_groups]
SMILES = df["SMILES"].tolist()

functional_groups_block = [[count_substructure_occurrences(i,j) for j in functional_groups ] for i in SMILES]


In [83]:
print(f"Here a few entries from the SMILES column: \n{df['SMILES'].head(10)}.\n")

Here a few entries from the SMILES column: 
0               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
1               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
3               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
4    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
5    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
6    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
7    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
8    CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...
9    CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...
Name: SMILES, dtype: object.



In [62]:
def create_dataframe(column_names, data):
    df = pd.DataFrame(data, columns=[name for name, _ in column_names])
    return df

functional_group_df = create_dataframe(functional_column_names, functional_groups_block)

sm_names = df["sm_name"].tolist
functional_group_dict= {key: value for key, value in zip(sm_names, values)}

In [52]:
#normalizes data between -1 and 1
#returns normalized data and the factors used to normalzie 
def normalize(df):
    min = df.min()
    max=df.max()
    df_normalized = (df - min) / (max-min)
    return df_normalized, min.reset_index(drop=True), max.reset_index(drop=True)


#unnormalize
def unnormalize(normalized_df, min, max):
    return min + normalized_df*(max-min)

In [63]:
############################DATA PREP###############################



####One hot incodes inputs####

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the data
cells = df["cell_type"].values.reshape(-1, 1)
hot_cells = encoder.fit_transform(cells)
cell_mapping = encoder.categories_[0]

compounds = df['sm_name'].values.reshape(-1, 1)
hot_compounds = encoder.fit_transform(compounds)
compound_mapping = encoder.categories_[0]

#Puts together inputs
hot_encoded = np.hstack((hot_cells, hot_compounds))
hot_encoded_df = pd.DataFrame(data = hot_encoded)
inputs_df = pd.concat([hot_encoded_df, functional_group_df], axis=1)

####Normalizes Outputs####

outputs = df.loc[:, 'A1BG':'ZZEF1']
outputs_norm_df, norm_min, norm_max = normalize(outputs)


####Puts Inputs and Outputs Together####
prepped_df = pd.concat([inputs_df, outputs_norm_df], axis=1)

print(type(hot_cells))


<class 'numpy.ndarray'>


In [78]:
from torch.utils.data import DataLoader, Dataset

#Break into training and validation and split inputs from outputs
val, train = train_test_split(prepped_df, train_size=.2, random_state=3)
trainIn_df = train.loc[:, 0:"C"]
trainOut_df = train.loc[:, 'A1BG':'ZZEF1']
valIn_df = val.loc[:,0:"C"]
valOut_df = val.loc[:, 'A1BG':'ZZEF1']

#Transforms data frames into tensors
trainIn_t = torch.tensor(trainIn_df.values).float()
trainOut_t = torch.tensor(trainOut_df.values).float()
valIn_t = torch.tensor(valIn_df.values).float()
valOut_t = torch.tensor(valOut_df.values).float()




In [79]:

import torch.nn.functional as F

class CustomNetwork(nn.Module):
    def __init__(self):
        super(CustomNetwork, self).__init__()
        # Define the layers
        self.layer1 = nn.Linear(169, 512)  # First hidden layer
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.3)

        self.layer2 = nn.Linear(512, 1024) # Second hidden layer
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.3)

        self.layer3 = nn.Linear(1024, 512) # Third hidden layer
        self.batch_norm3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.3)

        self.layer4 = nn.Linear(512, 256) # Fourth hidden layer
        self.batch_norm4 = nn.BatchNorm1d(256)
        self.dropout4 = nn.Dropout(0.3)

        self.output_layer = nn.Linear(256, 18211) # Output layer

    def forward(self, x):
        # Forward pass through the network
        x = F.relu(self.batch_norm1(self.layer1(x)))
        x = self.dropout1(x)

        x = F.relu(self.batch_norm2(self.layer2(x)))
        x = self.dropout2(x)

        x = F.relu(self.batch_norm3(self.layer3(x)))
        x = self.dropout3(x)

        x = F.relu(self.batch_norm4(self.layer4(x)))
        x = self.dropout4(x)

        x = self.output_layer(x)
        return x


In [80]:

import torch.optim as optim

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

def train_model(model, trainIn_t, trainOut_t, valIn_t, valOut_t, batch_size, num_epochs):
    criterion = nn.BCEWithLogitsLoss()  # Adjust according to your needs
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.3, verbose=True)

    num_samples = trainIn_t.shape[0]
    num_batches = int(np.ceil(num_samples / batch_size))

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        epoch_loss = 0.0

        # Shuffle the data at the beginning of each epoch
        permutation = torch.randperm(num_samples)

        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_samples)
            indices = permutation[start_idx:end_idx]

            batch_trainIn = trainIn_t[indices]
            batch_trainOut = trainOut_t[indices]

            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch_trainIn)

            # Compute loss
            loss = criterion(outputs, batch_trainOut)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() * len(indices)

        epoch_loss /= num_samples

        # Validation
        model.eval()
        with torch.no_grad():
            val_output_predictions = model(valIn_t)
            val_loss = criterion(val_output_predictions, valOut_t)

        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

    return model




In [82]:
model = CustomNetwork()
trained_model = train_model(model, trainIn_t, trainOut_t, valIn_t, valOut_t, batch_size=100, num_epochs=100)

Epoch 1/100, Training Loss: 0.6758, Validation Loss: 0.6881
Epoch 2/100, Training Loss: 0.6006, Validation Loss: 0.6145
Epoch 3/100, Training Loss: 0.5599, Validation Loss: 0.5566
Epoch 4/100, Training Loss: 0.5431, Validation Loss: 0.5379
Epoch 5/100, Training Loss: 0.5371, Validation Loss: 0.5324
Epoch 6/100, Training Loss: 0.5338, Validation Loss: 0.5323
Epoch 7/100, Training Loss: 0.5310, Validation Loss: 0.5307
Epoch 8/100, Training Loss: 0.5294, Validation Loss: 0.5295
Epoch 9/100, Training Loss: 0.5287, Validation Loss: 0.5290
Epoch 10/100, Training Loss: 0.5274, Validation Loss: 0.5284
Epoch 11/100, Training Loss: 0.5277, Validation Loss: 0.5281
Epoch 12/100, Training Loss: 0.5264, Validation Loss: 0.5279
Epoch 13/100, Training Loss: 0.5257, Validation Loss: 0.5278
Epoch 14/100, Training Loss: 0.5261, Validation Loss: 0.5278
Epoch 15/100, Training Loss: 0.5254, Validation Loss: 0.5282
Epoch 16/100, Training Loss: 0.5256, Validation Loss: 0.5282
Epoch 17/100, Training Loss: 0.52

In [None]:
def hot_encode(cell,compound):
    cell_vec = np.zeros(cell_mapping.size)
    cell_dict = {value: index for index, value in enumerate(cell_mapping)}

    compound_vec = np.zeros(compound_mapping.size)
    compound_dict = {value: index for index, value in enumerate(compound_mapping)}


    cell_vec[cell_dict[cell]]=1
    compound_vec[compound_dict[compound]]=1
    vector = np.concatenate((cell_vec, compound_vec), axis = 0)
    tensor = torch.from_numpy(vector)
    return tensor


In [None]:
model(hot_encode("NK cells", "Clotrimazole"))

tensor([0.1321, 0.2412, 0.4950,  ..., 0.4223, 0.4664, 0.5452],
       dtype=torch.float64, grad_fn=<SigmoidBackward0>)

In [None]:
def get_expression(cell_type, compound_name):
    tensor = model(hot_encode(cell_type,compound_name))
    np_array = tensor.detach().numpy()
    df = pd.DataFrame(np_array)
    return unnormalize(df[0],norm_min,norm_max)


df = get_expression("NK cells", "Clotrimazole")
print(df)




0        0.089716
1        0.392768
2       -0.776956
3       -0.502081
4        0.056250
           ...   
18206    0.132327
18207    0.040884
18208    0.039621
18209   -0.185311
18210   -0.264756
Length: 18211, dtype: float64


In [None]:
# Read the sample submission and test set ID map
sample_submission = pd.read_csv("sample_submission.csv")
testDf = pd.read_csv("id_map.csv")

# Initialize an empty list to collect the predicted values
predicted_values = []

# Loop through the test set to get the predicted values
for idx, row in testDf.iterrows():
    cell_type = row['cell_type']
    sm_name = row['sm_name']
    
    # Call your get_expression method here
    expression_values = get_expression(cell_type, sm_name)
    
    # Append the values to the list
    predicted_values.append(expression_values)

# Convert the list of predicted values to a numpy array
predicted_values_array = np.array(predicted_values)

# Replace the values in the sample submission DataFrame
sample_submission.iloc[:, 1:] = predicted_values_array

# Save the DataFrame to a new CSV file
sample_submission.to_csv("my_submission.csv", index=False)



FileNotFoundError: [Errno 2] No such file or directory: 'sample_submission.csv'

In [None]:
row_one = copy_df.iloc[0][5:]
row_one_output = get_expression(copy_df.iloc[0][0], copy_df.iloc[0][1])

for i in range( len(row_one)):
    print(f"{row_one[i]}     {row_one_output[i]}")