<a href="https://colab.research.google.com/github/KuzonFyre/Single-Cell-Perturbations/blob/main/Copy_of_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import torch.nn as nn
import torch
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read and written by
# later cells live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read the training data into a pandas DataFrame. Later cells use its
# "cell_type" and "sm_name" columns as model inputs and the columns from
# 'A1BG' through 'ZZEF1' as regression targets.
df = pd.read_parquet("/content/drive/MyDrive/de_train.parquet")

In [3]:
# Min-max scales data into the range [0, 1]
# Returns the normalized data and the factors used to normalize
def normalize(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Each column is transformed as ``(x - column_min) / (column_max - column_min)``.
    A constant column (max == min) has nothing to scale, so it is mapped to 0
    instead of being divided by zero and turned into NaN; ``unnormalize`` still
    restores its original value because the column's span is zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data to scale column by column.

    Returns
    -------
    tuple
        ``(df_normalized, col_min, col_max)``. ``df_normalized`` keeps the
        index and columns of ``df``. ``col_min`` and ``col_max`` are the
        per-column minima and maxima with their index reset to positions
        0..n_columns-1 (the column labels are dropped).
    """
    col_min = df.min()
    col_max = df.max()
    span = col_max - col_min
    # Swap a zero span for 1 so constant columns become 0 rather than NaN.
    safe_span = span.where(span != 0, 1)
    df_normalized = (df - col_min) / safe_span
    return df_normalized, col_min.reset_index(drop=True), col_max.reset_index(drop=True)


# Undo min-max scaling
def unnormalize(normalized_df, min, max):
    """Map min-max scaled values back onto their original scale.

    Computes ``min + normalized_df * (max - min)``, the inverse of the
    ``(x - min) / (max - min)`` scaling.

    Parameters
    ----------
    normalized_df : scaled values (scalar, Series or DataFrame).
    min, max : the minima and maxima that were used for scaling.

    Returns
    -------
    The values rescaled to the original range.
    """
    value_range = max - min
    rescaled = normalized_df * value_range
    return min + rescaled

In [4]:
############################DATA PREP###############################



####One-hot encodes inputs####

# Initialize the OneHotEncoder (dense output so the arrays can be stacked with NumPy)
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the cell types. The category order is captured right away
# because the same encoder is re-fitted on the compounds below, which replaces
# encoder.categories_.
cells = df["cell_type"].values.reshape(-1, 1)
hot_cells = encoder.fit_transform(cells)
cell_mapping = encoder.categories_[0]

compounds = df['sm_name'].values.reshape(-1, 1)
hot_compounds = encoder.fit_transform(compounds)
compound_mapping = encoder.categories_[0]

# Puts together inputs: cell-type one-hot columns first, then compound columns
inputs = np.hstack((hot_cells, hot_compounds))
# Reuse df's index: the column-wise concat below aligns rows on index labels,
# so a fresh 0..n-1 index would misalign (and NaN-pad) the rows whenever df
# does not itself carry a default RangeIndex.
inputs_df = pd.DataFrame(data=inputs, index=df.index)


####Normalizes Outputs####

outputs = df.loc[:, 'A1BG':'ZZEF1']
outputs_norm_df, norm_min, norm_max = normalize(outputs)


####Puts Inputs and Outputs Together####
prepped_df = pd.concat([inputs_df, outputs_norm_df], axis=1)



In [5]:
# X, y = train_test_split(prepped_df,train_size=.2, random_state=3)

In [6]:
#Break into training and validation and split inputs from outputs.
# train_test_split returns the partition sized by train_size first, so the 20%
# partition is unpacked as `val` and the remaining 80% as `train`.
val, train = train_test_split(prepped_df, train_size=.2, random_state=3)

# Select columns by their actual labels instead of a hard-coded 0:151 range, so
# the split keeps working if the number of cell types or compounds changes.
input_cols = inputs_df.columns
output_cols = outputs_norm_df.columns
trainIn_df = train.loc[:, input_cols]
trainOut_df = train.loc[:, output_cols]
valIn_df = val.loc[:, input_cols]
valOut_df = val.loc[:, output_cols]

#Transforms data frames into tensors
trainIn_t = torch.tensor(trainIn_df.values)
trainOut_t = torch.tensor(trainOut_df.values)
valIn_t = torch.tensor(valIn_df.values)
valOut_t = torch.tensor(valOut_df.values)




In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset

class MLPRegressor(nn.Module):
    """Fully connected regression network.

    Stacks ``nn.Linear`` layers of the sizes
    ``input_size -> hidden_sizes... -> output_size`` with a ``nn.ReLU`` after
    every layer except the last, so the output is an unbounded linear value.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_sizes : iterable of int
        Width of each hidden layer, in order. Any iterable works (list, tuple,
        ...); an empty one yields a single linear layer.
    output_size : int
        Number of regression targets.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        # Unpacking (rather than list concatenation) accepts tuples and other
        # iterables for hidden_sizes, not only lists.
        all_sizes = [input_size, *hidden_sizes, output_size]
        last_layer = len(all_sizes) - 2

        layers = []
        for i, (n_in, n_out) in enumerate(zip(all_sizes[:-1], all_sizes[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if i < last_layer:  # No activation on the last layer
                layers.append(nn.ReLU())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the predictions."""
        return self.model(x)


# Seed torch so weight initialization and DataLoader shuffling are repeatable
# between runs (same value as the random_state used for the data split).
torch.manual_seed(3)

# Define dataset and dataloader for batch processing
train_dataset = TensorDataset(trainIn_t, trainOut_t)
train_loader = DataLoader(dataset=train_dataset, batch_size=20, shuffle=True)

# Define the model. Layer widths are read from the training tensors so they
# always match the prepared data instead of relying on hard-coded counts.
input_size = trainIn_t.shape[1]
hidden_sizes = [1024, 512]  # You can adjust these sizes and add more layers if needed
output_size = trainOut_t.shape[1]
# .double() makes the weights float64; presumably this matches the dtype of
# the tensors built from the DataFrame values.
model = MLPRegressor(input_size, hidden_sizes, output_size).double()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss and optimizer
loss_fn = nn.MSELoss()  # Use MSELoss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(num_epochs):
    """Train the module-level ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch is moved to ``device``, scored with ``loss_fn`` and used for
    one ``optimizer`` step. After each epoch the mean of that epoch's batch
    losses is printed. Returns nothing; the model is updated in place.
    """
    model.train()
    for epoch in range(num_epochs):
        epoch_losses = []
        for X_batch, y_batch in train_loader:
            # Move to device
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Forward pass
            batch_loss = loss_fn(model(X_batch), y_batch)

            # Backward and optimize
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            epoch_losses.append(batch_loss.item())

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(epoch_losses):.4f}")

# Call the train function: run 10 epochs over the training loader
train(10)


Epoch [1/10], Loss: 0.0427
Epoch [2/10], Loss: 0.0058
Epoch [3/10], Loss: 0.0048
Epoch [4/10], Loss: 0.0047
Epoch [5/10], Loss: 0.0044
Epoch [6/10], Loss: 0.0043
Epoch [7/10], Loss: 0.0045
Epoch [8/10], Loss: 0.0039
Epoch [9/10], Loss: 0.0034
Epoch [10/10], Loss: 0.0030


In [8]:

# #Train model (epochs, mini batch size)
# train(20,20)


In [9]:
# # compute accuracy on training set
# def deviationtrain():
#     with torch.no_grad():
#         y_pred = model1(trainIn_t)

#     accuracy = (abs((y_pred-trainOut_t)).sum()/trainOut_t.sum())
#     RSS = ((y_pred-trainOut_t)**2).float().sum()
#     return (f"Deviation: {100*accuracy}%    RSS: {RSS}")

# deviationtrain()

NameError: ignored

In [11]:
def hot_encode(cell,compound):
    """Build the model input vector for one (cell type, compound) pair.

    The result is a 1-D torch tensor: the one-hot encoding of ``cell`` over
    ``cell_mapping`` followed by the one-hot encoding of ``compound`` over
    ``compound_mapping``.

    Raises
    ------
    KeyError
        If ``cell`` or ``compound`` is not present in its mapping.
    """
    # Name -> position lookups for both category lists
    cell_positions = {name: pos for pos, name in enumerate(cell_mapping)}
    compound_positions = {name: pos for pos, name in enumerate(compound_mapping)}

    # One buffer for both halves; the compound half starts after the cell slots
    n_cells = cell_mapping.size
    encoded = np.zeros(n_cells + compound_mapping.size)
    encoded[cell_positions[cell]] = 1
    encoded[n_cells + compound_positions[compound]] = 1
    return torch.from_numpy(encoded)


In [12]:
# Smoke-test one prediction. The input is moved to the model's device first;
# a CPU tensor fed to a model that was moved to CUDA raises a device-mismatch error.
model(hot_encode("NK cells", "Clotrimazole").to(device))

tensor([0.1524, 0.2396, 0.5161,  ..., 0.4467, 0.4590, 0.5669],
       dtype=torch.float64, grad_fn=<ViewBackward0>)

In [13]:
def get_expression(cell_type, compound_name):
    """Predict the un-normalized expression vector for one cell type / compound pair.

    The pair is one-hot encoded with ``hot_encode``, passed through the
    module-level ``model``, and the prediction is mapped back to the original
    scale with ``unnormalize`` using ``norm_min`` and ``norm_max``.

    Parameters
    ----------
    cell_type : str
        Cell type name understood by ``hot_encode``.
    compound_name : str
        Compound name understood by ``hot_encode``.

    Returns
    -------
    pandas.Series
        One value per model output, indexed by position (0..n_outputs-1).
    """
    # Match the input to wherever (and in whatever precision) the model lives,
    # so the call works whether the model is on CPU or was moved to the GPU.
    reference = next(model.parameters())
    features = hot_encode(cell_type, compound_name).to(
        device=reference.device, dtype=reference.dtype
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = model(features)

    # .cpu() is required before .numpy() when the prediction sits on the GPU.
    normalized = pd.Series(prediction.cpu().numpy())
    return unnormalize(normalized, norm_min, norm_max)


# Keep the prediction under its own name: assigning it to `df` would overwrite
# the raw training frame, so the data-prep cells could not be re-run afterwards.
nk_clotrimazole_expression = get_expression("NK cells", "Clotrimazole")
print(nk_clotrimazole_expression)




0        0.603001
1        0.361692
2        0.406551
3        0.128234
4        0.369398
           ...   
18206    0.743457
18207    0.128856
18208    0.241062
18209   -0.296729
18210   -0.083388
Length: 18211, dtype: float64


In [14]:
# Load the submission template and the (cell_type, sm_name) pairs to predict
sample_submission = pd.read_csv("/content/drive/MyDrive/sample_submission.csv")
testDf = pd.read_csv("/content/drive/MyDrive/id_map.csv")

# One predicted expression vector per test row, in id_map row order
predicted_values = [
    get_expression(cell_type, sm_name)
    for cell_type, sm_name in zip(testDf['cell_type'], testDf['sm_name'])
]

# Stack the per-row predictions into a 2-D array (rows = test pairs)
predicted_values_array = np.array(predicted_values)

# Overwrite every column after the first in the template with the predictions
sample_submission.iloc[:, 1:] = predicted_values_array

# Write the filled-in template out as the submission file
sample_submission.to_csv("/content/drive/MyDrive/my_submission.csv", index=False)

