# Autoencoder

In [95]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


### Prepare the dataset

In [96]:
# Load data from ./data directory
data = pd.read_csv("./data/test_bioactivity_data_raw.csv")

Convert text data to floats

In [97]:
# Select only string columns for one-hot encoding
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load the data from CSV

# Drop any rows with missing values
data = data.dropna()

# Separate features and target variable
X = data.drop(columns=['standard_value'])  # Features
y = data['standard_value']  # Target

# Convert categorical variables to one-hot encoding
categorical_cols = ['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class']
numerical_cols = list(set(X.columns) - set(categorical_cols))

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

X_processed = preprocessor.fit_transform(X)



In [98]:
X_processed.shape

(3463, 5197)

In [99]:
input_dim=X_processed[0].shape[1]
input_dim

5197

In [100]:
class Autoencoder(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        
        self.encoder=nn.Sequential(
            nn.Linear(input_size, int(input_size/2)),
            nn.ReLU(),
            nn.Linear(int(input_size/2), int(input_size/4)),
        )
        
        self.decoder=nn.Sequential(
            nn.Linear(int(input_size/4), int(input_size/2)),
            nn.ReLU(),
            nn.Linear(int(input_size/2), input_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded=self.encoder(x)
        decoded=self.decoder(encoded)
        return decoded
    

In [101]:
train, test = train_test_split(X_processed, test_size=0.2)

In [102]:
model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

In [103]:
for epoch in range(X_processed.shape[0]):
    input = X_processed[epoch].toTensor()
    # Forward pass
    output = model(input)
    loss = criterion(output, input)
    
    # Backward pass and optimization step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


AttributeError: 'csr_matrix' object has no attribute 'toTensor'