In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
from transformers import EsmModel, EsmTokenizer
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import StandardScaler

In [None]:
# !pip install scikit-learn
# !pip install torch
# !pip install transformers

In [8]:
loaded_data = np.load('processed_dataset.npz')
features = loaded_data['features']
targets = loaded_data['targets']

In [9]:
print(f"Number of features: {features.shape[1]}")

Number of features: 576


In [10]:
# Apply log transformation to targets
targets = np.log1p(targets)  # log(kiba_score + 1)

In [12]:
'''
# Feature Selection using SelectKBest
k_best = 20  # Number of top features to select
selector = SelectKBest(score_func=f_regression, k=k_best)
selected_features = selector.fit_transform(features, targets)
'''

'\n# Feature Selection using SelectKBest\nk_best = 20  # Number of top features to select\nselector = SelectKBest(score_func=f_regression, k=k_best)\nselected_features = selector.fit_transform(features, targets)\n'

In [13]:
# Normalize selected features
scaler_features = StandardScaler()
scaled_features = scaler_features.fit_transform(features)

In [15]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(scaled_features, targets, test_size=0.2, random_state=42)

In [16]:
# Convert data to PyTorch tensors
class KibaDataset(Dataset):
    def __init__(self, features, target):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32)

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]

In [17]:
train_dataset = KibaDataset(X_train, y_train)
test_dataset = KibaDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [18]:
# Define the Model
class KibaModel(nn.Module):
    def __init__(self, input_size):
        super(KibaModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

In [19]:
# Custom weight initialization
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')  # He initialization for ReLU activation
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [20]:
# Initialize model
input_size = X_train.shape[1]
model = KibaModel(input_size)

# Apply custom weight initialization
model.apply(initialize_weights)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

cuda


KibaModel(
  (model): Sequential(
    (0): Linear(in_features=576, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [21]:
# Step 4: Define Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [22]:
# Gradient clipping function
def clip_gradient(optimizer, grad_clip):
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)

In [23]:
# Step 5: Train the Model
def train_model(model, loader, criterion, optimizer, device, grad_clip=None, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping
            if grad_clip is not None:
                clip_gradient(optimizer, grad_clip)
                
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(loader):.4f}")

In [24]:
# Step 6: Evaluate the Model
def evaluate_model(model, loader, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    print(f"Validation Loss: {total_loss / len(loader):.4f}")


In [25]:
# Train and Evaluate
train_model(model, train_loader, criterion, optimizer, device, grad_clip=None, epochs=20)
evaluate_model(model, test_loader, device)

Epoch 1/20, Loss: 9.3480
Epoch 2/20, Loss: 9.1621
Epoch 3/20, Loss: 9.0649
Epoch 4/20, Loss: 9.0151
Epoch 5/20, Loss: 8.9873
Epoch 6/20, Loss: 8.9727
Epoch 7/20, Loss: 8.9540
Epoch 8/20, Loss: 8.9421
Epoch 9/20, Loss: 8.9339
Epoch 10/20, Loss: 8.9202
Epoch 11/20, Loss: 8.9115
Epoch 12/20, Loss: 8.9054
Epoch 13/20, Loss: 8.8964
Epoch 14/20, Loss: 8.8909
Epoch 15/20, Loss: 8.8820
Epoch 16/20, Loss: 8.8775
Epoch 17/20, Loss: 8.8723
Epoch 18/20, Loss: 8.8643
Epoch 19/20, Loss: 8.8582
Epoch 20/20, Loss: 8.8522
Validation Loss: 8.8390


In [26]:
# Save the model
torch.save(model.state_dict(), 'regression_model.pth')