In [5]:
pip install torch torchvision torchaudio


SyntaxError: invalid syntax (3767206408.py, line 1)

In [31]:
import torch
import torch.nn as nn
from torch import nn, optim
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    matthews_corrcoef, roc_auc_score, confusion_matrix
)
from torch.utils.data import DataLoader, Dataset

# ---- Load Data ----
file_path = '/Users/monikapandey/Library/CloudStorage/OneDrive-LouisianaStateUniversity/ML_ANTIFUNGAL/top_30_features_from_mo.tsv'
df = pd.read_csv(file_path, sep='\t')
df = df.dropna()
print(df.Label)
#df['Label'] = df.Label.astype('int')

target_column = 'Label'

# Drop SMILES and other non-numeric columns from features
#non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()

features = df.iloc[:,1:-1]
targets = df['Label']

# ---- PyTorch Dataset ----
class CSVDataset(Dataset):
    def __init__(self, features, targets, scaler=None, fit_scaler=False):
        if scaler is None:
            self.scaler = StandardScaler()
        else:
            self.scaler = scaler
        if fit_scaler:
            self.features = self.scaler.fit_transform(features)
        else:
            self.features = self.scaler.transform(features)
        self.targets = targets.values.astype('float32')
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.features[idx], dtype=torch.float32),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float32)
        }

# ---- MLP Model ----
class MLP(nn.Module):
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        self.hidden1 = nn.Linear(n_inputs, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 1)
    def forward(self, x):
        x = torch.relu(self.hidden1(x))
        x = torch.relu(self.hidden2(x))
        x = torch.sigmoid(self.output(x))
        return x.view(-1)

def train_model(train_dl, model, criterion, optimizer, epochs=50):
    model.train()
    for epoch in range(epochs):
        for batch in train_dl:
            inputs, targets = batch['features'], batch['targets']
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

def evaluate_model(test_dl, model):
    model.eval()
    predictions, actuals, probs = [], [], []
    with torch.no_grad():
        for batch in test_dl:
            inputs, targets = batch['features'], batch['targets']
            outputs = model(inputs)
            probs.extend(outputs.cpu().numpy())
            preds = (outputs > 0.5).int().cpu().numpy()
            predictions.extend(preds)
            actuals.extend(targets.cpu().numpy())
    return np.array(predictions), np.array(actuals), np.array(probs)

# ---- Cross Validation ----
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for fold, (train_index, test_index) in enumerate(kf.split(features)):
    print(f"Fold {fold+1}/5")
    train_features, test_features = features.iloc[train_index], features.iloc[test_index]
    train_targets, test_targets = targets.iloc[train_index], targets.iloc[test_index]
    
    scaler = StandardScaler()
    train_dataset = CSVDataset(train_features, train_targets, scaler=scaler, fit_scaler=True)
    test_dataset = CSVDataset(test_features, test_targets, scaler=scaler, fit_scaler=False)
    
    train_dl = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_dl = DataLoader(test_dataset, batch_size=1024, shuffle=False)
    
    n_inputs = train_dl.dataset.features.shape[1]
    model = MLP(n_inputs)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_model(train_dl, model, criterion=criterion, optimizer=optimizer, epochs=50)
    
    preds, actuals, probs = evaluate_model(test_dl, model)
    
    acc = accuracy_score(actuals, preds)
    rec = recall_score(actuals, preds)
    prec = precision_score(actuals, preds)
    f1 = f1_score(actuals, preds)
    mcc = matthews_corrcoef(actuals, preds)
    auc = roc_auc_score(actuals, probs)
    tn, fp, fn, tp = confusion_matrix(actuals, preds).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    print(f"  Accuracy:    {acc:.4f}")
    print(f"  Recall:      {rec:.4f} (Sensitivity)")
    print(f"  Specificity: {specificity:.4f}")
    print(f"  F1-score:    {f1:.4f}")
    print(f"  MCC:         {mcc:.4f}")
    print(f"  AUC:         {auc:.4f}")
    
    results.append({
        'fold': fold+1,
        'accuracy': acc,
        'recall': rec,
        'specificity': specificity,
        'f1': f1,
        'mcc': mcc,
        'auc': auc
    })

# ---- Summary and Save ----
results_df = pd.DataFrame(results)
print("\n=== Average Cross-Validated Metrics ===")
for metric in ['accuracy', 'recall', 'specificity', 'f1', 'mcc', 'auc']:
    print(f"{metric.capitalize():<12}: {results_df[metric].mean():.4f} ± {results_df[metric].std():.4f}")

results_df.to_csv('mlp_antifungal_cv_results.csv', index=False)


0        0
1        1
2        1
3        0
4        0
        ..
18421    1
18422    1
18423    0
18424    0
18425    1
Name: Label, Length: 18407, dtype: int64
Fold 1/5
  Accuracy:    0.9856
  Recall:      0.9906 (Sensitivity)
  Specificity: 0.9809
  F1-score:    0.9853
  MCC:         0.9713
  AUC:         0.9969
Fold 2/5
  Accuracy:    0.9859
  Recall:      0.9920 (Sensitivity)
  Specificity: 0.9796
  F1-score:    0.9861
  MCC:         0.9718
  AUC:         0.9974
Fold 3/5
  Accuracy:    0.9837
  Recall:      0.9876 (Sensitivity)
  Specificity: 0.9797
  F1-score:    0.9839
  MCC:         0.9674
  AUC:         0.9970
Fold 4/5
  Accuracy:    0.9840
  Recall:      0.9853 (Sensitivity)
  Specificity: 0.9827
  F1-score:    0.9840
  MCC:         0.9679
  AUC:         0.9974
Fold 5/5
  Accuracy:    0.9880
  Recall:      0.9925 (Sensitivity)
  Specificity: 0.9835
  F1-score:    0.9882
  MCC:         0.9761
  AUC:         0.9981

=== Average Cross-Validated Metrics ===
Accuracy    : 0.9854 ±