In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-mutational-learning/LY16_test_data.csv
/kaggle/input/ml-mutational-learning/media-4.xlsx
/kaggle/input/ml-mutational-learning/LY16_train_data.csv
/kaggle/input/ml-mutational-learning/ACE2_test_data.csv
/kaggle/input/ml-mutational-learning/media-1.xlsx
/kaggle/input/ml-mutational-learning/REGN87_test_data.csv
/kaggle/input/ml-mutational-learning/LY16_unseen.tsv
/kaggle/input/ml-mutational-learning/P0DTC2.fasta.txt
/kaggle/input/ml-mutational-learning/mmc2.csv
/kaggle/input/ml-mutational-learning/REGN87_unseen.tsv
/kaggle/input/ml-mutational-learning/ACE2_train_data.csv
/kaggle/input/ml-mutational-learning/REGN87_train_data.csv
/kaggle/input/ml-mutational-learning/encoded_seqs_train.npy
/kaggle/input/ml-mutational-learning/REGN33_train_data.csv
/kaggle/input/ml-mutational-learning/encoded_seqs_unseen.npy
/kaggle/input/ml-mutational-learning/REGN33_test_data.csv
/kaggle/input/ml-mutational-learning/media-3.xlsx
/kaggle/input/ml-mutational-learning/encoded_seqs_test.npy
/k

In [3]:
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [7]:
train = pd.read_csv("/kaggle/input/ml-mutational-learning/REGN33_train_data.csv")
test = pd.read_csv("/kaggle/input/ml-mutational-learning/REGN33_test_data.csv")
unseen = pd.read_csv("/kaggle/input/ml-mutational-learning/REGN33_unseen.tsv", sep = '\t')

In [8]:
low_test = test[test.Distance <= 5]
high_test = test[test.Distance > 5]

In [13]:
list(train.columns)

['Unnamed: 0',
 'junction_aa',
 'v_call',
 'consensus_count',
 'j_call',
 'clonal_frequency',
 'Label',
 'Distance',
 'Antibody']

In [14]:
def process_df(df):
    df.columns = ["unkn", "seq", "v_call", "consensus_count", 'j_call', 'clonal_frequency', "Label", "Distance", "ab"]
    # Function to encode sequences
    def encode_seq(sequence):
      alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
      char_to_int = dict((c, i) for i, c in enumerate(alphabet))
      integer_encoded = [char_to_int[char] for char in sequence]
      onehot_encoded = list()
      for value in integer_encoded:
          letter = [0 for _ in range(len(alphabet))]
          letter[value] = 1
          onehot_encoded.append(letter)
      return np.array(onehot_encoded)
    df['Encoded_Sequences'] = df.seq.apply(lambda x: encode_seq(x))
    features = []
    for arr in list(df.Encoded_Sequences):
        features.append(arr)
    features = np.array(features)
    labels = np.array(list(df.Label))
    features = torch.from_numpy(features)
    labels = torch.from_numpy(labels)
    features = features.float()
    return features, labels

In [15]:
X_train, y_train = process_df(train)

In [16]:
X_test, y_test = process_df(test)
X_test_lw, y_test_lw = process_df(low_test)
X_test_hi, y_test_hi = process_df(high_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
SEED = 0xDEAD
random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

In [18]:
class My_Data(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        sample = {'features': self.features[idx], 'labels': self.labels[idx]}
        return sample

In [19]:
train_dataset = My_Data(X_train, y_train)
test_dataset = My_Data(X_test, y_test)
test_lw = My_Data(X_test_lw, y_test_lw)
test_hi = My_Data(X_test_hi, y_test_hi)
trainloader = DataLoader(train_dataset, batch_size=150,
                        shuffle=True, num_workers=2)

testloader = DataLoader(test_dataset, batch_size=150,
                        shuffle=False, num_workers=2)
testloader_lw = DataLoader(test_lw, batch_size=150,
                        shuffle=False, num_workers=2)
testloader_hw = DataLoader(test_hi, batch_size=150,
                        shuffle=False, num_workers=2)

In [20]:
class CNNModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=1):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(embed_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        prediction = self.cl(x)
        return prediction

In [21]:
from tqdm.notebook import tqdm, trange

def training(model, criterion, optimizer, num_epochs, trainloader, testloader):
    for e in trange(num_epochs, leave=False):
        model.train()
        num_iter = 0
        train_loss = 0
        for batch in trainloader:
            optimizer.zero_grad()
            input_embeds = batch["features"].to(device)
            labels = batch["labels"].to(device)
            labels = labels.unsqueeze(1)
            labels = labels.float()
            prediction = model(input_embeds)
            loss = criterion(prediction, labels)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            num_iter += 1
        print(f"Train_Loss: {train_loss / num_iter}")
        valid_loss = 0
        num_iter = 0
        model.eval()
        with torch.no_grad():
            for batch in testloader:
                input_embeds = batch["features"].to(device)
                labels = batch["labels"].to(device)
                labels = labels.unsqueeze(1)
                labels = labels.float()
                prediction = model(input_embeds)
                loss = criterion(prediction, labels)
                valid_loss += loss.item()
                num_iter += 1
        print(f"Valid Loss: {valid_loss / num_iter}")

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CNNModel(embed_size = 20, hidden_size = 80).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 14

In [24]:
training(model, criterion, optimizer, num_epochs, trainloader, testloader)

  0%|          | 0/14 [00:00<?, ?it/s]

Train_Loss: 0.259717046192608
Valid Loss: 0.22581135117345386
Train_Loss: 0.22728935652354412
Valid Loss: 0.22027537922064463
Train_Loss: 0.21837722591851416
Valid Loss: 0.21734833824965688
Train_Loss: 0.2121006790966613
Valid Loss: 0.21554364992512598
Train_Loss: 0.20700154293727993
Valid Loss: 0.21099607431226305
Train_Loss: 0.20199075041157646
Valid Loss: 0.20594139761394925
Train_Loss: 0.19798305479256886
Valid Loss: 0.2025417691303624
Train_Loss: 0.19328388931262225
Valid Loss: 0.20504184067249298
Train_Loss: 0.19018151062979663
Valid Loss: 0.20297688725921842
Train_Loss: 0.1860239866034051
Valid Loss: 0.20501245102948612
Train_Loss: 0.1829981916303052
Valid Loss: 0.20225760522815917
Train_Loss: 0.17915994742713368
Valid Loss: 0.19993569345937834
Train_Loss: 0.1767178886103214
Valid Loss: 0.2106376941005389
Train_Loss: 0.1734151418536827
Valid Loss: 0.20385437251793015


In [25]:
def evaluate_model(loader):
    m = nn.Sigmoid()
    model.eval()
    ground_truth = []
    all_prob = []
    all_predict = []
    with torch.no_grad():
        for batch in loader:
            input_embeds = batch["features"].to(device)
            labels = batch["labels"].to(device)
            prediction = model(input_embeds)
            prediction = prediction.squeeze(1)
            pred = torch.where(prediction > 0.5, 1, 0)
            all_predict.append(pred.cpu().numpy())
            prob = m(prediction)
            all_prob.append(prob.cpu().numpy())
            ground_truth.append(labels.cpu().numpy())
    all_prob = np.concatenate(all_prob)
    ground_truth = np.concatenate(ground_truth)
    all_predict = np.concatenate(all_predict)
    return all_prob, all_predict , ground_truth

In [26]:
def print_metrics(**kwargs):
    return pd.DataFrame(kwargs, index = [0])

In [27]:
all_prob, all_predict, ground_truth = evaluate_model(testloader_lw)
recall = recall_score(ground_truth, all_predict)
acc = accuracy_score(ground_truth, all_predict)
prec = precision_score(ground_truth, all_predict)
auc = roc_auc_score(ground_truth, all_prob)
print_metrics(recall = recall, accracy = acc, precision = prec, roc_auc = auc)

Unnamed: 0,recall,accracy,precision,roc_auc
0,0.877095,0.914286,0.969136,0.978598


In [28]:
all_prob, all_predict, ground_truth = evaluate_model(testloader_hw)
recall = recall_score(ground_truth, all_predict)
acc = accuracy_score(ground_truth, all_predict)
prec = precision_score(ground_truth, all_predict)
auc = roc_auc_score(ground_truth, all_prob)
print_metrics(recall = recall, accracy = acc, precision = prec, roc_auc = auc)

Unnamed: 0,recall,accracy,precision,roc_auc
0,0.899276,0.91505,0.927856,0.97287


In [None]:
#наоборот метрики 