In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import torch
import itertools
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, utils

import math
import argparse
import torch.optim as optim
from torch import nn
from pathlib import Path

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [1]:
def read_file(filepath):
    # Open and read lines from a file
    with open(filepath, "r") as file:
        lines = file.readlines()
    # Remove extra whitespace from each line
    lines = [line.strip() for line in lines]
    return lines

In [2]:
def generate_kmer_list(kmer_length, alphabet="TGCA"):
    # Convert the alphabet string to a list of characters
    characters = list(alphabet)
    # Create all possible k-mers of the specified length
    kmers = ["".join(combination) for combination in itertools.product(characters, repeat=kmer_length)]
    # Sort the k-mers alphabetically
    kmers.sort()
    return kmers

In [None]:
def protein_to_numeric(protein_sequence, kmer_list):
    # Get the k-mer length from the first element of the kmer list
    kmer_length = len(kmer_list[0])
    # Create a dictionary mapping k-mers to their indices
    kmer_dict = {kmer: index for index, kmer in enumerate(kmer_list)}
    numeric_representation = []

    # Convert each k-mer in the protein sequence to its numeric index
    for i in range(len(protein_sequence) - kmer_length + 1):
        try:
            numeric_representation.append(kmer_dict[protein_sequence[i:i+kmer_length]])
        except KeyError:
            # Handle k-mers not found in the dictionary
            numeric_representation.append(0)
            print("Key error:", protein_sequence[i:i+kmer_length], "at index", i)

    return numeric_representation

In [3]:
def generate_negative_samples(protein_sequence, num_segments=20, segments_to_keep=8, max_value=4):
    sequence_length = len(protein_sequence)
    # Calculate the length of each segment
    segment_length = sequence_length // num_segments

    # Adjust the number of segments if there's a remainder
    if segment_length * num_segments < sequence_length:
        num_segments += 1

    # Generate an array of segment indices
    segment_indices = np.arange(num_segments)
    # Randomly select segments to keep
    retained_segments = random.sample(list(segment_indices), k=segments_to_keep)
    output_sequence = []

    # Iterate over each segment
    for segment_index in segment_indices:
        start_index = segment_index * segment_length
        segment = protein_sequence[start_index:start_index + segment_length]
        if segment_index in retained_segments:
            # Keep the original segment
            output_sequence.extend(segment)
        else:
            # Replace the segment with random values
            random_segment = random.choices(np.arange(max_value), k=len(segment))
            output_sequence.extend(random_segment)

    return output_sequence

In [8]:
class OneHotDataset(Dataset):
    def __init__(self, file_path, is_positive=True, device="cuda", fake_type=0, sequence_length=300, segments=20, keep_segments=8):
        """
        Initializes the dataset for one-hot encoding of protein sequences.

        :param file_path: Path to the text file containing the dataset.
        :param is_positive: Indicates the dataset label; True for positive (1), False for negative (0).
        :param device: Specifies the device to use (e.g., "cuda" or "cpu").
        :param fake_type: Defines the type of fake data generation;
                          0 for loading the original dataset,
                          1 for generating randomized sequences,
                          2 for generating fake sequences using a specified method.
        :param sequence_length: Length of the input protein sequence.
        :param segments: Number of segments to divide the sequence into for fake data generation.
        :param keep_segments: Number of segments to retain unchanged during fake sequence generation.
        """
        if is_positive and fake_type != 0:
            raise ValueError("The 'fake_type' parameter cannot be used with a positive dataset.")

        self.device = device
        self.fake_type = fake_type
        self.sequence_length = sequence_length
        self.segments = segments
        self.keep_segments = keep_segments

        # Generate the k-mer dictionary
        kmer_dict = generate_kmer_list(1)

        # Load data from the specified file
        self.protein_data = read_file(file_path)

        # Convert protein sequences into numeric representations
        self.numeric_data = [protein_to_numeric(protein, kmer_dict) for protein in self.protein_data]

        # Assign labels based on whether the dataset is positive or negative
        if is_positive:
            self.labels = torch.from_numpy(np.ones(len(self.protein_data)))
        else:
            self.labels = torch.from_numpy(np.zeros(len(self.protein_data)))
        self.labels = self.labels.to(device)

    def __len__(self):
        # Return the total number of samples in the dataset
        return len(self.protein_data)

    def __getitem__(self, index):
        # Retrieve and pad/crop the numeric sequence to the fixed length
        sequence = self.numeric_data[index]
        if len(sequence) < self.sequence_length:
            sequence.extend([0] * (self.sequence_length - len(sequence)))
        elif len(sequence) > self.sequence_length:
            sequence = sequence[:self.sequence_length]

        # Generate fake data if specified
        if self.fake_type == 1:
            random.shuffle(sequence)  # Shuffle the sequence
        elif self.fake_type == 2:
            sequence = generate_negative_samples(sequence, num_part=self.segments, keep=self.keep_segments, max_class=4)

        # Convert the sequence to a tensor and one-hot encode it
        sequence_tensor = torch.from_numpy(np.array(sequence))
        one_hot_encoded = torch.nn.functional.one_hot(sequence_tensor, num_classes=4).to(self.device)

        return one_hot_encoded.float(), self.labels[index]

In [9]:
def load_data(data_path, train_ratio=0.8, add_random_neg=False, batch_size=32, num_workers=0, device="cuda"):
    """
    Load and prepare datasets for training, validation, and testing.

    :param data_path: Path to the text file containing promoter sequences (one DNA promoter per line).
    :param train_ratio: Proportion of the dataset allocated for training.
    :param add_random_neg: Whether to include random DNA sequences in the negative dataset.
    :param batch_size: Number of samples per batch for the data loader.
    :param num_workers: Number of CPU threads for loading data in parallel.
    :param device: Device on which data should be loaded (e.g., "cuda" or "cpu").
    :return: A list of data loaders for training, validation, and testing for positive and negative datasets.
    """
    # Set a manual seed for reproducibility
    seed_generator = torch.Generator().manual_seed(42)

    # Initialize positive and negative datasets
    positive_dataset = OneHotDataset(data_path, device=device)
    negative_dataset = OneHotDataset(data_path, is_pos=False, fake=2, device=device)

    # Calculate the sizes for training, validation, and testing splits
    train_size = int(len(positive_dataset) * train_ratio)
    val_size = int(len(positive_dataset) * (1 - train_ratio) * 0.5)
    split_sizes = [train_size, val_size, len(positive_dataset) - train_size - val_size]

    # Split the datasets into training, validation, and testing subsets
    train_positive, val_positive, test_positive = random_split(positive_dataset, split_sizes, generator=seed_generator)
    train_negative, val_negative, test_negative = random_split(negative_dataset, split_sizes, generator=seed_generator)

    # Optionally, add a random negative dataset to the training subset
    if add_random_neg:
        random_negative_dataset = OneHotDataset(data_path, is_pos=False, fake=1, device=device)
        train_negative = ConcatDataset([train_negative, random_negative_dataset])

    # Prepare data loaders for each dataset split
    datasets = [train_positive, val_positive, test_positive, train_negative, val_negative, test_negative]
    data_loaders = [
        DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
        for dataset in datasets
    ]

    return data_loaders

In [10]:
def load_test_data(data_path, batch_size=32, device="cuda", num_workers=0):
    """
    Load the test dataset and prepare a data loader.

    :param data_path: Path to the text file containing test data (one DNA sequence per line).
    :param batch_size: Number of samples per batch for the data loader.
    :param device: Device on which the data will be loaded (e.g., "cuda" or "cpu").
    :param num_workers: Number of CPU threads for parallel data loading.
    :return: A DataLoader for the test dataset.
    """
    # Create a dataset for one-hot encoded DNA sequences
    test_dataset = OneHotDataset(data_path, device=device)

    # Initialize a data loader without shuffling for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return test_loader


In [11]:
class ParallelCNN(nn.Module):
    def __init__(self, kernel_sizes, pooling_size=6, dropout_rate=0.5):
        """
        A module that applies multiple CNN layers in parallel to the input
        and concatenates their outputs.

        :param kernel_sizes: A list of kernel sizes for the convolutional layers.
        :param pooling_size: Size of the pooling layer applied after each CNN.
        :param dropout_rate: Dropout rate for regularization.
        """
        super(ParallelCNN, self).__init__()
        self.cnn_layers = nn.ModuleList()
        for kernel_size in kernel_sizes:
            # Define a sequential model for each kernel size
            layer = nn.Sequential(
                nn.Conv1d(in_channels=4, out_channels=4, kernel_size=kernel_size, padding="same"),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=pooling_size),
                nn.Dropout(dropout_rate)
            )
            self.cnn_layers.append(layer)

    def forward(self, inputs):
        """
        Forward pass through the parallel CNN layers.

        :param inputs: Input tensor representing one-hot encoded DNA sequences
                       with shape [batch_size, 4, sequence_length].
        :return: Concatenated output from all CNN layers
                 with shape [batch_size, 4 * len(kernel_sizes), reduced_length].
        """
        outputs = []
        for layer in self.cnn_layers:
            outputs.append(layer(inputs))
        # Concatenate outputs from all convolutional layers along the channel dimension
        concatenated_output = torch.cat(outputs, dim=1)
        return concatenated_output

In [12]:
class BidirectionalLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        A Bidirectional LSTM followed by a fully connected layer.

        :param input_dim: Size of the input features.
        :param hidden_dim: Number of hidden units in the LSTM.
        :param output_dim: Size of the output features.
        """
        super(BidirectionalLSTM, self).__init__()
        # Define a bidirectional LSTM
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                           bidirectional=True, batch_first=True)
        # Define a linear layer to transform LSTM output to desired output size
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, inputs):
        """
        Perform a forward pass through the Bidirectional LSTM.

        :param inputs: Input tensor with shape [batch_size, sequence_length, input_dim].
        :return: Output tensor with shape [batch_size, sequence_length, output_dim].
        """
        # Optimize for better performance on multi-GPU setups
        self.rnn.flatten_parameters()

        # Pass the input through the LSTM
        lstm_output, _ = self.rnn(inputs)  # Shape: [batch_size, sequence_length, 2*hidden_dim]

        # Transform the LSTM output with the linear layer
        output = self.fc(lstm_output)  # Shape: [batch_size, sequence_length, output_dim]

        return output

In [13]:
class DeePromoter(nn.Module):
    def __init__(self, kernel_sizes, input_shape=(64, 300, 4), pooling_size=6, dropout_rate=0.5):
        """
        DeePromoter Model: Combines parallel CNNs, a Bidirectional LSTM,
        and fully connected layers for sequence classification.

        :param kernel_sizes: A list of kernel sizes for the parallel CNN layers.
        :param input_shape: Fixed input shape for the model (batch_size, sequence_length, num_channels).
        :param pooling_size: Pooling kernel size for the CNN layers.
        :param dropout_rate: Dropout rate for regularization.
        """
        super(DeePromoter, self).__init__()

        # Number of output channels from the ParallelCNN
        lstm_input_dim = len(kernel_sizes) * 4

        # Define the model components
        self.parallel_cnn = ParallelCNN(kernel_sizes, pooling_size, dropout_rate)
        self.bidirectional_lstm = BidirectionalLSTM(input_size=lstm_input_dim,
                                                    hidden_size=lstm_input_dim,
                                                    output_size=lstm_input_dim)
        self.flatten = nn.Flatten()

        # Determine the flattened feature size using a dummy input
        dummy_input = torch.zeros(input_shape)
        flattened_size = self.compute_flattened_size(dummy_input)

        # Fully connected layers for classification
        self.fc_layers = nn.Sequential(
            nn.Linear(flattened_size, flattened_size),
            nn.ReLU(),
            nn.Linear(flattened_size, 2),
            nn.Softmax(dim=1)
        )

    def compute_flattened_size(self, x):
        """
        Calculate the feature size after the convolutional and LSTM layers,
        before passing through the fully connected layers.

        :param x: Dummy input tensor with shape [batch_size, sequence_length, num_channels].
        :return: Size of the flattened features.
        """
        x = x.permute(0, 2, 1)  # Adjust dimensions for CNN input
        x = self.parallel_cnn(x)
        x = x.permute(0, 2, 1)  # Adjust dimensions for LSTM input
        x = self.bidirectional_lstm(x)
        x = self.flatten(x)
        return x.shape[1]

    def forward(self, x):
        """
        Forward pass through the DeePromoter model.

        :param x: Input tensor with shape [batch_size, sequence_length, num_channels].
        :return: Output tensor with shape [batch_size, 2], representing class probabilities.
        """
        x = x.permute(0, 2, 1)  # Adjust dimensions for CNN input
        x = self.parallel_cnn(x)
        x = x.permute(0, 2, 1)  # Adjust dimensions for LSTM input
        x = self.bidirectional_lstm(x)
        x = self.flatten(x)
        x = self.fc_layers(x)
        return x

In [14]:
def evaluate(model, data_loaders):
    """
    Perform inference and evaluate the model's predictions against the true labels.

    :param model: The trained model set to evaluation mode.
    :param data_loaders: A list of PyTorch DataLoader objects for evaluation.
    :return: A tuple containing:
             - A list of [correct_predictions, total_samples] for each DataLoader.
             - A list of predicted results as integers for each DataLoader.
    """
    evaluation_results = []
    total_samples_list = []
    correct_predictions_list = []
    all_predictions = []

    for loader in data_loaders:
        total_samples = 0
        correct_predictions = 0
        loader_predictions = []

        for batch in loader:
            inputs, labels = batch
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, dim=1)

            # Update total sample count and correct predictions
            total_samples += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            # Store the predictions
            loader_predictions.extend(predicted.cpu().numpy())

        # Calculate accuracy for this DataLoader
        accuracy = correct_predictions / total_samples
        evaluation_results.append(accuracy)
        correct_predictions_list.append(correct_predictions)
        total_samples_list.append(total_samples)
        all_predictions.append(loader_predictions)

    return (correct_predictions_list, total_samples_list), all_predictions

In [15]:
def calculate_mcc(data):
    """
    Calculate the Matthews Correlation Coefficient (MCC), along with precision and recall.

    :param data: A list containing evaluation results, where:
                 - `data[0]` represents the counts for positive results [True Positive, True Negative].
                 - `data[1]` represents the total counts [Total Positive, Total Negative].
    :return: A tuple of (precision, recall, MCC).
    """
    # Extract true positive and true negative counts
    true_positive = data[0][0]
    true_negative = data[0][1]

    # Extract total positive and negative counts
    total_positive = data[1][0]
    total_negative = data[1][1]

    # Calculate false negatives and false positives
    false_negative = total_positive - true_positive
    false_positive = total_negative - true_negative

    # Calculate precision and recall
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)

    # Calculate Matthews Correlation Coefficient
    numerator = (true_positive * true_negative) - (false_positive * false_negative)
    denominator = math.sqrt(
        (true_positive + false_positive) *
        (true_positive + false_negative) *
        (true_negative + false_positive) *
        (true_negative + false_negative)
    )
    mcc = numerator / denominator

    return precision, recall, mcc

In [16]:
def test(data_path, pretrained_model_path, kernel_sizes=None):
    """
    Evaluate a pretrained DeePromoter model on a test dataset.

    :param data_path: Path to the test dataset file.
    :param pretrained_model_path: Path to the pretrained model file (state dictionary).
    :param kernel_sizes: List of kernel sizes for the CNN layers in the DeePromoter model.
                         Defaults to [27, 14, 7].
    :return: A list of predicted results from the test dataset.
    """
    if kernel_sizes is None:
        kernel_sizes = [27, 14, 7]

    # Load the test data
    test_loader = load_test_data(data_path, device=device)

    # Initialize the DeePromoter model
    model = DeePromoter(kernel_sizes)
    model.to(device)

    # Load the pretrained model weights
    model.load_state_dict(torch.load(pretrained_model_path))

    # Set the model to evaluation mode
    model.eval()

    # Evaluate the model on the test dataset
    evaluation_data, predictions = evaluate(model, [test_loader])

    return predictions

In [17]:
def test_main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
    "-d",
    "--data",
    type=str,
    required=True,
    help="path to dataset(txt file)",
  )
  parser.add_argument("-w", "--weight", type=str, help="Path to pre-train")
  args = parser.parse_args()

  output = test(args.data, args.weight)

  with open("infer_results.txt", "w") as f:
    for out in output[0]:
      f.write(str(out) + "\n")

## Train

In [18]:
def train(data_path, pretrained_weights=None, experiment_name="test", is_training=True, kernel_sizes=None, max_epochs=1000):
    """
    Train or evaluate the DeePromoter model.

    :param data_path: Path to the text file containing training data.
    :param pretrained_weights: Path to pretrained model weights for continued training.
    :param experiment_name: Name of the folder where results will be saved.
    :param is_training: If False, performs testing only.
    :param kernel_sizes: List of kernel sizes for the CNN layers in the DeePromoter model.
    :param max_epochs: Maximum number of epochs for training.
    """
    if kernel_sizes is None:
        kernel_sizes = [27, 14, 7]

    # Create output directories
    output_dir = Path("./output")
    output_dir.mkdir(exist_ok=True)
    experiment_dir = output_dir.joinpath(experiment_name)
    experiment_dir.mkdir(exist_ok=True)

    # Load data
    print("Loading data...")
    data_loaders = load_data(data_path, device=device)
    train_pos, val_pos, test_pos, train_neg, val_neg, test_neg = data_loaders

    # Initialize the model
    model = DeePromoter(kernel_sizes)
    model.to(device)

    # Load pretrained weights if provided
    if pretrained_weights is not None:
        model.load_state_dict(torch.load(pretrained_weights))

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.00001)

    # Training variables
    running_loss = 0.0
    best_mcc = 0
    best_precision = 0
    best_recall = 0
    early_stop_threshold = 10
    no_improvement_epochs = 0

    print("Starting training...")
    if is_training:
        for epoch in range(max_epochs):
            for (batch_pos, batch_neg) in zip(train_pos, train_neg):
                # Prepare inputs and labels
                inputs = torch.cat((batch_pos[0], batch_neg[0]), dim=0)
                labels = torch.cat((batch_pos[1], batch_neg[1]), dim=0)

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass and compute loss
                outputs = model(inputs)
                loss = criterion(outputs, labels.long())
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            # Save and evaluate every 10 epochs
            if epoch % 10 == 0:
                torch.save(model.state_dict(), str(experiment_dir.joinpath(f"epoch_{epoch}.pth")))
                model.eval()
                eval_data, _ = evaluate(model, [val_pos, val_neg])
                precision, recall, mcc_value = mcc(eval_data)
                model.train()

                print(f"Epoch: {epoch}, Experiment: {experiment_name}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"MCC: {mcc_value}")

                # Save the best model based on precision, recall, and MCC
                if precision > best_precision:
                    best_precision = precision
                    print("Updating best precision model...")
                    torch.save(model.state_dict(), str(experiment_dir.joinpath("best_precision.pth")))
                if recall > best_recall:
                    best_recall = recall
                    print("Updating best recall model...")
                    torch.save(model.state_dict(), str(experiment_dir.joinpath("best_recall.pth")))
                if mcc_value > best_mcc:
                    best_mcc = mcc_value
                    print("Updating best MCC model...")
                    torch.save(model.state_dict(), str(experiment_dir.joinpath("best_mcc.pth")))
                    no_improvement_epochs = 0
                else:
                    no_improvement_epochs += 1

                # Stop training early if no improvement in MCC
                if no_improvement_epochs >= early_stop_threshold:
                    break

        # Testing the model with the best MCC
        best_model_path = experiment_dir.joinpath("best_mcc.pth")
        model.load_state_dict(torch.load(best_model_path))
        model.eval()
        eval_data, _ = evaluate(model, [test_pos, test_neg])
        precision, recall, mcc_value = mcc(eval_data)
        print("Test Results:")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"MCC: {mcc_value}")

        # Save test results to a log file
        with open(str(experiment_dir.joinpath("log.txt")), "w") as log_file:
            log_file.write(f"Test Precision: {precision}\n")
            log_file.write(f"Test Recall: {recall}\n")
            log_file.write(f"Test MCC: {mcc_value}\n")

In [None]:
train('/content/drive/MyDrive/data/human/TATA/hs_pos_TATA.txt', exp_name="human_TATA", training=True)

Data loading


  return F.conv1d(


Start training
Epoch : 0 Experiment : human_TATA
precision : 0.7281553398058253
recall : 0.2568493150684932
MCC : 0.2111575952326206
Update best precision
Update best recall
Update best MCC
Epoch : 10 Experiment : human_TATA
precision : 0.7286432160804021
recall : 0.4965753424657534
MCC : 0.3287641768278304
Update best precision
Update best recall
Update best MCC
Epoch : 20 Experiment : human_TATA
precision : 0.79375
recall : 0.4349315068493151
MCC : 0.3608983811399463
Update best precision
Update best MCC
Epoch : 30 Experiment : human_TATA
precision : 0.8275862068965517
recall : 0.410958904109589
MCC : 0.3765367277077475
Update best precision
Update best MCC
Epoch : 40 Experiment : human_TATA
precision : 0.8266666666666667
recall : 0.4246575342465753
MCC : 0.38409228281811403
Update best MCC
Epoch : 50 Experiment : human_TATA
precision : 0.8641975308641975
recall : 0.4794520547945205
MCC : 0.4513030562052148
Update best precision
Update best MCC
Epoch : 60 Experiment : human_TATA
prec

  net.load_state_dict(torch.load(best_model))


Saved epoch 560 as best path. using this for testing

## Test

In [None]:
test('/content/drive/MyDrive/data/human/TATA/hs_pos_TATA.txt', pretrain = '/content/output/human_TATA/epoch_560.pth')

  net.load_state_dict(torch.load(pretrain))


[[0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


## Train Human Non TATA

In [None]:
train('/content/drive/MyDrive/data/human/nonTATA/hs_pos_nonTATA.txt', exp_name="human_non_TATA", training=True)

Data loading
Key error :  N 200
Key error :  N 200
Start training
Epoch : 0 Experiment : human_non_TATA
precision : 0.8718980549966465
recall : 0.5017367811655732
MCC : 0.4727382049857469
Update best precision
Update best recall
Update best MCC
Epoch : 10 Experiment : human_non_TATA
precision : 0.9102202145680407
recall : 0.6221536086453107
MCC : 0.5911742022606917
Update best precision
Update best recall
Update best MCC
Epoch : 20 Experiment : human_non_TATA
precision : 0.92253136933988
recall : 0.6526437668853724
MCC : 0.625190847473307
Update best precision
Update best recall
Update best MCC
Epoch : 30 Experiment : human_non_TATA
precision : 0.9396267837541163
recall : 0.6607487456580471
MCC : 0.6474685631153649
Update best precision
Update best recall
Update best MCC
Epoch : 40 Experiment : human_non_TATA
precision : 0.9551924090669478
recall : 0.6993438826707835
MCC : 0.6918166104549979
Update best precision
Update best recall
Update best MCC
Epoch : 50 Experiment : human_non_TATA