In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [30]:
# Load the raw inputs: each line of the first file is one DNA sequence,
# each line of the second file is parsed as one integer label.
with open('humanvsran.seq.txt') as f:
    sequences = [entry for entry in map(str.strip, f)]

with open('humanvsran_label.txt') as f:
    labels = np.array([int(entry) for entry in map(str.strip, f)])

In [31]:
# Convert the DNA sequences to one-hot encoding
def one_hot_encode(seq):
    """One-hot encode a nucleotide string into a ``(len(seq), 4)`` integer array.

    Columns are ordered A, C, G, T. Matching is case-insensitive, so
    soft-masked (lowercase) bases encode the same as uppercase ones. Any
    symbol outside A/C/G/T (for example the ambiguity code ``N``) is left as
    an all-zero row instead of raising ``KeyError``.

    Args:
        seq: String of nucleotide characters.

    Returns:
        ``numpy.ndarray`` of shape ``(len(seq), 4)`` and integer dtype with at
        most a single 1 per row.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    one_hot = np.zeros((len(seq), 4), dtype=int)
    for i, nucleotide in enumerate(seq):
        # Upper-case per character (not the whole string) so the row index
        # always lines up with the position in the original sequence.
        column = mapping.get(nucleotide.upper())
        if column is not None:
            one_hot[i, column] = 1
    return one_hot


# Encode every sequence and collect the per-sequence arrays into one array
encoded_sequences = np.array(list(map(one_hot_encode, sequences)))

In [32]:
# Hold out 5000 samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    encoded_sequences,
    labels,
    random_state=42,
    test_size=5000,
)

In [33]:
# Small fully connected classifier: Linear -> ReLU -> Linear
class DNASequenceClassifier(nn.Module):
    """Two-layer feed-forward network applied to flattened inputs.

    Args:
        input_size: Number of features per sample once flattened.
        hidden_size: Width of the hidden layer.
        output_size: Number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the raw ``fc2`` scores, shape ``(batch, output_size)``."""
        # Collapse everything after the batch dimension into one feature axis
        batch = x.size(0)
        flat = x.view(batch, -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

In [34]:
# Initialize the model
# Seed PyTorch's RNG first so the randomly initialised layer weights are the
# same on every fresh run of the notebook.
torch.manual_seed(42)

# Derive the flattened input width from the training data rather than
# hard-coding 250 * 4, so the first layer always matches the loaded sequences.
# Unpacking fails loudly if X_train is not a 3-D (samples, positions, channels) array.
n_samples, sequence_length, n_channels = X_train.shape
input_size = sequence_length * n_channels  # Flattened input size for one-hot encoded sequences
hidden_size = 128
output_size = 2  # Assuming binary classification
model = DNASequenceClassifier(input_size, hidden_size, output_size)

In [35]:
# Cross-entropy loss on the model's scores, optimised with Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [36]:
# Function to train the model
def train_model(model, X_train, y_train, epochs=10, loss_fn=None, opt=None):
    """Train ``model`` with one full-batch gradient step per epoch.

    The loss after each step's forward pass is printed; nothing is returned.

    Args:
        model: Module to train; it is switched to training mode.
        X_train: Array-like of training inputs, converted to a float32 tensor.
        y_train: Array-like of integer class labels, converted to a long tensor.
        epochs: Number of optimisation steps, each over the whole training set.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
            Defaults to the notebook-level ``criterion``.
        opt: Optimiser to step. Defaults to the notebook-level ``optimizer``.
            Pass one explicitly when ``model`` is not the model that optimiser
            was constructed for, otherwise the wrong parameters are updated.
    """
    if loss_fn is None:
        loss_fn = criterion
    if opt is None:
        opt = optimizer

    # The data is identical in every epoch, so convert it once up front
    # instead of re-copying the whole training set inside the loop.
    # as_tensor also accepts inputs that are already tensors.
    inputs = torch.as_tensor(X_train, dtype=torch.float32)
    targets = torch.as_tensor(y_train, dtype=torch.long)

    model.train()
    for epoch in range(epochs):
        # Forward pass
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        # Backward pass and optimization
        opt.zero_grad()
        loss.backward()
        opt.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

In [37]:
# Run 10 training epochs on the training split
train_model(model=model, X_train=X_train, y_train=y_train, epochs=10)

Epoch 1/10, Loss: 0.6986245512962341
Epoch 2/10, Loss: 0.6625272631645203
Epoch 3/10, Loss: 0.6278836131095886
Epoch 4/10, Loss: 0.5954279899597168
Epoch 5/10, Loss: 0.5659282803535461
Epoch 6/10, Loss: 0.5445443391799927
Epoch 7/10, Loss: 0.5276498198509216
Epoch 8/10, Loss: 0.5197811722755432
Epoch 9/10, Loss: 0.513404369354248
Epoch 10/10, Loss: 0.5101212859153748


In [38]:
# Function to test the model
def test_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(X_test, dtype=torch.float32)
        labels = torch.tensor(y_test, dtype=torch.long)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        accuracy = (predicted == labels).sum().item() / len(labels)
        print(f'Accuracy: {accuracy * 100}%')

In [39]:
# Report accuracy on the held-out test split
test_model(model=model, X_test=X_test, y_test=y_test)

Accuracy: 79.0%
