In [37]:
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
#import matplotlib.pyplot as plt
import numpy as np

In [38]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [39]:
def one_hot_encoder(sequence):
    # Create a mapping from nucleotide to a column index
    mapping = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

    # Initialize a zero matrix with shape (5, length of sequence)
    encoded_matrix = np.zeros((4, len(sequence)), dtype=int)

    # Populate the matrix based on the sequence
    for idx, char in enumerate(sequence.upper()):
        if char in mapping:
            encoded_matrix[mapping[char], idx] = 1

    return encoded_matrix



In [40]:
def read_sequences_from_file(filename):
    try:
        with open(filename, 'r') as file:
            sequences = [(int(line.split(' ', 1)[0]), one_hot_encoder(line.split(' ', 1)[1].strip()))
                         for line in file if len(line.split(' ', 1)) == 2]
    except FileNotFoundError:
        print(f"Error: The file {filename} was not found.")
        sequences = []
    except Exception as e:
        print(f"An error occurred: {e}")
        sequences = []
    return sequences



In [41]:
class data_class(Dataset):
    # def __init__(self,data1):
    #     self.data=[]
    #     self.labels=[]


    #     for line in data1:
    #       print('here',line[0])
    #       self.data.append(line[1])
    #       self.labels.append(line[0])
    #       #print(self.data)
    #       #print(self.labels)
    #     self.data= torch.stack([torch.tensor(d).float() for d in self.data])
    #     self.labels=torch.stack([torch.tensor(l).long() for l in self.labels])
    #     return self.data, self.labels


    def __init__(self,data,label):
        self.data = torch.stack([torch.tensor(d).float() for d in data])
        self.label = torch.stack([torch.tensor(l).long() for l in label])


    def __len__(self):
        return len(self.data)

    def  shape(self):
      return self.data.shape


    def __getitem__(self,id):
        data_set=self.data[id]
        labels=self.label[id]

        return data_set,labels

In [42]:
# Example usage assuming 'sampled.txt' is formatted correctly as "label sequence"
filename_train = 'Train_sampled.txt'
filename_test = 'Test_sampled.txt'

sequences_train = read_sequences_from_file(filename_train)
sequences_test = read_sequences_from_file(filename_test)
data_train = []
label_train = []

for line in sequences_train:
  label_train.append(line[0])
  data_train.append(line[1])


seq = data_class(data_train,label_train)
train_dataloader_1=DataLoader(seq,batch_size=512,shuffle=True)
data_test = []
label_test = []

for line in sequences_test:
    label_test.append(line[0])
    data_test.append(line[1])

# Assuming DataClass has been properly defined and implemented to handle the data
seq_test = data_class(data_test, label_test)
test_dataloader_1 = DataLoader(seq_test, batch_size=512, shuffle=True)

# New Section

# New Section

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ProteinCNN(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size=3, stride=1):
        super(ProteinCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=kernel_size, stride=stride)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, stride=stride)
        self.pool = nn.MaxPool1d(kernel_size=2)

        conv_output_size = self._calculate_conv_output_size(input_dim)
        self.fc1 = nn.Linear(conv_output_size, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = self._forward_conv(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def _forward_conv(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)  
        x = F.relu(self.conv2(x))
        x = self.pool(x)  
        x = torch.flatten(x, 1)
        return x

    def _calculate_conv_output_size(self, input_dim):
       
        with torch.no_grad():
            dummy_input = torch.randn(1, input_dim, 199)
            conv_output = self._forward_conv(dummy_input)
            return conv_output.size(1)

input_dim = 4  
output_dim = 2 
model = ProteinCNN(input_dim, output_dim)

optimizer=optim.Adam(params=model.parameters(),lr=0.0001)
loss_fn = nn.CrossEntropyLoss()


In [44]:
def train(model,device,train_dataloader,optimizer,epochs):
    print("inside train")
    model.train()
    for batch_ids, (img, classes) in enumerate(train_dataloader):
        # print(batch_ids)
        # print(img)
        # print('class',len(classes))
        # print()
        # classes=classes.type(torch.LongTensor)
        img,classes=img.to(device),classes.to(device)
        torch.autograd.set_detect_anomaly(True)
        optimizer.zero_grad()
        output=model(img)
        loss = loss_fn(output,classes)

        loss.backward()
        optimizer.step()

    print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
        epochs, batch_ids* len(img), len(train_dataloader.dataset),
        100.*batch_ids / len(train_dataloader.dataset),loss.item()))

def test(model, device, test_dataloader):
    model.eval()
    test_loss=0
    correct=0
    with torch.no_grad():
        for img,classes in test_dataloader:
            img,classes=img.to(device), classes.to(device)
            y_hat=model(img)
            test_loss+=F.nll_loss(y_hat,classes,reduction='sum').item()
            _,y_pred=torch.max(y_hat,1)
            correct+=(y_pred==classes).sum().item()
        test_loss/=len(test_dataloader)
        print("\n Test set: Avarage loss: {:.0f},Accuracy:{}/{} ({:.0f}%)\n".format(
            test_loss,correct,len(test_dataloader.dataset),100.*correct/len(test_dataloader.dataset)))
        print('='*30)


In [45]:
# WE ARE USING RANDOM DATA SO THE TRAINING AND TESTING DOES NOT MATTER, THE AIM IS TO SHOWCASE THE USE OF A CUSTOM DATASET
# SINCE IN PRACTICAL SENSE YOU HAVE TO CLEAN THE DATA AND LOAD THE DATA INTO THE MODEL.


if __name__=='__main__':
    seed=42
    EPOCHS=5

    for epoch in range(1,EPOCHS+1):
        train(model,device,train_dataloader_1,optimizer,epoch)
        test(model,device,test_dataloader_1)

inside train

 Test set: Avarage loss: -8,Accuracy:101/200 (50%)

inside train

 Test set: Avarage loss: -10,Accuracy:101/200 (50%)

inside train

 Test set: Avarage loss: -12,Accuracy:108/200 (54%)

inside train

 Test set: Avarage loss: -14,Accuracy:115/200 (58%)

inside train

 Test set: Avarage loss: -17,Accuracy:120/200 (60%)



In [None]:
# # Generating random data

# random_train_data = np.random.rand(32,1,28, 28)
# print(random_train_data.dtype)
# random_test_data = np.random.rand(16,1,28, 28)
# print(random_test_data.dtype)

float64
float64


In [None]:
# # Converting the data to tensor type and floating point type

# tensor_train_data = torch.from_numpy(random_train_data).float()
# tensor_test_data = torch.from_numpy(random_test_data).float()

# print(tensor_train_data.shape)
# print(tensor_test_data.dtype)
# print(len(tensor_data))

# # Creating random binary labels. and converting it to tensor

# label_test = np.random.choice([0, 1], size=len(tensor_test_data))
# label_train = np.random.choice([0, 1], size=len(tensor_train_data))

# print(label_train.dtype)
# label_test = torch.from_numpy(label_test)
# label_train = torch.from_numpy(label_train)
# print(label_test.dtype)

In [None]:

# The most important class, a custom data loader, understand how it is working.

# class data_class(Dataset):
#     def __init__(self,data,label):
#         self.data=data
#         self.labels=torch.tensor(label)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self,id):
#         data_set=self.data[id]
#         labels=self.labels[id]

#         return data_set,labels