In [46]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

In [3]:
f = r"C:/Code/bioinfo/data/fullset_test.csv"
df = pd.read_csv(f, names=["name", "seq", "class"])

In [13]:
mapping = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

In [31]:
def mapping_fn(string):
    x = [mapping[x] for x in string]
    return x

In [106]:
data = df["seq"].apply(lambda x: mapping_fn(x))
data2 = np.zeros((len(df), 300), dtype=np.int64)

In [107]:
for i, d in enumerate(data):
    data2[i, :] = d

In [108]:
data2.shape

(26405, 300)

In [109]:
torch_tensor_output = torch.tensor(df['class'].values, dtype=torch.float)
torch_tensor_vectors = torch.from_numpy(data2)

In [110]:
torch_tensor_vectors.dtype

torch.int64

In [78]:
torch_tensor_vectors.shape

torch.Size([26405, 300])

In [101]:
EMBEDDING_VEC_SIZE = 32
DNA_BASES = 4

In [114]:
class Example(nn.Module):
    def __init__(self):
        super(Example, self).__init__()
        self.embed = nn.Embedding(DNA_BASES, EMBEDDING_VEC_SIZE)
        self.fc = nn.Linear(300 * 32, 1)
    
    def forward(self, x):
        x = self.embed(x)
        x = x.flatten()
        x = self.fc(x)
        return torch.sigmoid(x)

In [115]:
net = Example()

In [116]:
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [118]:
for epoch in range(5):  # loop over the dataset multiple times
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(zip(torch_tensor_vectors, torch_tensor_output)):

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        # print(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        epoch_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, epoch_loss / (i + 1)))

print('Finished Training')

  "Please ensure they have the same size.".format(target.size(), input.size()))


[1,  2000] loss: 0.751
[1,  4000] loss: 0.826
[1,  6000] loss: 0.990
[1,  8000] loss: 1.048
[1, 10000] loss: 1.079
[1, 12000] loss: 1.094
[1, 14000] loss: 1.151
[1, 16000] loss: 1.168
[1, 18000] loss: 1.123
[1, 20000] loss: 1.098
[1, 22000] loss: 1.084
[1, 24000] loss: 1.094
[1, 26000] loss: 1.096
[2,  2000] loss: 0.780
[2,  4000] loss: 0.834
[2,  6000] loss: 0.938
[2,  8000] loss: 1.018
[2, 10000] loss: 1.050
[2, 12000] loss: 1.067
[2, 14000] loss: 1.132
[2, 16000] loss: 1.136
[2, 18000] loss: 1.107
[2, 20000] loss: 1.079
[2, 22000] loss: 1.062
[2, 24000] loss: 1.061
[2, 26000] loss: 1.045
[3,  2000] loss: 0.824
[3,  4000] loss: 0.812
[3,  6000] loss: 0.899
[3,  8000] loss: 0.970
[3, 10000] loss: 1.000
[3, 12000] loss: 1.032
[3, 14000] loss: 1.093
[3, 16000] loss: 1.090
[3, 18000] loss: 1.063
[3, 20000] loss: 1.041
[3, 22000] loss: 1.021
[3, 24000] loss: 1.028
[3, 26000] loss: 1.041
[4,  2000] loss: 0.791
[4,  4000] loss: 0.825
[4,  6000] loss: 0.932
[4,  8000] loss: 1.017
[4, 10000] 