In [1]:
import torch
from torch import nn
from text_preproc import TextPreproc

class MLPNet(nn.Module):

    def __init__(self, vec_len):
        super().__init__()
        #self.flatten = nn.Flatten
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features=vec_len, out_features=vec_len//10, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=vec_len//10, out_features=vec_len//100, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=vec_len//100, out_features=2, bias=True)
        )

    def forward(self, x):
        #x = self.flatten(x)
        return self.linear_relu_stack(x)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
preproc_model = TextPreproc(rebalance=True)
net_model = MLPNet(preproc_model.get_vector_len()).to(device)
print(net_model)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net_model.parameters(), lr=1e-3)

MLPNet(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=7172, out_features=717, bias=True)
    (1): ReLU()
    (2): Linear(in_features=717, out_features=71, bias=True)
    (3): ReLU()
    (4): Linear(in_features=71, out_features=2, bias=True)
  )
)


In [2]:
from torch.utils.data import DataLoader, Dataset

class VecLoader(Dataset):

    def __init__(self, preproc_model, is_test=False):
        super().__init__()
        train, test = preproc_model.get_train_test_preprocd()

        if not is_test:
            x = train[train.columns[:-1]].values
            y = train[train.columns[-1]].values
        else:
            x = test[train.columns[:-1]].values
            y = test[train.columns[-1]].values

        self.x_train = torch.tensor(x).to(torch.float32)
        self.y_train = torch.tensor(y).to(torch.long)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]


train_loader = DataLoader(VecLoader(preproc_model), batch_size=1,shuffle=False)
for i, (data, labels) in enumerate(train_loader):
    print(data.shape, labels.shape)
    print(data, labels)
    break

torch.Size([1, 7172]) torch.Size([1])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]]) tensor([0])


In [3]:
def train(data_loader, model, loss_fn, optimizer):

    size = len(data_loader.dataset)
    model.train()

    for batch, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)

        # loss between forward and real vals
        pred = model(X)
        loss = loss_fn(pred, y)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    if correct > 0.95:
        torch.save(net_model.state_dict(), "MLPNet.pth")
        print("Saved PyTorch Model State to MLPNet.pth")

epochs = 10
test_loader = DataLoader(VecLoader(preproc_model, is_test=True), batch_size=10, shuffle=False)
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, net_model, loss_fn, optimizer)
    test(test_loader, net_model, loss_fn)

print("Done!")

Epoch 1
-------------------------------
loss: 0.713981  [    0/ 7392]
loss: 0.677244  [ 1000/ 7392]
loss: 0.731254  [ 2000/ 7392]
loss: 0.660710  [ 3000/ 7392]
loss: 0.643532  [ 4000/ 7392]
loss: 0.652796  [ 5000/ 7392]
loss: 0.667144  [ 6000/ 7392]
loss: 0.644526  [ 7000/ 7392]
Test Error: 
 Accuracy: 85.7%, Avg loss: 0.665554 

Epoch 2
-------------------------------
loss: 0.655800  [    0/ 7392]
loss: 0.645557  [ 1000/ 7392]
loss: 0.748385  [ 2000/ 7392]
loss: 0.650925  [ 3000/ 7392]
loss: 0.636892  [ 4000/ 7392]
loss: 0.649345  [ 5000/ 7392]
loss: 0.665044  [ 6000/ 7392]
loss: 0.642327  [ 7000/ 7392]
Test Error: 
 Accuracy: 85.7%, Avg loss: 0.664228 

Epoch 3
-------------------------------
loss: 0.655155  [    0/ 7392]
loss: 0.644227  [ 1000/ 7392]
loss: 0.747045  [ 2000/ 7392]
loss: 0.649771  [ 3000/ 7392]
loss: 0.634784  [ 4000/ 7392]
loss: 0.648424  [ 5000/ 7392]
loss: 0.664511  [ 6000/ 7392]
loss: 0.640118  [ 7000/ 7392]
Test Error: 
 Accuracy: 85.7%, Avg loss: 0.662860 

Epoc

In [4]:
import random

random.seed(40)
_, test = preproc_model.get_train_test_preprocd()
for i in range(5):
    n = random.randint(0, len(test))
    x = torch.Tensor([test.iloc[n][test.columns[:-1]]]).to(torch.float32).to(device)
    print(net_model(x).argmax(), test.iloc[n][test.columns[-1]])

tensor(1, device='cuda:0') 1.0
tensor(0, device='cuda:0') 0.0
tensor(0, device='cuda:0') 0.0
tensor(0, device='cuda:0') 0.0
tensor(0, device='cuda:0') 0.0


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

spam_dict = {"Hi, how are you feeling? You haven't written for a long time, so I thought something might have happened.": 0,
              'Only today! buy one king-size pizza, get one cola for free! Hurry up!': 1,
              'love you sweetie :)': 0,
              "Buy my book and I'll tell you how to become rich!": 1,
              'bae i cannot wait anymore. I want you now!': 0,
              'You’ve won a price! our phone number: +7 911 XXX-XX-XX': 1,
              'The IRS is trying to contact you': 1,
              'You have a refund coming': 1,
              'Verify your bank account': 1,
              'You have a package delivery': 0,
              'Verify your Apple iCloud ID': 0,
              'A family member needs help': 1,
              'You have a new billing statement': 1}

pred_y, real_y = [], list(spam_dict.values())
for letter in spam_dict:
    vectorized_letter = preproc_model.preproc_letter(letter)
    pred_y.append(net_model(torch.Tensor(vectorized_letter[0]).to(torch.float32).to(device)).argmax().item())

print(pred_y, '\n', real_y)
print('accuracy:', accuracy_score(pred_y, real_y))
print('precision:', precision_score(pred_y, real_y))
print('recall:', recall_score(pred_y, real_y))
print('f1_score:', f1_score(pred_y, real_y))

[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1] 
 [0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1]
accuracy: 0.6153846153846154
precision: 0.375
recall: 1.0
f1_score: 0.5454545454545454
