<a href="https://colab.research.google.com/github/MaPavlovic/Zavrsni-projekt/blob/main/CNNzavrsni.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Pin torch / torchtext / torchvision to mutually compatible releases; the pin is
# needed so that `torchtext.legacy` (imported in the next cell) can be used.
# NOTE(review): consider `%pip install ...` instead of `!pip` so the install is
# guaranteed to target the running kernel's environment.
!pip install torch==1.8.0 torchtext==0.9.0 torchvision==0.9.0 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
import spacy
import time
import matplotlib.pyplot as plt


In [3]:
# Seed every random number generator used by this notebook so that the
# train/validation split and the model initialisation are repeatable across
# "Restart & Run All" executions.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels (may cost a little GPU speed)
torch.backends.cudnn.deterministic = True

# TEXT tokenizes the reviews with spaCy and yields batch-first tensors;
# LABEL holds the sentiment label as a float (required by BCEWithLogitsLoss)
TEXT = data.Field(tokenize = 'spacy', 
                  tokenizer_language = 'en_core_web_sm',
                  batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

# The following code automatically downloads the IMDb dataset
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data, using the default split
# ratio. Passing the state of the freshly seeded `random` module makes the
# shuffle explicit and reproducible.
train_data, valid_data = train_data.split(random_state = random.getstate())

# Example of the data
print(vars(train_data.examples[0]))

{'text': ['I', 'have', 'never', 'seen', 'such', 'terrible', 'performances', 'in', 'all', 'my', 'life.<br', '/><br', '/>Everyone', 'in', 'the', 'entire', 'film', 'was', 'absolute', 'rubbish.<br', '/><br', '/>Not', 'one', 'decent', 'actor', '/', 'actress', 'in', 'the', 'whole', 'film', ',', 'it', 'was', 'a', 'joke.<br', '/><br', '/>Reminded', 'me', 'of', 'drama', 'at', 'school', '...'], 'label': 'neg'}


In [4]:
# Upper bound on the number of distinct tokens kept in the text vocabulary
MAX_VOCAB_SIZE = 25_000

# Options for the text vocabulary: size cap, pre-trained GloVe vectors
# (100-dimensional), and the initializer used for tokens that have no
# pre-trained vector.
text_vocab_options = {
    'max_size': MAX_VOCAB_SIZE,
    'vectors': "glove.6B.100d",
    'unk_init': torch.Tensor.normal_,
}

# Both vocabularies are built from the training split only
TEXT.build_vocab(train_data, **text_vocab_options)
LABEL.build_vocab(train_data)

In [5]:
# Number of examples per mini-batch
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created directly on `device`
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits,
    batch_size = BATCH_SIZE,
    device = device,
)

In [6]:
class CNN1d(nn.Module):
    """Text classifier built from parallel 1-dimensional convolutions.

    Token indices are embedded, passed through one ``Conv1d`` per filter size,
    max-pooled over the sequence dimension, concatenated, regularised with
    dropout and finally projected by a fully connected layer.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        """Create the embedding, convolution, dropout and output layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector (also the number of
                input channels of every convolution).
            n_filters: number of output channels of every convolution.
            filter_sizes: iterable of kernel widths, one convolution per entry.
            output_dim: size of the final prediction vector.
            dropout: dropout probability applied before the output layer.
            pad_idx: index of the padding token in the vocabulary.
        """

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels = embedding_dim,
                      out_channels = n_filters,
                      kernel_size = fs)
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Run the forward pass.

        Args:
            text: integer tensor of token indices, shape [batch, seq_len].

        Returns:
            Tensor of shape [batch, output_dim] holding raw (un-activated) scores.
        """

        # [batch, seq_len, emb_dim] -> [batch, emb_dim, seq_len], as Conv1d expects
        embedded = self.embedding(text).permute(0, 2, 1)

        # A convolution cannot run on a sequence shorter than its kernel.
        # Right-pad the embedded sequence with zero vectors (what the padding
        # token embeds to) up to the widest kernel, so short inputs no longer crash.
        widest_kernel = max((conv.kernel_size[0] for conv in self.convs), default = 0)
        shortfall = widest_kernel - embedded.shape[2]
        if shortfall > 0:
            embedded = F.pad(embedded, (0, shortfall))

        # Convolution Layer, apply ReLU activation function
        conved = [F.relu(conv(embedded)) for conv in self.convs]

        # Pooling Layer, keep the maximum over the whole sequence for every filter
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        # Dropout Layer on the concatenated features, then the output projection
        drop = self.dropout(torch.cat(pooled, dim = 1))
        return self.fc(drop)

In [7]:
# Hyper-parameters of the network
INPUT_DIM = len(TEXT.vocab)      # one embedding row per vocabulary entry
EMBEDDING_DIM = 100              # matches the 100-d GloVe vectors loaded into the vocab
N_FILTERS = 100                  # output channels per convolution
FILTER_SIZES = [3,4,5]           # kernel widths, one convolution each
OUTPUT_DIM = 1                   # single logit for binary sentiment
DROPOUT = 0.5

# Vocabulary indices of the special padding / unknown tokens
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model = CNN1d(vocab_size = INPUT_DIM,
              embedding_dim = EMBEDDING_DIM,
              n_filters = N_FILTERS,
              filter_sizes = FILTER_SIZES,
              output_dim = OUTPUT_DIM,
              dropout = DROPOUT,
              pad_idx = PAD_IDX)

# Overwrite the randomly initialised embedding table with the pre-trained vectors
pretrained_embeddings = TEXT.vocab.vectors
embedding_table = model.embedding.weight.data
embedding_table.copy_(pretrained_embeddings)

# The unknown and padding tokens start from all-zero vectors
for special_idx in (UNK_IDX, PAD_IDX):
    embedding_table[special_idx] = torch.zeros(EMBEDDING_DIM)

print(embedding_table)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1897, -0.0174,  0.6258,  ..., -0.3503,  0.0343,  0.8224],
        [-0.0211, -0.1949, -0.4826,  ...,  0.1153,  0.1912, -0.4851],
        [-0.8699,  0.0444, -0.4751,  ...,  0.0196, -0.6144,  0.8288]])


In [8]:
# Put the model and the loss function on the target device first, so the
# optimizer is constructed from parameters that already live there
model = model.to(device)

# Loss function: binary cross-entropy applied directly to the raw logits
criterion = nn.BCEWithLogitsLoss().to(device)

# Network optimizer (Adam with its default settings)
optimizer = optim.Adam(model.parameters())

In [9]:
def accuracy(preds, y):
    """Return the fraction of correct predictions in a batch (e.g. 8 of 10 -> 0.8).

    The raw scores in `preds` are passed through a sigmoid, which squashes
    them into the range 0..1, and are then rounded to the nearest class
    before being compared with the targets `y`.
    """

    probabilities = torch.sigmoid(preds)
    predicted_classes = torch.round(probabilities)
    hits = (predicted_classes == y).float()
    return hits.sum() / len(hits)

def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into whole minutes and seconds.

    Returns a `(minutes, seconds)` tuple of integers.
    """

    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [10]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`, updating `model` after every batch.

    Returns the loss and the accuracy, each averaged over the number of batches.
    """

    # Training mode (enables dropout)
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = accuracy(logits, batch.label)

        # Back propagation followed by the parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Measure loss and accuracy of `model` on `iterator` without updating it.

    Returns the loss and the accuracy, each averaged over the number of batches.
    """

    # Evaluation mode (disables dropout)
    model.eval()

    running_loss, running_acc = 0, 0

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            running_loss += criterion(logits, targets).item()
            running_acc += accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [11]:
# Number of full passes over the training data
N_EPOCHS = 5

# Lowest validation loss seen so far; any real loss beats infinity
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep a checkpoint of the weights whenever the validation loss improves
    improved = valid_loss < best_valid_loss
    if improved:
        torch.save(model.state_dict(), 'CNN-model.pt')
        best_valid_loss = valid_loss

    # Per-epoch report: timing, then training and validation metrics
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
          f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
          f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
          sep = '\n')

Epoch: 01 | Epoch Time: 7m 1s
	Train Loss: 0.652 | Train Acc: 61.27%
	 Val. Loss: 0.505 |  Val. Acc: 78.01%
Epoch: 02 | Epoch Time: 7m 14s
	Train Loss: 0.432 | Train Acc: 80.20%
	 Val. Loss: 0.370 |  Val. Acc: 83.98%
Epoch: 03 | Epoch Time: 6m 56s
	Train Loss: 0.308 | Train Acc: 87.42%
	 Val. Loss: 0.320 |  Val. Acc: 86.54%
Epoch: 04 | Epoch Time: 6m 58s
	Train Loss: 0.224 | Train Acc: 91.20%
	 Val. Loss: 0.313 |  Val. Acc: 86.98%
Epoch: 05 | Epoch Time: 6m 51s
	Train Loss: 0.157 | Train Acc: 94.24%
	 Val. Loss: 0.324 |  Val. Acc: 87.09%


In [12]:
# Restore the saved checkpoint (the weights with the lowest validation loss) and
# evaluate it on the test data. `map_location` remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load('CNN-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.341 | Test Acc: 85.43%


In [13]:
# Load spaCy's small English pipeline ('en_core_web_sm', the same name given as
# `tokenizer_language` for the TEXT field); its tokenizer is used by `prediction`
nlp = spacy.load('en_core_web_sm')

def prediction(model, sentence, min_len = 5):
    """Score the sentiment of a single review.

    Args:
        model: trained network that maps a [1, seq_len] tensor of token
            indices to a single logit.
        sentence: raw review text.
        min_len: minimum number of tokens fed to the model; shorter inputs
            are right-padded with '<pad>'. The default of 5 equals the
            largest entry of FILTER_SIZES used in this notebook.

    Returns:
        A float between 0 and 1 (sigmoid of the model output). In the recorded
        outputs of this notebook positive reviews score close to 1 and negative
        ones close to 0; the actual label-to-index mapping is held by LABEL.vocab.
    """

    model.eval()

    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokens) < min_len:
        tokens += ['<pad>'] * (min_len - len(tokens))

    # Map tokens to vocabulary indices and add the batch dimension
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    batch = torch.LongTensor(indexed).to(device).unsqueeze(0)

    # Pure inference: skip building the autograd graph
    with torch.no_grad():
        probability = torch.sigmoid(model(batch))

    return probability.item()


# Three hand-written reviews: clearly positive, neutral and clearly negative
reviews = [
    'This is the best movie I have ever watched!',
    'This is an okay movie',
    'This was a waste of time! I did not like this movie.',
]

# Score every review with the trained model
scores = []
for review in reviews:
    scores.append(prediction(model, review))

scores

[0.9876343011856079, 0.5197110176086426, 0.014556529931724072]