In [None]:
import torch
import torchvision
import transformers
import numpy as np
import matplotlib.pyplot as plt
import PIL
import copy
import math

from torch import nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torchvision import transforms
import gensim.downloader as api
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from IPython.display import clear_output
from gensim.models import Word2Vec
from datasets import load_dataset

torch.manual_seed(42)

In [None]:
# gpu

# Prefer the GPU when CUDA is visible, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device == 'cuda')
# Smoke test: a scalar tensor moved to the chosen device is the cell output.
torch.tensor(1).to(device)

In [3]:
# debug in google colab

import pdb
#pdb.set_trace()

# Simple Linear Models

In [None]:
# linear layer

# A one-feature linear layer: output = weight * input + bias.
lr = nn.Linear(in_features=1, out_features=1, bias=True)
# .parameters() returns a generator; materialise it so the tensors are shown
# instead of an opaque "<generator object ...>" repr.
print(list(lr.parameters()))
print(lr.state_dict())
print(lr.weight)
print(lr.bias)
# Apply the layer to a batch of two single-feature samples.
print(lr(torch.tensor([[2.], [3.]])))

In [None]:
# simple linear regression model

class LR(nn.Module):
    """Linear regression expressed as a single fully connected layer."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the affine projection of ``x`` computed by ``self.linear``."""
        return self.linear(x)


# Quick check on a batch of two single-feature samples.
model = LR(1, 1)
print(model(torch.tensor([[2.], [3.]])))

In [None]:
# train a simple linear regression

# Synthetic data: the line y = -3x + 5 with standard normal noise added.
X = torch.arange(-3, 3, 0.1).view(-1, 1)
y = -3 * X + 5
y += torch.randn(y.size())


def forward(x):
    """Linear prediction using the module-level parameters ``w`` and ``b``."""
    return w * x + b


def criterion(pred, y):
    """Mean squared error between ``pred`` and ``y``."""
    return ((pred - y) ** 2).mean()


# Deliberately poor starting point so the descent is visible.
w = torch.tensor(10., requires_grad=True)
b = torch.tensor(-1., requires_grad=True)
lr, n_epochs = 0.1, 10

for _ in range(n_epochs):
    pred = forward(X)
    loss = criterion(pred, y)
    loss.backward()
    # Plain gradient-descent step; no_grad keeps the update out of the graph
    # (assigning through w.data would be an alternative).
    with torch.no_grad():
        w.sub_(lr * w.grad)
        b.sub_(lr * b.grad)
    # Gradients accumulate across backward() calls, so reset them every step.
    for param in (w, b):
        param.grad.zero_()

print(w, b)

In [None]:
# batch training with data loader

class Data(Dataset):
    """Noise-free samples of the line y = 2x - 3 on the grid [-3, 3) with step 0.1."""

    def __init__(self):
        inputs = torch.arange(-3, 3, 0.1).view(-1, 1)
        self.x = inputs
        self.y = 2 * inputs - 3
        self.len = inputs.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

# Restart both parameters from a poor initial guess.
w = torch.tensor(10., requires_grad=True)
b = torch.tensor(10., requires_grad=True)

dataset = Data()
dataloader = DataLoader(dataset=dataset, batch_size=4)


def train(n_epochs):
    """Run ``n_epochs`` of mini-batch gradient descent on the global ``w`` and ``b``.

    Uses the module-level ``forward``, ``criterion``, ``lr`` and ``dataloader``.
    """
    for _ in range(n_epochs):
        for batch_x, batch_y in dataloader:
            batch_loss = criterion(forward(batch_x), batch_y)
            batch_loss.backward()
            # In-place update outside of autograd tracking.
            with torch.no_grad():
                w.sub_(lr * w.grad)
                b.sub_(lr * b.grad)
            # Reset accumulated gradients before the next batch.
            w.grad.zero_()
            b.grad.zero_()


train(5)
print(w, b)

In [None]:
# train a model with an optimizer

criterion = nn.MSELoss()
model = LR(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
trainloader = DataLoader(dataset=dataset, batch_size=16)


def train(n_epochs):
    """Fit the module-level ``model`` for ``n_epochs`` passes over ``trainloader``."""
    for _ in range(n_epochs):
        for features, targets in trainloader:
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()
            # Clear the gradients so the next batch starts from zero.
            optimizer.zero_grad()


train(5)
print(model.state_dict())

In [None]:
# model with he initialized weights (for relu) + xavier + dropout + batch norm
# (dropout can sometimes be before the activation)
# (batch normalization: really big batch sizes, no dropout)

class Net(nn.Module):
    """Two-layer perceptron (1 -> 10 -> 1) with batch norm and a sigmoid output.

    The hidden layer is He-initialised (suited to ReLU) and the output layer is
    Xavier-initialised. A dropout module is registered but is not applied in
    ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(1, 10)
        self.batch_norm1 = nn.BatchNorm1d(10)  # one feature per hidden unit
        nn.init.kaiming_uniform_(self.linear1.weight, nonlinearity='relu')

        self.linear2 = nn.Linear(10, 1)
        nn.init.xavier_uniform_(self.linear2.weight)

        self.drop = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply linear -> ReLU -> batch norm -> linear -> sigmoid."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.batch_norm1(hidden)
        logits = self.linear2(hidden)
        return self.sigmoid(logits)


model = Net()
print(model(torch.tensor([[1.], [3.]])))

In [None]:
# detecting anomalies
class Autoencoder(nn.Module):
    """Symmetric fully connected autoencoder.

    Layer widths: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim, with
    ReLU between layers and no activation on the reconstruction.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU(),
        )
        # The last decoder layer is linear, so reconstructions are unbounded.
        self.decoder = nn.Sequential(
            nn.Linear(16, 32), nn.ReLU(),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` down to the 16-unit bottleneck and decode it back."""
        return self.decoder(self.encoder(x))

# sample data
# Seed NumPy explicitly: torch.manual_seed does not seed np.random, so without
# this the generated data (and every metric derived from it) changes per run.
rng = np.random.default_rng(42)
# 1000 "normal" rows ~ N(2, 1) and 50 "anomalous" rows ~ N(0, 2), 50 features each.
normal_data = rng.normal(2, 1, (1000, 50))
anomalous_data = rng.normal(0, 2, (50, 50))
data = torch.tensor(np.vstack([normal_data, anomalous_data]), dtype=torch.float, device=device)
data_loader = DataLoader(TensorDataset(data), batch_size=32, shuffle=True)

# training
model = Autoencoder(data.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(11):
    for batch in data_loader:
        # TensorDataset yields 1-tuples; the input doubles as the target.
        inputs = batch[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 5 == 0:
        # Reports the loss of the last mini-batch of the epoch.
        print(f"{epoch}: loss: {loss.item():.4f}")

# detecting anomalies
def detect_anomaly(data, threshold):
    """Flag one input as anomalous when its reconstruction loss is too high.

    Args:
        data: Tensor passed to the module-level ``model``.
        threshold: Cut-off for the loss returned by the module-level ``criterion``.

    Returns:
        bool: True when the reconstruction loss exceeds ``threshold``.
    """
    # Inference only: skip building an autograd graph for every call.
    with torch.no_grad():
        output = model(data)
        loss = criterion(output, data)
    return loss.item() > threshold

def detect_anomalies(data, threshold):
    """Flag every row of ``data`` whose mean squared reconstruction error is too high.

    Args:
        data: 2-D tensor (one sample per row) passed to the module-level ``model``.
        threshold: Cut-off applied to each row's mean squared error.

    Returns:
        Boolean tensor with one entry per row; True marks an anomaly.
    """
    # Inference only: skip autograd bookkeeping for the whole batch.
    with torch.no_grad():
        outputs = model(data)
        # reduction='none' keeps per-element errors so they can be averaged per row.
        losses = nn.functional.mse_loss(outputs, data, reduction='none').mean(dim=1)
    return losses > threshold

threshold = 2.0

# Row-by-row scan with the single-sample detector.
anomalies = [idx for idx, sample in enumerate(data) if detect_anomaly(sample, threshold)]

print(f'Anomalies detected at indices: {anomalies}')

# Batched detection scored against the known labels:
# the first 1000 rows are normal, the trailing 50 are anomalous.
preds = detect_anomalies(data, threshold).cpu()
y = np.zeros(1050, dtype=bool)
y[1000:] = True
for metric_name, metric_fn in (("accuracy", accuracy_score),
                               ("precision", precision_score),
                               ("recall", recall_score)):
    print(f"{metric_name}: ", metric_fn(y, preds))

In [None]:
# plot scores of thresholds

thresholds = np.arange(0.05, 5.0, 0.05)
score_acc, score_prec, score_rec = [], [], []

# Score the batched detector at every candidate threshold.
for threshold in thresholds:
    preds = detect_anomalies(data, threshold).cpu()
    for history, metric in ((score_acc, accuracy_score),
                            (score_prec, precision_score),
                            (score_rec, recall_score)):
        history.append(metric(y, preds))

# One curve per metric over the threshold grid.
for history, curve_label in ((score_acc, 'accuracy_score'),
                             (score_prec, 'precision_score'),
                             (score_rec, 'recall_score')):
    plt.plot(thresholds, history, label=curve_label)
plt.legend()
plt.show()

# Training Loop

In [12]:
# sample data
x_sample = torch.rand((10_000, 1), device=device)
# Binary target: is the polynomial 4x^5 - 3x^2 above -0.3?
y_sample = (4 * x_sample.pow(5) - 3 * x_sample.pow(2)) > -0.3
y_float = y_sample.type(torch.float)
# The last 1000 samples are held out for validation.
train_dataset = TensorDataset(x_sample[:-1000], y_float[:-1000])
valid_dataset = TensorDataset(x_sample[-1000:], y_float[-1000:])
train_loader = DataLoader(train_dataset, batch_size=512)
valid_loader = DataLoader(valid_dataset, batch_size=1024)

In [13]:
# a function to train a model
# you may need to convert tensors to float32 (before passing them to the criterion)
# and .to(device)

import matplotlib.pyplot as plt
from IPython.display import clear_output
import copy

def compute_error(model, data_loader, criterion, c_sum=False):
    """Average ``criterion`` over every sample served by ``data_loader``.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        data_loader: Iterable of ``(x, y)`` batches.
        criterion: Callable invoked as ``criterion(outputs, y)``.
        c_sum: Pass True when ``criterion`` already returns a per-batch sum.
            Otherwise its result is multiplied by the batch size before being
            accumulated.

    Returns:
        The accumulated criterion value divided by the number of samples seen.
    """
    model.eval()
    total, n_samples = 0, 0
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            batch_size = len(y)
            batch_value = criterion(model(x), y)
            if not c_sum:
                batch_value = batch_value * batch_size
            total = total + batch_value
            n_samples += batch_size
    return total / n_samples


def train_model(model: nn.Module,
              train_loader: DataLoader,
              valid_loader: DataLoader,
              num_epochs: int,
              optimizer: torch.optim.Optimizer,
              criterion,
              verbose: bool = True,
              verbose_plot: bool = False
              ) -> float:
    """Train ``model`` and restore the state of its best validation epoch.

    Args:
        model: Network to optimise in place.
        train_loader: Batches of ``(inputs, targets)`` used for the updates.
        valid_loader: Batches used to score the model after every epoch.
        num_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        verbose: Print the loss of every 10th mini-batch and a per-epoch summary.
        verbose_plot: Also evaluate the training loss each epoch and plot both
            loss curves once training has finished.

    Returns:
        The lowest validation loss observed, as a Python float
        (``inf`` if no epoch improved on the initial value).
    """
    best_epoch = None
    best_state = None
    best_val_loss = np.inf
    train_losses, valid_losses = [], []

    for epoch in range(num_epochs):
        # compute_error() leaves the model in eval mode, so re-enable training mode.
        model.train()
        for _iter, (inputs, targets) in enumerate(train_loader, start=1):
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if verbose and _iter % 10 == 0:
                print(f"Minibatch {_iter:>6}    |  loss {loss.item():>5.2f}  |")

        # Convert to a plain float so the declared return type holds and no
        # device tensors are kept alive in the loss histories.
        val_loss = float(compute_error(model, valid_loader, criterion))

        if val_loss < best_val_loss:
            best_epoch = epoch
            best_val_loss = val_loss
            # Snapshot the whole state dict, not just parameters(): buffers such
            # as BatchNorm running statistics must be restored together with the
            # weights they were estimated for.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

        if verbose:
            clear_output(True)
            m = f"After epoch {epoch:>2} | valid loss: {val_loss:>5.2f}"
            print("{0}\n{1}\n{0}".format("-" * len(m), m))

        if verbose_plot:
            train_losses.append(float(compute_error(model, train_loader, criterion)))
            valid_losses.append(val_loss)

    if best_state is not None:
        if verbose:
            print(f"\nLoading best params on validation set in epoch {best_epoch} with loss {best_val_loss:.2f}")
        model.load_state_dict(best_state)

    if verbose_plot:
        plt.figure(figsize=(6, 3))
        plt.plot(train_losses, c='b', label='train')
        plt.plot(valid_losses, c='r', label='valid')
        plt.grid(ls=':')
        plt.legend()
        plt.show()

    return best_val_loss

In [None]:
# train a single model

model = Net().to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr=0.01)
# BCELoss takes probabilities as input.
criterion = nn.BCELoss()

# The returned best validation loss is displayed as the cell output.
train_model(
    model,
    train_loader,
    valid_loader,
    num_epochs=10,
    optimizer=optimizer,
    criterion=criterion,
    verbose=True,
    verbose_plot=True,
)

In [None]:
# split the dataset into k folds and train k models on them

def train_kfold(Net, dataset, n_splits=5, num_epochs=10, batch_size=32, learning_rate=0.01,
                random_state=None):
    """Train one model per fold of a shuffled K-fold split.

    Args:
        Net: Zero-argument callable (e.g. a model class) returning a fresh model.
        dataset: Indexable dataset split into folds.
        n_splits: Number of folds, and therefore of trained models.
        num_epochs: Epochs each model is trained for.
        batch_size: Batch size of the per-fold training and validation loaders.
        learning_rate: Learning rate of the NAdam optimizer.
        random_state: Seed for the fold shuffling; None (the default) keeps the
            split unseeded, as before.

    Returns:
        Tuple ``(models, scores)``: an ``nn.ModuleList`` with one trained model
        per fold and the list of their validation losses.
    """
    models = nn.ModuleList()
    scores = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_ids, val_ids in kf.split(dataset):
        train_sub = Subset(dataset, train_ids)
        valid_sub = Subset(dataset, val_ids)
        train_loader = DataLoader(train_sub, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_sub, batch_size=batch_size, shuffle=False)

        # Every fold gets its own freshly initialised model and optimizer.
        model = Net().to(device)
        optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)
        criterion = nn.BCELoss()

        train_model(model, train_loader, valid_loader, num_epochs=num_epochs,
                    optimizer=optimizer, criterion=criterion, verbose=True)

        # Score the model on its own held-out fold.
        scores.append(compute_error(model, valid_loader, criterion).detach().cpu())
        models.append(model)

    return models, scores

# training the models and checking the scores (seeded so the folds are reproducible)
models, scores = train_kfold(Net, train_dataset, n_splits=10, num_epochs=1, batch_size=256,
                             random_state=42)
clear_output(False)
print(scores)

In [None]:
# a function to predict outputs with k models

def predict_ensemble(models, x):
    """Average the predictions of several models on the same input.

    Every model is switched to eval mode and run without gradient tracking.

    Args:
        models: Iterable of models, each callable as ``model(x)``.
        x: Input batch passed unchanged to every model.

    Returns:
        Element-wise mean of the individual model outputs.
    """
    per_model = []
    with torch.no_grad():
        for member in models:
            member.eval()
            per_model.append(member(x))
    return torch.stack(per_model).mean(dim=0)


def evaluate_ensemble(models, data_loader, criterion, c_sum=False):
    """Average ``criterion`` of the ensemble prediction over ``data_loader``.

    Args:
        models: Models whose predictions are averaged by ``predict_ensemble``.
        data_loader: Iterable of ``(x, y)`` batches.
        criterion: Callable invoked as ``criterion(outputs, y)``.
        c_sum: Pass True when ``criterion`` already returns a per-batch sum.
            Otherwise its result is multiplied by the batch size before being
            accumulated.

    Returns:
        The accumulated criterion value divided by the number of samples seen.
    """
    total, n_samples = 0, 0
    with torch.no_grad():
        for x, y in data_loader:
            batch_size = len(y)
            batch_value = criterion(predict_ensemble(models, x), y)
            if not c_sum:
                batch_value = batch_value * batch_size
            total = total + batch_value
            n_samples += batch_size
    return total / n_samples


print(evaluate_ensemble(models, valid_loader, criterion))

In [None]:
# check accuracy on a single and multiple models

# binary output
def accuracy_binary(outputs, y, threshold=0.5):
    """Count the correct binary predictions in a batch.

    Despite the name this returns a count, not a ratio; divide by the number
    of samples to obtain the accuracy.

    Args:
        outputs: Tensor of scores; values above ``threshold`` predict class 1.
        y: Tensor of 0/1 targets with the same shape as ``outputs``.
        threshold: Decision threshold applied to ``outputs``.

    Returns:
        Integer tensor holding the number of matches, summed over the first
        dimension.
    """
    pred = outputs > threshold
    # Tensor reduction instead of Python's built-in sum(), which iterates the
    # tensor row by row in the interpreter.
    return (pred == y).sum(dim=0)

# multiple outputs
def accuracy_multiple(outputs, y):
    """Count the correct multi-class predictions in a batch.

    Despite the name this returns a count, not a ratio; divide by the number
    of samples to obtain the accuracy.

    Args:
        outputs: 2-D tensor of per-class scores; the arg-max over dim 1 is the
            predicted class.
        y: Tensor of target class indices compared element-wise with the
            predictions.

    Returns:
        Integer tensor holding the number of matches, summed over the first
        dimension.
    """
    pred = outputs.argmax(dim=1)
    # Tensor reduction instead of Python's built-in sum(), which iterates the
    # tensor element by element in the interpreter.
    return (pred == y).sum(dim=0)

# c_sum=True because accuracy_binary already returns a per-batch count.
single_model_accuracy = compute_error(models[0], valid_loader, accuracy_binary, c_sum=True)
ensemble_accuracy = evaluate_ensemble(models, valid_loader, accuracy_binary, c_sum=True)
print(single_model_accuracy)
print(ensemble_accuracy)

# Convolutional Neural Networks

In [None]:
# using torchvision

model = torchvision.models.resnet50(weights='ResNet50_Weights.DEFAULT').to(device)

# normalize data - same for every torchvision pretrained model !
# and an example of some other transforms
composed = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

# freeze the pretrained backbone
for param in model.parameters():
    param.requires_grad = False

# change the last layer to fit our needs
# (display `model` first to check what the chosen network looks like)
num_classes = 3
model.fc = nn.Linear(model.fc.in_features, num_classes).to(device)

# optimize the parameters of the last layer
optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.003)
# train_model(...)

# prediction
img = PIL.Image.fromarray((np.random.rand(224, 224, 3) * 255).astype(np.uint8))
# The input must live on the same device as the model; add the batch dimension first.
batch = composed(img).unsqueeze(0).to(device)
# Eval mode makes BatchNorm use its running statistics, and no_grad skips
# autograd bookkeeping for pure inference.
model.eval()
with torch.no_grad():
    logits = model(batch)
logits

In [None]:
# an example of a CNN (28x28x3 -> 10 classes)

class Net(nn.Module):
    """Small CNN mapping 3x28x28 images to a softmax distribution over 10 classes."""

    def __init__(self):
        super().__init__()
        # Spatial size: 28 -> 26 -> 24 (unpadded 3x3 convs) -> 11 (3x3 pool, stride 2).
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=0)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2)
        self.linear_layers = nn.Sequential(
            nn.Linear(64 * 11 * 11, 128),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Return class probabilities (each row sums to 1)."""
        features = F.relu(self.conv1(x))
        features = self.pool(self.conv2(features))
        features = F.relu(features)
        logits = self.linear_layers(torch.flatten(features, 1))
        return F.softmax(logits, dim=1)


model = Net()
model(torch.zeros((1, 3, 28, 28)))

In [None]:
# an example of a CNN with batch normalization (16x16x1 -> 10 classes)

class Net(nn.Module):
    """CNN with batch normalisation mapping 1x16x16 inputs to 10 outputs.

    Args:
        out_1: Number of channels produced by the first convolution.
        out_2: Number of channels produced by the second convolution.
    """

    def __init__(self, out_1=16, out_2=32):
        super().__init__()
        # Stage 1: "same"-padded conv keeps 16x16, pooling halves it to 8x8.
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=out_1, kernel_size=5, stride=1, padding="same")
        self.conv1_bn = nn.BatchNorm2d(out_1)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        # Stage 2: 8x8 -> 4x4.
        self.cnn2 = nn.Conv2d(in_channels=out_1, out_channels=out_2, kernel_size=5, stride=1, padding="same")
        self.conv2_bn = nn.BatchNorm2d(out_2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Classifier head on the flattened out_2 x 4 x 4 feature map.
        self.fc1 = nn.Linear(out_2 * 4 * 4, 10)
        self.fc1_bn = nn.BatchNorm1d(10)

    def forward(self, x):
        """Return the batch-normalised outputs of the final linear layer."""
        stage1 = self.maxpool1(self.conv1_bn(torch.relu(self.cnn1(x))))
        stage2 = self.maxpool2(self.conv2_bn(torch.relu(self.cnn2(stage1))))
        flat = stage2.view(stage2.shape[0], -1)
        return self.fc1_bn(self.fc1(flat))


model = Net()
model(torch.rand((3, 1, 16, 16)))

In [None]:
# an example of a convolutional neural network with skip connections

class ConvSkipNet(nn.Module):
    """CNN for 3x16x16 inputs with one skip connection around the first two convs."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        # The padded convs keep the 16x16 size, so the head sees 16*16*32 features.
        self.fc = nn.Linear(16 * 16 * 32, 10)

        # 1x1 conv that lifts the 3-channel input to 16 channels so it can be
        # added to the output of conv2.
        self.skip_conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=1)

    def forward(self, x):
        """Return the 10 outputs of the final linear layer for each sample."""
        shortcut = self.skip_conv(x)
        hidden = F.relu(self.conv1(x))
        hidden = F.relu(self.conv2(hidden) + shortcut)
        hidden = F.relu(self.conv3(hidden))
        flat = hidden.view(hidden.size(0), -1)
        return self.fc(flat)


model = ConvSkipNet()
input_data = torch.randn(2, 3, 16, 16)
print(model(input_data))

In [None]:
# Grad-CAM for explainability 

# similar to previous cell with torchvision + defining dataloader
import PIL.Image


# Resize, convert to a tensor, then normalise channel-wise.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
composed = transforms.Compose(preprocessing_steps)

# One image per batch, served in folder order.
dataset = torchvision.datasets.ImageFolder(root='./data/imgs/', transform=composed)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)


# defining our model with gradient hooks
# densenet: https://medium.com/@stepanulyanin/implementing-grad-cam-in-pytorch-ea0937c31e82#:~:text=case%20of%20the-,DenseNet,-)%20we%20are%20going
class Net_Grad(nn.Module):
    """VGG19 wrapper that records the gradient of its last conv activations (Grad-CAM)."""

    def __init__(self):
        super().__init__()
        # check the vgg19 architecture first
        self.model = torchvision.models.vgg19(weights='VGG19_Weights.IMAGENET1K_V1')
        # Feature extractor up to (not including) index 36 of model.features;
        # the pooling step that follows is applied separately in forward().
        self.features_conv = self.model.features[:36]
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
        self.classifier = self.model.classifier
        # Filled by activations_hook during the backward pass.
        self.gradients = None

    def activations_hook(self, grad):
        """Store the gradient flowing into the conv activations."""
        self.gradients = grad

    def forward(self, x):
        """Classify ``x`` and hook the conv activations for a later backward pass."""
        x = self.features_conv(x)
        # register_hook raises on tensors that do not require grad (e.g. under
        # torch.no_grad()), so only hook when a backward pass is possible.
        if x.requires_grad:
            x.register_hook(self.activations_hook)
        x = self.max_pool(x)
        # Flatten per sample rather than assuming a batch of exactly one image.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def get_activations_gradient(self):
        """Return the gradient captured by the last backward pass (None before it)."""
        return self.gradients

    def get_activations(self, x):
        """Return the conv activations for ``x`` (the tensor hooked in forward)."""
        return self.features_conv(x)


# getting the heatmap
model = Net_Grad()
model.eval()
img, _ = next(iter(dataloader))
preds = model(img)
pred = preds.argmax(dim=1)

# Back-propagate the score of the predicted class to fill the activation hook.
preds[:, pred].backward()
gradients = model.get_activations_gradient()
# One weight per channel: the gradient averaged over batch and spatial dims.
pooled_gradient = torch.mean(gradients, dim=[0, 2, 3])
activations = model.get_activations(img).detach()
# Weight every channel by its pooled gradient via broadcasting, so the number
# of channels is not hard-coded.
activations = activations * pooled_gradient[None, :, None, None]
heatmap = torch.mean(activations, dim=1).squeeze()
# Keep only positive evidence and scale to [0, 1], staying in torch instead of
# passing a tensor through np.maximum and back into torch.max.
heatmap = torch.clamp(heatmap, min=0)
heatmap_max = torch.max(heatmap)
if heatmap_max > 0:  # avoid dividing by zero on an all-zero map
    heatmap = heatmap / heatmap_max
# matplotlib and PIL work on NumPy arrays.
heatmap = heatmap.cpu().numpy()
plt.matshow(heatmap)


# interpolate the heat-map onto the image 
image = PIL.Image.open('./data/imgs/elephant/elephant.jpg').convert('RGBA')
heatmap_colored = (plt.cm.jet(heatmap) * 255).astype(np.uint8)
heatmap_image = PIL.Image.fromarray(heatmap_colored).resize(image.size, PIL.Image.BILINEAR)
blended_image = PIL.Image.blend(image, heatmap_image, alpha=0.5)
blended_image

# Recurrent Neural Networks

In [None]:
# a simple recurrent net with one output for each input

class Net(torch.nn.Module):
    """Bidirectional LSTM over padded variable-length sequences, one output per sequence."""

    def __init__(self):
        super().__init__()
        # batch_first=True matches the (batch, seq, feature) layout used when
        # packing and unpacking in forward().
        self.rnn1 = nn.LSTM(500, 128, batch_first=True, num_layers=3, bidirectional=True)
        # Input width 256 = 2 directions x 128 hidden units.
        self.layers = nn.Sequential(
            nn.Linear(256, 128),
            nn.Linear(128, 1)
        )

    def forward(self, x, input_lengths):
        """Return the output at the last valid timestep of every sequence.

        Args:
            x: Padded batch of shape (batch_size, seq_len, 500).
            input_lengths: True length of each sequence (CPU tensor or list).

        Returns:
            Tensor of shape (batch_size, 1).
        """
        packed_inputs = pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
        packed_outputs, _ = self.rnn1(packed_inputs)
        outputs, lengths = pad_packed_sequence(packed_outputs, batch_first=True)
        # Index -1 of the padded output is padding for every sequence shorter
        # than the longest one, so pick each sequence's own last valid step.
        batch_idx = torch.arange(outputs.size(0), device=outputs.device)
        last_idx = (lengths - 1).to(outputs.device)
        last_outputs = outputs[batch_idx, last_idx]
        return self.layers(last_outputs)

model = Net().to(device)
# Padded batch laid out as (batch_size, seq_len, input_size).
x = torch.randn(3, 5, 500, device=device)
# True sequence lengths, drawn from 1..4.
input_lengths = torch.randint(low=1, high=5, size=(3,))
model(x, input_lengths), input_lengths

In [None]:
# a recurrent net with varying outputs

def loss_fn(output, target, mask):
    """Mean squared error restricted to the positions selected by ``mask``.

    Args:
        output: Predictions.
        target: Ground truth with the same shape as ``output``.
        mask: Boolean tensor with the same shape; True keeps a position.

    Returns:
        Scalar MSE over the selected elements.
    """
    return F.mse_loss(output[mask], target[mask])


class Net(torch.nn.Module):
    """Stacked LSTM -> RNN -> GRU over packed sequences with a per-timestep head.

    Args:
        n_features: Size of each input timestep.
        n_outputs: Width of the final linear layer.
    """

    def __init__(self, n_features=500, n_outputs=1):
        super().__init__()
        # Packed inputs are used in forward(), so batch_first does not affect
        # how the recurrent layers read them.
        self.rnn1 = nn.LSTM(n_features, 128, batch_first=False, num_layers=3, bidirectional=True)
        self.rnn2 = nn.RNN(256, 256, batch_first=False)
        self.rnn3 = nn.GRU(256, 128, batch_first=False, bidirectional=True)
        self.layers = nn.Sequential(
            nn.Linear(256, 128),
            # LeakyReLU's first argument is negative_slope, not a layer width:
            # LeakyReLU(128) would multiply negative inputs by 128.
            nn.LeakyReLU(),
            nn.Linear(128, n_outputs)
        )

        nn.init.kaiming_uniform_(self.layers[0].weight, nonlinearity='relu')

    def forward(self, x, input_lengths):
        """Return per-timestep outputs, truncated to the longest sequence length.

        Args:
            x: Padded batch of shape (batch_size, seq_len, n_features).
            input_lengths: True length of each sequence (CPU tensor or list).

        Returns:
            Tensor of shape (batch_size, max_len, min(n_outputs, max_len)),
            where max_len is the longest length in ``input_lengths``.
        """
        packed_inputs = pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
        packed_outputs, _ = self.rnn1(packed_inputs)
        packed_outputs, _ = self.rnn2(packed_outputs)
        packed_outputs, _ = self.rnn3(packed_outputs)
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)
        outputs = self.layers(outputs)
        # batch_sizes has one entry per timestep, so its length is the longest
        # sequence in the batch; keep that many output features.
        outputs = outputs[:, :, :len(packed_inputs.batch_sizes)]
        return outputs


model = Net(n_outputs=5).to(device)

# Padded batch laid out as (batch_size, seq_len, input_size).
sample_data = torch.randn(3, 5, 500, device=device)
# True sequence lengths, drawn from 1..4.
sample_lengths = torch.randint(1, 5, (3,))

# output: max input cnt in batch ^2
# for each sample in a batch: input cnt ^2 + padding
model(sample_data, sample_lengths), sample_lengths

In [None]:
# time series dataset for training recurrent networks
# (const seq_length and predicting next var in time series)

# dataset
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset: ``seq_length`` consecutive steps predict the next one.

    Args:
        data: Indexable series (tensor, array or list) ordered in time.
        seq_length: Number of consecutive steps in every input window.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Each window needs one more step after it to serve as the target.
        return len(self.data) - self.seq_length

    @staticmethod
    def _as_float_tensor(values):
        """Return a float copy of ``values`` on the module-level ``device``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(existing_tensor) copy-constructs and emits a
            # UserWarning on every item; clone().detach() is the equivalent copy.
            return values.clone().detach().to(dtype=torch.float, device=device)
        return torch.tensor(values, dtype=torch.float, device=device)

    def __getitem__(self, idx):
        """Return ``(window, target)``: steps ``idx..idx+seq_length-1`` and the step after."""
        x = self.data[idx:idx+self.seq_length]
        y = self.data[idx+self.seq_length]
        return self._as_float_tensor(x), self._as_float_tensor(y)

# model
class Net(torch.nn.Module):
    """Single-layer LSTM that predicts from the last timestep of a fixed-length window.

    Args:
        n_features: Size of each input timestep.
        n_outputs: Width of the final linear layer.
    """

    def __init__(self, n_features=500, n_outputs=1):
        super().__init__()
        self.rnn1 = nn.LSTM(n_features, 128, batch_first=True, num_layers=1)
        self.layers = nn.Sequential(
            nn.Linear(128, 128),
            # LeakyReLU's first argument is negative_slope, not a layer width:
            # LeakyReLU(128) would multiply negative inputs by 128.
            nn.LeakyReLU(),
            nn.Linear(128, n_outputs)
        )

        nn.init.kaiming_uniform_(self.layers[0].weight, nonlinearity='relu')

    def forward(self, x):
        """Map a (batch_size, seq_len, n_features) batch to (batch_size, n_outputs)."""
        x, _ = self.rnn1(x)
        x = x[:, -1, :] # if you want only the last state
        x = self.layers(x)
        return x

# some sample data
# 100 timesteps with 2 features each.
data = torch.linspace(0, 100, 200).reshape(-1, 2)

# dataset and data loader
seq_length = 5
dataset = TimeSeriesDataset(data, seq_length)
data_loader = DataLoader(dataset, batch_size=3, shuffle=True)

# sample "training loop": a single forward pass on the first batch
model = Net(n_outputs=4, n_features=2).to(device)
x, y = next(iter(data_loader))
outputs = model(x)

outputs

# Natural Language Processing

In [None]:
# a simple word2vec model with gensim

# pretrained
w2v = api.load('glove-wiki-gigaword-50')
word_embedding = w2v['king']
print(word_embedding[:7])


sentences = [
    ["cat", "say", "meow"],
    ["dog", "say", "woof", "woof"],
    ["cat", "chase", "mouse"],
    ["dog", "chase", "cat"]
]

# window: maximum distance between the current and predicted word within a sentence
# min_count: Ignores all words with total frequency lower than this
# workers: Number of worker threads to use - must be a positive count; with -1
#          no worker threads are started, so training does not update the vectors
# The corpus is not passed to the constructor, which would already build the
# vocabulary and train once before the explicit calls below repeat both steps.
model = Word2Vec(vector_size=3, window=5, min_count=1, workers=1)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=100)

embed_vector = model.wv['cat']
print(embed_vector, embed_vector.shape)

most_similar = model.wv.most_similar('cat', topn=3)
print(most_similar)

similarity_cat_dog = model.wv.similarity('cat', 'dog')
print(similarity_cat_dog)

distance_cat_dog = model.wv.distance('cat', 'dog')
print(distance_cat_dog)

words_similar_by_vector = model.wv.similar_by_vector(embed_vector, topn=3)
print(words_similar_by_vector)

odd_one_out = model.wv.doesnt_match(['cat', 'dog', 'mouse', 'say'])
print(odd_one_out)

# analogy query: dog + meow - cat
result = model.wv.most_similar(positive=['dog', 'meow'], negative=['cat'], topn=3)
print(result)

vocabulary = list(model.wv.index_to_key)
print(vocabulary)

In [None]:
# hugging face transformers library 

# Sentiment analysis with an explicitly chosen checkpoint.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
classifier = pipeline("sentiment-analysis", model=model_name)
result = classifier("The actors were very convincing")
print(result)

# Text classification on a sentence pair joined by [SEP].
classifier = pipeline("text-classification", model="huggingface/distilbert-base-uncased-finetuned-mnli")
print(classifier("She loves me. [SEP] She loves me not."))


# Text generation: three continuations of the same prompt.
generator = pipeline('text-generation', model='gpt2')
result = generator(
    "Once upon a time in a land far, far away",
    max_length=50,
    truncation=True,
    num_return_sequences=3,
)
print(result)


# The same checkpoint used manually: tokenize, run the model, convert logits to probabilities.
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentence_pairs = [
    "I like soccer. [SEP] We all love soccer!",
    "Joe lived for a very long time. [SEP] Joe is old.",
]
token_ids = tokenizer(sentence_pairs, padding=True, return_tensors='pt')
print(token_ids['input_ids'])
print(token_ids['attention_mask'])

model = AutoModelForSequenceClassification.from_pretrained(model_name)
with torch.no_grad():
    outputs = model(**token_ids)
    logits = outputs.logits
probabilities = F.softmax(logits, dim=-1)
print(probabilities)

In [None]:
# a simple transformer in PyTorch

transformer_model = nn.Transformer(
    d_model=50,
    nhead=10,
    num_encoder_layers=12,
    batch_first=True,
)
# Both inputs are laid out as (batch_size, seq_len, embedding_dim).
src = torch.rand(32, 10, 50)
tgt = torch.rand(32, 10, 50)
out = transformer_model(src, tgt)
print(out.shape)

# Training Hugging Face Transformers

In [None]:
# fine tuning hugging face transformer for a task using Trainer

# sample data
X = ["The cat is white", "I like dogs"] * 10
y = [0, 3] * 10

# prepare the data
class TransDataset(Dataset):
    """Text-classification dataset that tokenizes each sample on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every encoding is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize by calling the tokenizer directly, the same way the rest of
        # this notebook does, instead of going through encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# define tokenizer, dataset and model
max_len = 50
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = TransDataset(texts=X, labels=y, tokenizer=tokenizer, max_len=max_len)
# num_labels=4 because the sample labels go up to 3.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model.to(device)

# train
training_args = TrainingArguments(
    output_dir='results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    warmup_steps=30,
    weight_decay=0.01,
)

# The same dataset is used for training and evaluation in this demo.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

trainer.train()

# predict on the two sample sentences, tokenized like the training data
validation_texts = ["The cat is white", "I like dogs"]
encoding_valid = tokenizer(
    validation_texts,
    add_special_tokens=True,
    max_length=max_len,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
encoding_valid = encoding_valid.to(device)

model.eval()
with torch.no_grad():
    outputs = model(**encoding_valid)
    print(outputs.logits)

In [None]:
# fine tuning hugging face transformer for a task using PyTorch

# define tokenizer, dataset and model - the same as previously
max_len = 50
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = TransDataset(texts=X, labels=y, tokenizer=tokenizer, max_len=max_len)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model.to(device)

data_loader = DataLoader(dataset, batch_size=8, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# define scheduler
num_epochs = 7
num_training_steps = num_epochs * len(data_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps
)


# train
model.train()
for epoch in range(num_epochs):
    for batch in data_loader:
        # The model lives on `device`, so every tensor of the batch must too;
        # feeding CPU tensors to a model on the GPU raises a device-mismatch error.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Loss of the last batch of the epoch and the current learning rate(s),
    # read through the public accessor rather than the private attribute.
    print(loss.item(), lr_scheduler.get_last_lr())


# evaluate 
all_preds, all_labels = [], []

model.eval()
for batch in data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_preds.extend(predictions.detach().cpu().numpy())
    all_labels.extend(batch['labels'].detach().cpu().numpy())

print(accuracy_score(all_labels, all_preds))