If you get the error "No CUDA GPUs are available," go to Runtime > Change runtime type > select GPU.

In [1]:
# install datasets
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/547.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

In [2]:
from datasets import load_dataset
imdb = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
imdb['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [5]:
def sentiment2label(sentiment):
    return 'positive' if sentiment == 1 else 'negative'

def label2sentiment(label):
    return 1 if label == 'positive' else 0

In [6]:
def clean_text(text):
    return text.replace('<br />', ' ')

def get_text_and_labels(data):
    texts = [clean_text(sample['text']) for sample in data]
    labels = [sample['label'] for sample in data]
    return texts, labels

In [7]:
texts, labels = get_text_and_labels(imdb['train'])
test_texts, test_labels = get_text_and_labels(imdb['test'])

In [8]:
texts[0], labels[0]

('I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.  The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.  What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it\'s not s

In [9]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
DEVICE = 'cuda'

In [11]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [12]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=0)

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
max_len = 128
batch_size = 16

train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_len)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
train_texts[0]

"Saw this film ran in the wee hours on TCM. Several problems with the film were apparent from what I saw. First, the adults did not age when the children did for 10 years. Several parts of the film had continuity problems & for some reason the actor who played the youngest son looked like the oldest when the 10 years passed.   The copy I saw was missing about 20 minutes or so, at least a huge gap with black screen appeared. It is too bad, because even though the script left something to be desired, Bergman & Russell both did fine in the film in their roles. It is a shame the large chunk is missing, but what is here is watchable.  I just wish it was all intact. The script makes little sense in that Bergman's character is sent away when the kids are small but then brought back to take care of them when they are adults? Some of the time lines don't make sense either. There is a stock market crash that resembles 1929 but the kids grow up to fight in World War 1. All the acting by the suppo

In [14]:
# check if the data is loaded correctly
batch = next(iter(train_loader))

In [15]:
batch

{'input_ids': tensor([[  101,  2471,  2296,  ...,     0,     0,     0],
         [  101,  2065,  2017,  ...,  1997, 11680,   102],
         [  101,  2168,  2214,  ...,  2006,  1996,   102],
         ...,
         [  101,  2023,  3185,  ...,  3124,  7906,   102],
         [  101,  1045,  2109,  ...,  2001,  2035,   102],
         [  101,  1045,  2428,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'label': tensor([1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1])}

In [16]:
embedding_model = AutoModel.from_pretrained('distilbert/distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [17]:
# pass a single input through the model
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
output = embedding_model(input_ids, attention_mask)

In [18]:
output.last_hidden_state.shape

torch.Size([16, 128, 768])

In [19]:
class Classifier(nn.Module):
    def __init__(self, embedding_model, n_classes, dropout_p=0.1, train_embedder=True):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(embedding_model)
        self.dropout = nn.Dropout(dropout_p)
        self.linear = nn.Linear(self.embedding_model.config.hidden_size, n_classes)

        if not train_embedder:
            for param in self.embedding_model.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

In [20]:
def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    # For M1 Macs
    elif torch.backends.mps.is_available():
        return torch.device('mps')
    else:
        return torch.device('cpu')

device = get_device()
print(device)

cuda


In [21]:
# pass a single batch through the model
model = Classifier('distilbert/distilbert-base-uncased', 2, train_embedder=True).to(device)

input_ids = train_loader.dataset[0]['input_ids'].unsqueeze(0).to(device)
attention_mask = train_loader.dataset[0]['attention_mask'].unsqueeze(0).to(device)
with torch.no_grad():
    output = model(input_ids, attention_mask)
output

tensor([[-0.0854, -0.5905]], device='cuda:0')

In [22]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [23]:
from tqdm import tqdm

In [24]:
for epoch in range(epochs):
    model.train()
    losses = []
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())

    print(f'Epoch {epoch + 1}, train loss: {sum(losses) / len(losses)}')

    model.eval()
    losses = []
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            losses.append(loss.item())

            probs = nn.functional.softmax(outputs, dim=1)

            _, predicted = torch.max(probs, dim=1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print(f'Epoch {epoch + 1}, validation loss: {sum(losses) / len(losses)}, accuracy: {accuracy_score(true_labels, predictions)}')


100%|██████████| 1250/1250 [04:00<00:00,  5.19it/s]


Epoch 1, train loss: 0.34779833637475965


100%|██████████| 313/313 [00:21<00:00, 14.57it/s]


Epoch 1, validation loss: 0.3228065661859874, accuracy: 0.864


100%|██████████| 1250/1250 [04:01<00:00,  5.18it/s]


Epoch 2, train loss: 0.2044195224881172


100%|██████████| 313/313 [00:21<00:00, 14.56it/s]


Epoch 2, validation loss: 0.31495681233085193, accuracy: 0.8808


100%|██████████| 1250/1250 [04:01<00:00,  5.18it/s]


Epoch 3, train loss: 0.10854224197156727


100%|██████████| 313/313 [00:21<00:00, 14.36it/s]

Epoch 3, validation loss: 0.38828613937641987, accuracy: 0.881





In [25]:
batch

{'input_ids': tensor([[  101,  2023,  2003,  ...,     0,     0,     0],
         [  101,  2023,  2143,  ...,  1996,  2974,   102],
         [  101,  1045,  2387,  ...,  1010,  2130,   102],
         ...,
         [  101,  2129,  2071,  ...,     0,     0,     0],
         [  101,  9792, 21306,  ...,  2322,  2781,   102],
         [  101,  2023,  2326,  ...,  2047,  7849,   102]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'label': tensor([1, 1, 1, 0, 1, 0, 0, 0])}

In [26]:
eval_text = ['This is a very bad code repository. It contains no README, no tests and the code is all over the place. I would not recommend using this code for anything. ']

In [None]:
eval_text = ['This is a great code repository. It contains the README, the unit tests run and everything is nicely documented.']

In [32]:
eval_text = ['The repository is a solid project with a strong foundation. It excels in documentation, code quality, and community engagement. However, addressing performance issues, adding more examples, updating dependencies, and improving the user interface would significantly enhance its usability and attractiveness. With these improvements, the project has the potential to become a go-to resource in its field.']

In [None]:
eval_text = ['The repository has significant issues that hinder its usability and appeal. The poor documentation, messy code, inactive community, lack of testing, performance issues, outdated dependencies, and unintuitive UI all contribute to a negative experience. Significant improvements are needed in these areas to make the project viable and useful to the community.']

In [33]:
eval_data = TextClassificationDataset(eval_text, [1], tokenizer, max_len)

In [34]:
eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=True)

In [35]:
batch = next(iter(eval_loader))

In [36]:
batch

{'input_ids': tensor([[  101,  1996, 22409,  2003,  1037,  5024,  2622,  2007,  1037,  2844,
           3192,  1012,  2009, 24970,  2015,  1999, 12653,  1010,  3642,  3737,
           1010,  1998,  2451,  8147,  1012,  2174,  1010, 12786,  2836,  3314,
           1010,  5815,  2062,  4973,  1010,  2039, 16616, 12530, 15266,  1010,
           1998,  9229,  1996,  5310,  8278,  2052,  6022, 11598,  2049,  3915,
           8553,  1998,  8702,  2791,  1012,  2007,  2122,  8377,  1010,  1996,
           2622,  2038,  1996,  4022,  2000,  2468,  1037,  2175,  1011,  2000,
           7692,  1999,  2049,  2492,  1012,   102,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [40]:
with torch.no_grad():
  input_ids = batch['input_ids'].to(device)
  attention_mask = batch['attention_mask'].to(device)
  labels = batch['label'].to(device)

  outputs = model(input_ids, attention_mask)
  loss = loss_fn(outputs, labels)
  losses.append(loss.item())

  probs = nn.functional.softmax(outputs, dim=1)
  print(probs)
  _, predicted = torch.max(probs, dim=1)
  print(predicted.cpu().numpy())

tensor([[0.4201, 0.5799]], device='cuda:0')
[1]


In [39]:
torch.save(model, "./bert_classifier.pt")

In [43]:
import os
size = os.path.getsize("./bert_classifier.pt")
print(size/(1024*1024))

253.21990585327148


In [49]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [50]:
os.system("cp bert_classifier.pt ./drive/MyDrive/bert_classifier.pt")

0