In [1]:
# Shell escape: print the NVIDIA driver/GPU status table for this machine.
!nvidia-smi

Fri Jun 28 13:02:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:5E:00.0 Off |                  Off |
| 34%   32C    P5              16W / 260W |      6MiB / 49152MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000                Off | 00000000:86:00.0 Off |  

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the training CSV, then rename its columns positionally:
# first column -> "label", second column -> "text".
train_csv_path = '/po1/bannapol/jumbo/train.csv'  # Adjust the path to your dataset
data = pd.read_csv(train_csv_path)
data.columns = ["label", "text"]

In [5]:
# Split the data into an 80% training part and a 20% held-out part.
# - random_state pins the shuffle so a Restart & Run All reproduces the same
#   split (and therefore comparable metrics).
# - stratify keeps the label proportions the same in both parts.
SPLIT_SEED = 42
all_texts = data['text'].tolist()
all_labels = data['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=all_labels,
)


In [6]:
# Load the tokenizer that belongs to the pre-trained checkpoint
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)




In [7]:
# Encode both splits with the same tokenizer settings (truncation and padding on).
tokenizer_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

In [8]:
# Create a PyTorch dataset
class BeerReviewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [9]:
# Wrap each split's encodings and labels in a dataset object
train_dataset = BeerReviewsDataset(encodings=train_encodings, labels=train_labels)
test_dataset = BeerReviewsDataset(encodings=test_encodings, labels=test_labels)

In [10]:
# Load pre-trained BERT with a 3-class sequence-classification head
model_checkpoint = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Use CUDA when it is available, otherwise fall back to the CPU, then move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
# Define the training arguments
training_config = {
    # Where results are written
    'output_dir': '/po1/bannapol/jumbo/log/results',
    # Schedule and batch sizes
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    # Optimisation settings
    'warmup_steps': 500,
    'weight_decay': 0.01,
    # Logging: directory and interval in steps
    'logging_dir': '/po1/bannapol/jumbo/log/logs',
    'logging_steps': 10,
}
training_args = TrainingArguments(**training_config)

In [13]:
# Define the compute metrics function
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object exposing `predictions` (per-class scores, arg-maxed over
            axis 1) and `label_ids` (the reference labels).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', 'recall'.
    """
    true_labels = p.label_ids
    predicted_labels = np.argmax(p.predictions, axis=1)
    # Returned tuple is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [15]:
# Build the Trainer from the model, arguments, datasets and metric function, then train
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,1.1378
20,1.1622
30,1.1209
40,1.1028
50,1.1057
60,1.128
70,1.0869
80,1.0816
90,1.0756
100,1.0371




TrainOutput(global_step=1581, training_loss=0.7088904881933533, metrics={'train_runtime': 987.9079, 'train_samples_per_second': 51.154, 'train_steps_per_second': 1.6, 'total_flos': 1.329643656488448e+16, 'train_loss': 0.7088904881933533, 'epoch': 3.0})

In [16]:
# Evaluate the model on the evaluation dataset given to the Trainer and display the metrics
eval_metrics = trainer.evaluate()
eval_metrics


{'eval_loss': 0.8488390445709229,
 'eval_accuracy': 0.644349477682811,
 'eval_f1': 0.6467455319511037,
 'eval_precision': 0.6509918407544927,
 'eval_recall': 0.644349477682811,
 'eval_runtime': 23.3726,
 'eval_samples_per_second': 180.211,
 'eval_steps_per_second': 1.412,
 'epoch': 3.0}

In [17]:
# Save the fine-tuned model and the tokenizer to their own directories
model_dir = '/po1/bannapol/jumbo/log/beer_review_model'
tokenizer_dir = '/po1/bannapol/jumbo/log/beer_review_tokenizer'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

('/po1/bannapol/jumbo/log/beer_review_tokenizer/tokenizer_config.json',
 '/po1/bannapol/jumbo/log/beer_review_tokenizer/special_tokens_map.json',
 '/po1/bannapol/jumbo/log/beer_review_tokenizer/vocab.txt',
 '/po1/bannapol/jumbo/log/beer_review_tokenizer/added_tokens.json')

In [3]:
# Release cached CUDA memory before loading the model
torch.cuda.empty_cache()

# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload the saved model and tokenizer from disk for inference
saved_model_dir = '/po1/bannapol/jumbo/log/beer_review_model'
saved_tokenizer_dir = '/po1/bannapol/jumbo/log/beer_review_tokenizer'
loaded_model = BertForSequenceClassification.from_pretrained(saved_model_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(saved_tokenizer_dir)

# Move the reloaded model to the selected device
loaded_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
# Function to classify new reviews
def classify_review(review):
    """Return the predicted class index for a single review.

    Args:
        review: The review text to classify. Pass one review at a time:
            the result is reduced to a single Python int with `.item()`.

    Returns:
        int: Index of the highest-scoring class.
    """
    # Make sure dropout is disabled even if the model was put in train mode.
    loaded_model.eval()
    inputs = loaded_tokenizer(review, return_tensors="pt", truncation=True, padding=True).to(device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = loaded_model(**inputs).logits
    # Softmax is monotonic, so the arg-max of the logits is the arg-max of the
    # probabilities. Reducing over the class axis avoids returning a flattened
    # index if more than one row is ever present.
    return torch.argmax(logits, dim=-1).item()

In [5]:
# Smoke test: classify one hand-written review and print the result
test_review = "This is a fantastic beer with a rich, complex flavor."
predicted_class = classify_review(test_review)
print(f"Review: '{test_review}' is classified as: {predicted_class}")

Review: 'This is a fantastic beer with a rich, complex flavor.' is classified as: 2


In [6]:
# Smoke test: classify a second hand-written review and print the result
test_review = "This is a bad beer with a bad flavor."
predicted_class = classify_review(test_review)
print(f"Review: '{test_review}' is classified as: {predicted_class}")

Review: 'This is a bad beer with a bad flavor.' is classified as: 0


In [7]:
# Read the test CSV
test_csv_path = '/po1/bannapol/jumbo/test.csv'  # Adjust the path to your test dataset
test_data = pd.read_csv(test_csv_path)

In [8]:
# Tokenize the test data into PyTorch tensors
test_texts = test_data['text'].tolist()
tokenized_test = loaded_tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

# Keep the encodings as a plain dict of tensors on the CPU. The prediction loop
# moves each batch to `device` itself, so copying the whole padded test set to
# the GPU here would only hold device memory without any benefit.
test_encodings = {key: val for key, val in tokenized_test.items()}

In [9]:
# Custom dataset to handle batching
class TestDataset(Dataset):
    """Label-free map-style dataset over a mapping of encoded fields.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values; must contain the key 'input_ids'.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Select example `idx` from every field.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

test_dataset = TestDataset(encodings=test_encodings)

# Create DataLoader for batching; shuffle stays off (the default) so the
# iteration order matches the order of the encodings.
batch_size = 32  # Adjust batch size as needed
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [11]:
# Run the model over the test loader and collect one predicted class per example
loaded_model.eval()
preds = []

with torch.no_grad():
    for loader_batch in test_loader:
        # Make sure every tensor of the batch is on the model's device.
        model_inputs = {}
        for field, tensor in loader_batch.items():
            model_inputs[field] = tensor.to(device)
        logits = loaded_model(**model_inputs).logits
        # Arg-max over the class axis, then back to the CPU as a NumPy array.
        preds.extend(logits.argmax(dim=1).cpu().numpy())

In [13]:
# Build the submission table (ids from the test data plus the predicted labels)
# and write it to CSV without the index column
submission_columns = {'_id': test_data['_id'], 'pred_label': preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('/po1/bannapol/jumbo/submission.csv', index=False)

Submission file created successfully!
