# Import package

In [1]:
import collections

import datasets 
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import transformers

In [2]:
%%capture
# !pip install numpy==1.23.5
# !pip install sacremoses
# !pip install evaluate
# Restart your kernel

seed = 9072

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# You should login the HF in order to upload your model

In [3]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset, load_metric

imdb = load_dataset("imdb")

###
# from datasets import load_dataset
# imdb_train = load_dataset('imdb',split="train")
# imdb_test = load_dataset('imdb',split="test[:6250]+test[-6250:]")
# imdb_val = load_dataset('imdb',split="test[6250:12500]+test[-12500:-6250]")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# Import the AutoModelForSequenceClassification and AutoTokenizer classes from the transformers library.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define possible model names for later use, with the current one set to 'albert-base-v2'.
# model_name = "bert-base-uncased"        # Commented out alternative model name for BERT base model
# model_name = "distilbert-base-uncased"  # Commented out alternative model name for DistilBERT base model
model_name = "albert-base-v2"             # The selected model name to be used

# Load the pre-trained ALBERT model for sequence classification with custom label mappings for binary classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label={0: "Negative", 1: "Positive"},
    label2id={"Negative": 0, "Positive": 1}
)

# Load the tokenizer that corresponds to the ALBERT model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Re-import AutoModelForSequenceClassification and AutoTokenizer for further use in the function.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define a function that estimates the number of trainable parameters for a given model name.
def estimate_parameters(model_name):
    # Load the model with the specified name and custom label mappings.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        id2label={0: "Negative", 1: "Positive"},
        label2id={"Negative": 0, "Positive": 1}
    )
    
    # Calculate the total number of trainable parameters in the model.
    num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    # Return the calculated number of parameters.
    return num_parameters

# Example usage of the function: compute the number of trainable parameters for the selected model.
num_params = estimate_parameters(model_name)

# Print a statement with the result that shows the number of trainable parameters of the selected model.
print(f"The model {model_name} has {num_params} trainable parameters.")

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model albert-base-v2 has 11685122 trainable parameters.


In [6]:
# Define a function that tokenizes the 'text' field in a dataset example.
def tokenize(examples):
    # Tokenize the input text and enable truncation to handle texts longer than the model's maximum input length.
    outputs = tokenizer(examples['text'], truncation=True)
    # Return the tokenized outputs.
    return outputs

# Apply the 'tokenize' function to the entire imdb dataset, handling the examples in batches for efficiency.
tokenized_ds = imdb.map(tokenize, batched=True)

# The following commented section shows how to tokenize different splits of a dataset (train, test, validation)
# in a batched fashion, with padding and truncation enabled, and with a specified batch size.
###
# Tokenize the training set, applying padding and truncation, processing in batches of size 1000.
# enc_train = imdb_train.map(lambda e: tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000)

# Tokenize the testing set in the same way as the training set.
# enc_test = imdb_test.map(lambda e: tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000)

# Tokenize the validation set in the same way as the training and testing sets.
# enc_val = imdb_val.map(lambda e: tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [21]:
# import numpy as np

# def compute_metrics(eval_preds):
#     metric = load_metric("accuracy")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer) 

2024-03-24 16:28:55.039801: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-24 16:28:55.039912: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-24 16:28:55.171090: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
training_args = TrainingArguments(num_train_epochs=2,
#                                 output_dir="distilbert-imdb",
                                output_dir="bert-imdb",
                                # output_dir="albert-imdb",
                                push_to_hub=True,
                                per_device_train_batch_size=16,
                                per_device_eval_batch_size=16,
                                evaluation_strategy="epoch",
                                seed=seed
                                 )

In [10]:
# Import accuracy_score and precision_recall_fscore_support functions from sklearn.metrics for evaluation.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a function that computes evaluation metrics from predictions.
def compute_metrics(pred):
    # Extract ground truth labels from the predictions object.
    labels = pred.label_ids
    # Extract the predicted labels by taking the argmax of the logits from the predictions object.
    preds = pred.predictions.argmax(-1)
    # Compute precision, recall, and F1-score using macro averaging.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    # Compute the accuracy score.
    acc = accuracy_score(labels, preds)
    # Return a dictionary with accuracy, F1, precision, and recall.
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

# The compute_metrics function is usually used as a parameter for the Trainer in Hugging Face's Transformers library,
# to compute evaluation metrics after each training epoch or after model evaluation.

In [11]:
trainer = Trainer(model=model, tokenizer=tokenizer,
                  data_collator=data_collator,
                  args=training_args,
#                   train_dataset=enc_train,
#                   eval_dataset=enc_val, 
                  train_dataset=tokenized_ds['train'],
                  eval_dataset=tokenized_ds['test'],
                  compute_metrics=compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2223,0.189848,0.93276,0.932747,0.933098,0.93276
2,0.1161,0.226617,0.93956,0.939554,0.939743,0.93956


TrainOutput(global_step=3126, training_loss=0.19396053974398114, metrics={'train_runtime': 3571.813, 'train_samples_per_second': 13.998, 'train_steps_per_second': 0.875, 'total_flos': 1.301775246386544e+16, 'train_loss': 0.19396053974398114, 'epoch': 2.0})

# Save model

In [14]:
model_save_path = "MyBestIMDBModel_Bert"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1711297798.fef996fca6ad.34.0:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

('MyBestIMDBModel_Bert/tokenizer_config.json',
 'MyBestIMDBModel_Bert/special_tokens_map.json',
 'MyBestIMDBModel_Bert/vocab.txt',
 'MyBestIMDBModel_Bert/added_tokens.json',
 'MyBestIMDBModel_Bert/tokenizer.json')

# Test your model

In [None]:
# from transformers import AlbertTokenizerFast, AlbertForSequenceClassification, pipeline

# model = AlbertForSequenceClassification.from_pretrained("/kaggle/working/MyBestIMDBModel")
# tokenizer = AlbertTokenizerFast.from_pretrained("/kaggle/working/MyBestIMDBModel")
# nlp = pipeline("sentiment-analysis", model=model,tokenizer=tokenizer)
# print(nlp("the movie was very impressive"))
# print(nlp("the text of the picture was very poor"))

from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast
model = DistilBertForSequenceClassification.from_pretrained("/kaggle/working/MyBestIMDBModel_distiBert")
tokenizer = DistilBertTokenizerFast.from_pretrained("/kaggle/working/MyBestIMDBModel_distilBert")
nlp = pipeline("sentiment-analysis", model=model,tokenizer=tokenizer)
print(nlp("the movie was very impressive"))
print(nlp("the text of the picture was very poor"))

In [16]:
# Import necessary modules: os for interacting with the file system,
# zipfile for creating zip archives, and datetime for timestamping.
import os
import zipfile
import datetime

# Define a function to compress a directory (packagePath) into a zip file (zipPath).
def file2zip(packagePath, zipPath):
    '''
  :param packagePath: The path to the directory that you want to compress.
  :param zipPath: The path where the resulting zip file will be saved.
  :return: None
  '''
    # Create a ZipFile object in write mode.
    zip = zipfile.ZipFile(zipPath, 'w', zipfile.ZIP_DEFLATED)
    # Walk through the directory structure.
    for path, dirNames, fileNames in os.walk(packagePath):
        # Get the relative file path to maintain directory structure in the zip.
        fpath = path.replace(packagePath, '')
        # Add all files in the current directory to the zip file.
        for name in fileNames:
            # Get the full path to the current file.
            fullName = os.path.join(path, name)
            # Construct the name to be used within the zip file.
            name = fpath + '\\' + name
            # Write the file under the constructed name to the zip file.
            zip.write(fullName, name)
    # Close the ZipFile object to finalize the zip file.
    zip.close()

# This conditional block checks if the script is the main program and not an imported module.
if __name__ == "__main__":
    # Define the path to the directory you want to compress.
    packagePath = '/kaggle/working/MyBestIMDBModel_Bert'
    # Define the output path for the resulting zip file.
    zipPath = '/kaggle/working/MyBestIMDBModel_Bert.zip'
    # Check if a zip file with the same name already exists and remove it if it does.
    if os.path.exists(zipPath):
        os.remove(zipPath)
    # Call the file2zip function to compress the directory into a zip file.
    file2zip(packagePath, zipPath)
    # Print a confirmation that the packaging is complete.
    print("Packaging complete")
    # Print the current UTC time.
    print(datetime.datetime.utcnow())

打包完成
2024-03-24 17:32:23.125894


In [17]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/JeffreyJIANG/bert-imdb/commit/0b5f6f12a72ff34e210ee05f42369271d3ecf03b', commit_message='End of training', commit_description='', oid='0b5f6f12a72ff34e210ee05f42369271d3ecf03b', pr_url=None, pr_revision=None, pr_num=None)

# Utilizing BERT for Text Classification with GPU Acceleration

In [15]:
# Import the pipeline function from transformers, 
# which provides a high-level API for various tasks including text classification.
from transformers import pipeline
# Import the torch library to work with PyTorch and utilize GPU capabilities.
import torch

# Set the device to the first GPU available for running the model.
device = torch.device("cuda:0")
# Create a text classification pipeline with a pre-trained model hosted on Hugging Face.
pipe = pipeline("text-classification", model="JeffreyJIANG/distilbert-imdb")

# Import AutoTokenizer and AutoModelForSequenceClassification classes from transformers.
# These classes are used for tokenizing the inputs and loading the model for sequence classification tasks.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer for the pre-trained model "JeffreyJIANG/distilbert-imdb" from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained("JeffreyJIANG/distilbert-imdb")
# Load the model for sequence classification from the pre-trained model "JeffreyJIANG/distilbert-imdb".
model = AutoModelForSequenceClassification.from_pretrained("JeffreyJIANG/distilbert-imdb")
# Move the model to the GPU device defined earlier for faster computation.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [30]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch
device = torch.device("cuda:0")

pipe = pipeline("text-classification", model="JeffreyJIANG/bert-imdb")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("JeffreyJIANG/bert-imdb")
model = AutoModelForSequenceClassification.from_pretrained("JeffreyJIANG/bert-imdb")
model.to(device)

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [44]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch
device = torch.device("cuda:0")

pipe = pipeline("text-classification", model="JeffreyJIANG/albert-imdb")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("JeffreyJIANG/albert-imdb")
model = AutoModelForSequenceClassification.from_pretrained("JeffreyJIANG/albert-imdb")
model.to(device)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [45]:
def predict_sentiment(text, model, tokenizer, device):
    # Tokenize the input text and get the input IDs required by the BERT model.
    ids = tokenizer(text)["input_ids"]
    # Convert the list of input IDs into a PyTorch tensor and add an extra dimension for batch size.
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    # Disable gradient calculation as it is not needed during inference, which saves memory and computations.
    with torch.no_grad():
        # Pass the input tensor to the model to obtain the raw output, which includes the logits.
        outputs = model(tensor)
    # Extract the logits (unnormalized predictions) from the model output.
    logits = outputs.logits
    # Apply softmax function to the logits to get the probabilities of the classes.
    probability = torch.softmax(logits, dim=-1)
    # Determine the predicted class by finding the index of the highest logit value.
    predicted_class = logits.argmax(dim=-1).item()
    # Return the predicted class and the probabilities of all classes.
    return predicted_class, probability

In [46]:
# Case 1
text = (
    "When thinking of boxing films, there is already a series of predecessors: \"Rocky\" (1976), "
    "\"Million Dollar Baby\" (2004), \"Unbeatable\" (2013), and so on. The movie \"Hot and Heavy\" "
    "adapted from Masaharu Take's \"100 Yen Love\" (2014, starring Sakura Ando) also tells a "
    "story of an underdog's rise and struggle, showcasing the passion for life encapsulated in the "
    "phrase \"You Only Live Once.\"\n\n"
    "The film addresses social phenomena in China such as \"lying flat\" (tang ping), otaku culture, "
    "the \"boomerang generation\" (those who depend on their parents), and the media's fabrication "
    "of reality, with the story set against the backdrop of card gambling. Although the movie was "
    "shot in the Guangdong region, it lacks a distinct Cantonese flavor and sense of place, "
    "resembling a typical second or third-tier city in China aimed at a nationwide audience. "
    "Surprisingly, Lee Kwok Lun, known for his role as Jiumozhi in the beloved TV drama "
    "\"Demi-Gods and Semi-Devils,\" represents a vestige of value from Hong Kong's film and "
    "television culture.\n\n"
    "The focus of the film is on the actress Jia Ling's remarkable physical transformation, "
    "where she gained approximately 50 pounds and then lost around 130 pounds. The process is "
    "deeply moving and demonstrates that individuals who have dreams and persist in pursuing them "
    "are the most beautiful. On another level, the film also shows Jia Ling's discipline over her "
    "body, reflecting both collective political elements and the pursuit of personal breakthroughs."
)

In [2]:
# Case 2
text = "This film is not terrible, it's great!"

In [50]:
# Case 3
text = "This film is not great, it's terrible!"

In [52]:
# Case 4
text = (
    "A film adaptation of a true story about Britain's \"Shtetl's List\". When Prime Minister Chamberlain's "
    "appeasement policy fuels Hitler's totalitarian ambitions, the Czech Republic is in dire straits. Nicholas "
    "Winton sees the situation deteriorating and, out of compassion, goes all out to save the Jewish children. "
    "The rabbi tells Winton not to leave things unfinished, and Winton's diligence in doing a good deed deserves "
    "to be remembered for generations to come, but at the most important moment, things don't turn out well, and "
    "there is no way back, leaving Winton with a lifetime of guilt. The unique experience of an individual can be "
    "a heavy lesson in history, worthy of rethinking by future generations.\n\n"
    "The Miracle Train hovers between the extraordinary period of 1938 and the stable period of 1988, between "
    "memory and the present. Anthony Hopkins' excellent performance is certainly the core of the film, and it is "
    "also rare to see James Hawes, who is making his feature film debut, making his debut with a rather skilled "
    "and restrained approach."
)

In [54]:
# Case 5
text = (
    "One of the most puzzling aspects of \"Goldfinger\" is the lack of insight and personal perspective from the "
    "director regarding the sensational \"Jia Ning case\" and its mastermind. The character Cheng Yiyuan, portrayed "
    "by Tony Leung Chiu-Wai, becomes arrogant and driven by greed after entering the rapidly growing financial "
    "market, resorting to increasingly ruthless methods. The film employs visual and stylistic techniques to "
    "emphasize the exaggerated sense of decadence and corruption.\n\n"
    "However, beneath the surface dazzle, it merely reinforces the conventional impression of powerful and wealthy "
    "\"villains.\" Several scenes in the film's composition clearly bear the influence of both Martin Scorsese's "
    "earlier and recent works, which is unnecessary. Chapman To's mocking reference to Joe Pesci's \"funny how\" "
    "sequence in \"Goodfellas\" (1990) invites the audience to make comparisons but comes off as awkward.\n\n"
    "I have read some interviews with Felix Chong and Tony Leung Chiu-Wai, where Chong discusses his childhood "
    "memories of observing how the Jia Ning case affected people in the community, and Leung casually mentions that "
    "the Jia Ning chairman was probably just an ordinary businessman initially driven by a desire to make money. "
    "These aspects are more thought-provoking than what \"Goldfinger\" presents. The current script focuses on the "
    "confrontation between Cheng Yiyuan and the anti-corruption agency, skillfully maintaining a high-intensity "
    "dramatic tension. However, it falls short in exploring the significant aspects of real-life events. It treats "
    "past history merely as a blueprint for genre films, stopping at technical craftsmanship and thrilling plot "
    "development. This falls short of what can be considered a truly exemplary Hong Kong story."
)

In [55]:
predict_sentiment(text, model, tokenizer, device)

(1, tensor([[0.2702, 0.7298]], device='cuda:0'))