In [None]:
# Langkah 1: Instal library yang diperlukan
# Perlu dilakukan instalasi library "transformers", "datasets", dan "evaluate".
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.

In [None]:
# Step 2: Sentiment-analysis pipeline
# A pipeline makes it easy to apply a pre-trained model to a given task.
from transformers import pipeline

# Build the sentiment-analysis pipeline with an explicit checkpoint and revision.
# These are the values the library reported falling back to when none was given;
# naming them keeps the results stable if the library's default ever changes.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Input texts to classify
sentiment_results = classifier(
    [
        "I've been waiting for read this book my whole life.",
        "I hate this so much!",
    ]
)

print("Sentiment Analysis Results:", sentiment_results)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Sentiment Analysis Results: [{'label': 'POSITIVE', 'score': 0.9973263740539551}, {'label': 'NEGATIVE', 'score': 0.9994558691978455}]


In [None]:
# Step 3: Tokenize text with a pre-trained tokenizer
from transformers import AutoTokenizer

# Sentences that will be converted into model inputs
raw_inputs = [
    "I've been waiting for read this book my whole life.",
    "I hate this so much!",
]

# Load the tokenizer that belongs to the pre-trained checkpoint
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Encode the batch: pad to a common length, truncate over-long inputs,
# and return PyTorch tensors
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print("Tokenized Inputs:", inputs)

Tokenized Inputs: {'input_ids': tensor([[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 3191, 2023, 2338, 2026, 2878,
         2166, 1012,  102],
        [ 101, 1045, 5223, 2023, 2061, 2172,  999,  102,    0,    0,    0,    0,
            0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# Step 4: Load the pre-trained classification model
import torch
from transformers import AutoModelForSequenceClassification

# Load the sequence-classification model from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Feed the tokenized inputs to the model. This is inference only, so gradient
# tracking is switched off; no autograd graph is built for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
print("Logits Shape:", outputs.logits.shape)
print("Logits:", outputs.logits)

Logits Shape: torch.Size([2, 2])
Logits: tensor([[-2.9261,  2.9956],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [None]:
# Step 5: Compute probabilities with the softmax function
import torch

# Normalize the logits along the last dimension so each row sums to 1
predictions = outputs.logits.softmax(dim=-1)
print("Predictions (Probabilities):", predictions)

# Show which sentiment label each class index stands for, as stored in the model config
print("Label Mapping:", model.config.id2label)

Predictions (Probabilities): tensor([[2.6736e-03, 9.9733e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)
Label Mapping: {0: 'NEGATIVE', 1: 'POSITIVE'}


In [None]:
# Step 6: Build a model by hand from a configuration object
from transformers import BertConfig, BertModel

# A fresh BERT configuration with the library's default settings
config = BertConfig()

# Instantiate BERT from that configuration; the weights are randomly initialized
random_model = BertModel(config)

# Print the configuration only after the model has been constructed
print("Model Config:", config)

Model Config: BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
# Step 7: Load a pre-trained model and save it to local storage
# Load the pre-trained "bert-base-cased" model
bert_model = BertModel.from_pretrained(pretrained_model_name_or_path="bert-base-cased")

# Save the model into a local directory
bert_model.save_pretrained(save_directory="directory_on_my_drive")


In [None]:
# Step 8: Save the model to Google Drive
from google.colab import drive

# Mount Google Drive
drive.mount(mountpoint='/content/drive')

# Save the model into a folder on the mounted Drive
bert_model.save_pretrained(save_directory='/content/drive/My Drive/NLP-HuggingFace/Models')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 9: Build model inputs from token IDs
# Example texts to encode
sequences = ["Hello!", "Cool.", "Nice!"]

# Token IDs must come from the vocabulary of the model that consumes them.
# bert_model is "bert-base-cased", whereas hand-written IDs such as 999 for "!"
# and 1012 for "." are the ones produced by the uncased tokenizer in Step 3.
# The IDs are therefore generated with the tokenizer matching bert_model.
cased_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoded_batch = cased_tokenizer(sequences, padding=True)
encoded_sequences = encoded_batch["input_ids"]

# Convert the encoded sequences (and the matching attention mask) into tensors
model_inputs = torch.tensor(encoded_sequences)
attention_mask = torch.tensor(encoded_batch["attention_mask"])

# Feed the tensors to the model; the mask marks padding positions, if any
output = bert_model(model_inputs, attention_mask=attention_mask)

In [None]:
# Step 10: Tokenize and decode with a pre-trained tokenizer
from transformers import AutoTokenizer

# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-cased")

# Example sentence to tokenize
sequence = "Using a Transformer network is simple"

# Run the three stages first: text -> tokens -> IDs -> text
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
decoded_string = tokenizer.decode(ids)

# Report each stage in the same order it was computed
print("Tokens:", tokens)
print("Token IDs:", ids)
print("Decoded String:", decoded_string)

# Save the tokenizer to Google Drive (kept as the last expression so the
# returned file paths are displayed as the cell output)
tokenizer.save_pretrained(save_directory='/content/drive/My Drive/NLP-HuggingFace/Tokenizer')

Tokens: ['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
Token IDs: [7993, 170, 13809, 23763, 2443, 1110, 3014]
Decoded String: Using a Transformer network is simple


('/content/drive/My Drive/NLP-HuggingFace/Tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/NLP-HuggingFace/Tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/NLP-HuggingFace/Tokenizer/vocab.txt',
 '/content/drive/My Drive/NLP-HuggingFace/Tokenizer/added_tokens.json',
 '/content/drive/My Drive/NLP-HuggingFace/Tokenizer/tokenizer.json')