# **Global Setup**

In [None]:
import torch

In [None]:
# Use the GPU when CUDA is usable in this runtime; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# **Tokenization & Data Splitting**

In [None]:
import pandas as pd
# Path of the preprocessed tweet dataset (columns used below: 'tweet' and 'label').
preprocessed_paslon = "preprocessed_paslon.csv"
data = pd.read_csv(preprocessed_paslon, delimiter=",", encoding='utf-8')

# read_csv parses empty fields as NaN. If such rows stay in the frame, a later
# str() conversion turns them into the literal text "nan", which would then be
# tokenized and trained on as if it were a real tweet. Drop them here, before
# the data is split. This is a no-op when the column has no missing values.
n_rows_before = len(data)
data = data.dropna(subset=['tweet'])
n_rows_dropped = n_rows_before - len(data)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} rows with a missing tweet")

# Preview the first rows
print(data.head())

                                               tweet  label
0  kader pdip jadi presiden mana menteri impor pa...      1
1  amien rais pilih prabowo subianto bila anies b...      1
2            kpk gerak adil seluruh rakyat indonesia      1
3                    jadi gubernur ada partai dukung      1
4                                 pikirin bisa kalah      1


Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stage 1: hold out 30% of the rows; the remaining 70% becomes the training set.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['tweet'],
    data['label'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Stage 2: cut the held-out 30% in half, giving the validation and test sets
# (each roughly 15% of the full dataset).
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

# Report how many rows ended up in each split.
print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")
print(f"Testing set size: {len(test_texts)}")

Training set size: 5618
Validation set size: 1204
Testing set size: 1204


Tokenization

In [None]:
from transformers import BertTokenizer

Tokenization (IndoBERT Base) / Teacher Model

In [None]:
# Tokenizer for the IndoBERT base checkpoint (the teacher model in this notebook).
base_checkpoint = 'indobenchmark/indobert-base-p2'
tokenizer_base = BertTokenizer.from_pretrained(base_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [None]:
def _encode_with_base(texts):
    """Tokenize an iterable of texts with the base-model tokenizer.

    Every element is passed through str() first so that non-string values
    (e.g. NaN, which becomes the text "nan") do not make the tokenizer fail.
    Sequences longer than 512 tokens are truncated; padding=True pads to the
    longest sequence within the same call, so each split is padded to its
    own maximum length.
    """
    as_strings = [str(text) for text in texts]
    return tokenizer_base(as_strings, truncation=True, padding=True, max_length=512)


# Encode the three splits with identical tokenizer settings.
train_encodings_base = _encode_with_base(train_texts)
val_encodings_base = _encode_with_base(val_texts)
test_encodings_base = _encode_with_base(test_texts)

In [None]:
import json

# Pair each split name with its base-model encodings and its labels.
# Insertion order (train, val, test) is the key order written to the JSON file.
base_splits = {
    'train': (train_encodings_base, train_labels),
    'val': (val_encodings_base, val_labels),
    'test': (test_encodings_base, test_labels),
}

# Keep only what is persisted: token ids, attention mask and labels.
# list(...) converts the label Series to a plain list so it can be serialized.
base_tokenized_data = {
    split_name: {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': list(labels),
    }
    for split_name, (encodings, labels) in base_splits.items()
}

# Write the base-model payload to disk as a single JSON document.
with open('base_tokenized_data.json', 'w') as json_file:
    json.dump(base_tokenized_data, json_file)


Tokenization (IndoBERT Lite) / Student Model

In [None]:
# Tokenizer for the IndoBERT lite checkpoint (the student model in this notebook).
# Loading it through BertTokenizer makes transformers warn that the checkpoint
# declares 'AlbertTokenizer' instead. The download log shows that the checkpoint
# ships a vocab.txt file.
# NOTE(review): confirm against the checkpoint's model card that BertTokenizer is
# the intended class before switching to another tokenizer class.
lite_checkpoint = 'indobenchmark/indobert-lite-base-p2'
tokenizer_lite = BertTokenizer.from_pretrained(lite_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
def _encode_with_lite(texts):
    """Tokenize an iterable of texts with the lite-model tokenizer.

    Every element is passed through str() first so that non-string values
    (e.g. NaN, which becomes the text "nan") do not make the tokenizer fail.
    Sequences longer than 512 tokens are truncated; padding=True pads to the
    longest sequence within the same call, so each split is padded to its
    own maximum length.
    """
    as_strings = [str(text) for text in texts]
    return tokenizer_lite(as_strings, truncation=True, padding=True, max_length=512)


# Encode the three splits for the lite model with identical tokenizer settings.
train_encodings_lite = _encode_with_lite(train_texts)
val_encodings_lite = _encode_with_lite(val_texts)
test_encodings_lite = _encode_with_lite(test_texts)

In [None]:
# Pair each split name with its lite-model encodings and its labels.
# Insertion order (train, val, test) is the key order written to the JSON file.
lite_splits = {
    'train': (train_encodings_lite, train_labels),
    'val': (val_encodings_lite, val_labels),
    'test': (test_encodings_lite, test_labels),
}

# Keep only what is persisted: token ids, attention mask and labels.
# list(...) converts the label Series to a plain list so it can be serialized.
lite_tokenized_data = {
    split_name: {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': list(labels),
    }
    for split_name, (encodings, labels) in lite_splits.items()
}

# Write the lite-model payload to disk as a single JSON document.
with open('lite_tokenized_data.json', 'w') as json_file:
    json.dump(lite_tokenized_data, json_file)