In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaModel, XLMRobertaTokenizer, AutoTokenizer
from safetensors.torch import load_file 
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch.nn as nn
import json
import time
from tqdm import tqdm

import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/content-cleaning/__results__.html
/kaggle/input/content-cleaning/cleaned_df.csv
/kaggle/input/content-cleaning/__notebook__.ipynb
/kaggle/input/content-cleaning/__output__.json
/kaggle/input/content-cleaning/custom.css
/kaggle/input/pengambilan-sampel-supervised/labeling_df_part1.csv
/kaggle/input/pengambilan-sampel-supervised/labeling_df_part3.csv
/kaggle/input/pengambilan-sampel-supervised/__results__.html
/kaggle/input/pengambilan-sampel-supervised/__notebook__.ipynb
/kaggle/input/pengambilan-sampel-supervised/labeling_df_part2.csv
/kaggle/input/pengambilan-sampel-supervised/__output__.json
/kaggle/input/pengambilan-sampel-supervised/labeling_df_part4.csv
/kaggle/input/pengambilan-sampel-supervised/custom.css
/kaggle/input/satdat-sf-24/other/31k/1/roberta_fine_tuned.onnx
/kaggle/input/satdat-sf-24/other/31k/1/label_encoders.pkl
/kaggle/input/satdat-sf-24/other/31k/1/best_model/training_args.bin
/kaggle/input/satdat-sf-24/other/31k/1/best_model/model.safetensors


In [2]:
def load_and_cache_tokenizer(tokenizer_name):
    """Return the tokenizer for ``tokenizer_name``, loading it at most once.

    Loaded tokenizers are memoised on the function object, keyed by name, so
    asking for a second, different tokenizer no longer returns whichever one
    happened to be loaded first.

    Args:
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer instance associated with ``tokenizer_name``.
    """
    cache = load_and_cache_tokenizer.__dict__.setdefault('cached_tokenizers', {})
    if tokenizer_name not in cache:
        cache[tokenizer_name] = AutoTokenizer.from_pretrained(tokenizer_name)
    # Legacy attribute kept for any code that still reads it: it now points at
    # the most recently requested tokenizer.
    load_and_cache_tokenizer.cached_tokenizer = cache[tokenizer_name]
    return cache[tokenizer_name]

In [3]:
from functools import lru_cache 

# The cache is bounded on purpose: every entry pins two ``max_length``-long
# tensors, so an unbounded cache grows without limit when each text is unique.
@lru_cache(maxsize=4096)
def tokenize_text(text, tokenizer, max_length=512):
    """Tokenize a single text into fixed-length id and mask tensors.

    Args:
        text: The string to encode. Must be hashable because results are memoised.
        tokenizer: Callable tokenizer invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'``. Must be hashable.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        ``(input_ids, attention_mask)`` with the leading batch axis removed.
        Cached results are shared between calls, so do not modify them in place.
    """
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # Drop only the batch axis; a bare squeeze() would also remove the sequence
    # axis whenever max_length == 1.
    return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

class CustomDataset(Dataset):
    """Dataset that tokenizes the ``content`` column of a DataFrame on demand.

    Args:
        df: DataFrame with a ``content`` column holding the texts.
        tokenizer: Tokenizer forwarded to ``tokenize_text``.
        max_length: Sequence length forwarded to ``tokenize_text``.
        device: Device each item's tensors are moved to. If a CUDA device is
            requested but CUDA is not available, CPU is used instead so the
            dataset still works on CPU-only hosts.
    """

    def __init__(self, df, tokenizer, max_length=512, device='cuda'):
        self.texts = df['content'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fall back to CPU rather than failing on the first __getitem__ call.
        if (
            device is not None
            and torch.device(device).type == 'cuda'
            and not torch.cuda.is_available()
        ):
            device = 'cpu'
        self.device = device  # Store the device

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'attention_mask'}`` tensors for row ``idx``."""
        text = self.texts[idx]
        input_ids, attention_mask = tokenize_text(text, self.tokenizer, self.max_length)

        # Move the tensors to the specified device
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [4]:
class MultiTaskXLMRoberta(nn.Module):
    """XLM-RoBERTa encoder with three classification heads on the first token.

    Heads:
        * ``classifier_candidate`` - 4 outputs, trained with ``BCEWithLogitsLoss``.
        * ``classifier_sentiment`` - one output per sentiment class, ``CrossEntropyLoss``.
        * ``classifier_topic`` - one output per topic class, ``CrossEntropyLoss``.

    Note:
        ``__init__`` reads the module-level ``label_encoders`` dict to size the
        sentiment and topic heads, so it must be defined before instantiation.
    """

    def __init__(self):
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        hidden_size = self.roberta.config.hidden_size
        self.classifier_candidate = nn.Linear(hidden_size, 4)
        self.classifier_sentiment = nn.Linear(hidden_size, len(label_encoders['sentiment'].classes_))
        self.classifier_topic = nn.Linear(hidden_size, len(label_encoders['topic'].classes_))

        self.criterion_candidate = nn.BCEWithLogitsLoss()
        self.criterion_sentiment = nn.CrossEntropyLoss()
        self.criterion_topic = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and all three heads.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional tensor; columns ``0:4`` are the candidate targets,
                column ``4`` the sentiment class index and column ``5`` the
                topic class index. Omit (or pass ``None``) for inference.

        Returns:
            Dict with ``candidate_logits``, ``sentiment_logits`` and
            ``topic_logits``; when ``labels`` is given it additionally contains
            ``loss``, the unweighted sum of the three head losses.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # First-token representation of the encoder's first output.
        cls_output = outputs[0][:, 0, :]
        candidate_logits = self.classifier_candidate(cls_output)
        sentiment_logits = self.classifier_sentiment(cls_output)
        topic_logits = self.classifier_topic(cls_output)

        result = {}
        if labels is not None:
            loss_candidate = self.criterion_candidate(candidate_logits, labels[:, :4].float())
            loss_sentiment = self.criterion_sentiment(sentiment_logits, labels[:, 4].long())
            loss_topic = self.criterion_topic(topic_logits, labels[:, 5].long())
            result["loss"] = loss_candidate + loss_sentiment + loss_topic

        result["candidate_logits"] = candidate_logits
        result["sentiment_logits"] = sentiment_logits
        result["topic_logits"] = topic_logits
        return result


In [5]:
import pickle
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch.nn.utils.prune as prune
import safetensors

# Load the label encoders saved next to the model; later cells use their
# `classes_` to size the model heads and to decode predictions.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load this artifact if its origin is trusted.
with open('/kaggle/input/satdat-sf-24/other/31k/1/label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned model (using safetensors)
model_path = "/kaggle/input/satdat-sf-24/other/31k/1/best_model/model.safetensors"
model_state_dict = load_file(model_path) 
# MultiTaskXLMRoberta.__init__ reads the global `label_encoders`, so the
# encoders must already be loaded (above) before the model is built.
model = MultiTaskXLMRoberta().to(device) 
model.load_state_dict(model_state_dict)
# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure in the output.
model.eval() 

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

MultiTaskXLMRoberta(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
    

In [6]:
import pandas as pd
from tqdm import tqdm

def read_csv_with_tqdm(filepath, chunksize=400000):
    """Read a CSV file in chunks while showing a tqdm progress bar.

    The file is scanned once as raw lines to estimate the number of chunks
    (cheap, no CSV parsing), then parsed once with pandas. Chunks are collected
    in a list and concatenated a single time.

    Args:
        filepath: Path of the CSV file to read.
        chunksize: Number of rows per chunk handed to ``pd.read_csv``.

    Returns:
        A DataFrame with all rows and a fresh ``0..n-1`` index. If the reader
        yields no chunks at all, an empty DataFrame is returned.
    """
    # Count physical lines in binary mode so no text decoding is needed.
    # This is an estimate: quoted fields containing newlines inflate it.
    with open(filepath, 'rb') as f:
        line_count = sum(1 for _ in f)
    data_rows = max(line_count - 1, 0)  # first line is the header

    # Ceiling division: number of chunks needed to cover data_rows.
    approximate_iterations = -(-data_rows // chunksize)

    # Read the CSV file in chunks
    chunks = []
    for chunk in tqdm(pd.read_csv(filepath, chunksize=chunksize, iterator=True),
                      total=approximate_iterations, unit="chunk", desc="Reading CSV...", leave=True):
        chunks.append(chunk)

    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

In [7]:
# Initialize the tokenizer
tokenizer = load_and_cache_tokenizer('xlm-roberta-base')

# Load the CSV file of samples to label and drop rows with no text,
# since the tokenizer cannot encode missing values.
prediction_df = read_csv_with_tqdm("/kaggle/input/pengambilan-sampel-supervised/labeling_df_part4.csv")
prediction_df = prediction_df.dropna(subset=['content'], axis=0)

# Create a dataset for prediction. Pass the same `device` the model was moved
# to, so the inputs and the model agree even when CUDA is unavailable.
prediction_dataset = CustomDataset(prediction_df, tokenizer, device=device)
# shuffle=False keeps predictions in the same order as the rows of prediction_df.
prediction_loader = DataLoader(prediction_dataset, batch_size=10, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Reading CSV...: 100%|██████████| 1/1 [00:00<00:00,  1.29chunk/s]


In [8]:
import ast
# Helper function to decode predictions
def get_decoded_predictions(outputs, label_encoders):
    """Convert one batch of head logits into per-sample label dictionaries.

    Args:
        outputs: Mapping with ``candidate_logits``, ``sentiment_logits`` and
            ``topic_logits`` tensors for the batch.
        label_encoders: Mapping whose ``'sentiment'`` and ``'topic'`` entries
            expose a ``classes_`` sequence indexed by predicted class id.

    Returns:
        A list with one dict per sample, keyed by ``candidate_mention``,
        ``sentiment`` and ``topic``.
    """
    candidate_threshold = 0.5

    # Multi-label head: a position counts as predicted when its sigmoid
    # probability exceeds the threshold. Position 0 (no candidate) is never
    # listed explicitly; an empty selection is reported as "[0]".
    above_threshold = (torch.sigmoid(outputs['candidate_logits']).cpu() > candidate_threshold).tolist()
    mention_strings = []
    for flags in above_threshold:
        picked = [str(pos) for pos, flagged in enumerate(flags) if flagged and pos != 0]
        mention_strings.append(f"[{','.join(picked)}]" if picked else "[0]")

    # Single-label heads: take the arg-max class id and map it through the
    # corresponding encoder's class list.
    sentiment_ids = torch.argmax(outputs['sentiment_logits'], dim=1).cpu().numpy()
    topic_ids = torch.argmax(outputs['topic_logits'], dim=1).cpu().numpy()
    sentiment_classes = label_encoders['sentiment'].classes_
    topic_classes = label_encoders['topic'].classes_

    return [
        {
            'candidate_mention': mention,
            'sentiment': sentiment_classes[sentiment_id],
            'topic': topic_classes[topic_id],
        }
        for mention, sentiment_id, topic_id in zip(mention_strings, sentiment_ids, topic_ids)
    ]

In [9]:
import torch
import torch.autograd.profiler as profiler

# Show the classes known to each label encoder.
for column, le in label_encoders.items():
    print(f"{column}:\n{le.classes_}\n")

# Decoded predictions, one dict per row, accumulated in loader order.
all_predictions = []

total_items = len(prediction_loader.dataset)

start_time = time.time()
last_update_time = start_time
rows_processed = 0

# Inference only: gradients are not needed.
with torch.no_grad():
    for batch in tqdm(prediction_loader, total=len(prediction_loader), unit="batch", desc="Predicting..."):
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=None,
        )

        # Decode this batch and append its rows to the running list.
        all_predictions.extend(get_decoded_predictions(outputs, label_encoders))

# Report wall-clock throughput for the whole prediction pass.
end_time = time.time()
elapsed_time = end_time - start_time
total_rows = len(prediction_loader.dataset)
rps = total_rows / elapsed_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Rows per second: {rps:.2f}")

# Split the per-row dicts into one list per output column.
predicted_candidate_mentions = [entry['candidate_mention'] for entry in all_predictions]
predicted_sentiment = [entry['sentiment'] for entry in all_predictions]
predicted_topic = [entry['topic'] for entry in all_predictions]

# Attach the predictions positionally (the loader was not shuffled).
prediction_df['predicted_candidate_mentions'] = predicted_candidate_mentions
prediction_df['predicted_sentiment'] = predicted_sentiment
prediction_df['predicted_topic'] = predicted_topic

print(prediction_df.head())

prediction_df.to_csv('predicted_samples.csv', index=False)

candidate_mention:
['[0]' '[1, 2, 3]' '[1, 2]' '[1, 3, 2]' '[1, 3]' '[1]' '[2, 0]'
 '[2, 1, 3]' '[2, 1]' '[2, 3, 1]' '[2, 3]' '[2]' '[3, 1, 2]' '[3, 1]'
 '[3, 2, 1]' '[3, 2]' '[3]']

sentiment:
[1 2 3 4]

topic:
[0 1 2 3 4 5 6 7 8]



Predicting...: 100%|██████████| 37177/37177 [1:58:50<00:00,  5.21batch/s]


Elapsed time: 7130.84 seconds
Rows per second: 52.13
     index                                            content  \
0  1500000                     cc  .buzzer anda memang tolol.   
1  1500001  Ngga ada rahasia lagi! Capres No Urut 03 Ganja...   
2  1500002  anyone wondering how ganjar improves on this d...   
3  1500003                   Aaaa pen jumpa pak prabowo dipku   
4  1500004   .dikunjungi oleh Calon Presiden Capres Nomor ...   

  predicted_candidate_mentions  predicted_sentiment  predicted_topic  
0                          [0]                    4                0  
1                          [3]                    2                1  
2                          [3]                    4                0  
3                          [2]                    4                0  
4                          [3]                    2                8  
