In [1]:
# Cell 1: Check CUDA
import torch
# Query CUDA availability once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ready}")
print(f"CUDA Device Name: {torch.cuda.get_device_name(0) if cuda_ready else 'None'}")

CUDA Available: True
CUDA Device Name: NVIDIA GeForce GTX 1660 Ti


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class NewsDataset(Dataset):
    """Dataset that pairs tokenizer encodings with per-sample labels.

    ``encodings`` is a mapping whose values are indexable by sample position;
    ``labels`` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry from every encoding field, then attach the
        # label as a tensor under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

def fine_tune_bert(train_texts, train_labels, num_labels=4, device=None):
    """Fine-tune ``bert-base-uncased`` as a single-label classifier with a class-weighted loss.

    The texts are tokenized once, split 80/20 into train/validation subsets
    (``random_state=42``) and trained with a ``Trainer`` subclass whose loss is
    cross-entropy weighted by inverse class frequency. The checkpoint with the
    lowest validation loss is reloaded when training ends.

    Args:
        train_texts: Sequence of strings to train on.
        train_labels: Sequence of integer class ids in ``[0, num_labels)``,
            aligned with ``train_texts``.
        num_labels: Number of output classes (defaults to 4).
        device: Torch device for the model. Defaults to ``"cuda"`` when CUDA is
            available, otherwise ``"cpu"``.

    Returns:
        The fine-tuned sequence-classification model.

    Raises:
        ValueError: If texts and labels differ in length, or a label lies
            outside ``[0, num_labels)``.
    """
    import inspect

    if len(train_texts) != len(train_labels):
        raise ValueError(
            f"train_texts ({len(train_texts)}) and train_labels "
            f"({len(train_labels)}) must have the same length"
        )
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Inverse-frequency class weights. `minlength` keeps the vector at
    # `num_labels` entries even when the highest class ids are absent from the
    # data; otherwise CrossEntropyLoss rejects the weight tensor.
    label_array = np.asarray(train_labels, dtype=np.int64)
    class_counts = np.bincount(label_array, minlength=num_labels)
    if class_counts.size > num_labels:
        raise ValueError(
            f"train_labels contains class id {class_counts.size - 1}, "
            f"but num_labels={num_labels}"
        )
    class_weights = torch.tensor(1.0 / (class_counts + 1e-6), dtype=torch.float32)

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels,
        problem_type="single_label_classification"
    ).to(device)

    # Tokenize text
    train_encodings = tokenizer(
        list(train_texts),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    train_dataset = NewsDataset(train_encodings, label_array.tolist())

    # Split indices for train/validation
    indices = list(range(len(train_dataset)))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)
    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(train_dataset, val_indices)

    # Newer transformers releases name this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Use whichever is accepted.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    # Training arguments: evaluate and checkpoint every epoch, then reload the
    # checkpoint with the lowest validation loss. This is best-checkpoint
    # selection, not early stopping; all epochs always run.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,  # Reduced from 5
        per_device_train_batch_size=8,
        learning_rate=2e-5,
        warmup_steps=100,  # Add warmup
        weight_decay=0.01,
        fp16=False,
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        **{eval_key: "epoch"},
    )

    # The optimizer and LR schedule are left to Trainer, driven by the
    # learning_rate / weight_decay / warmup_steps set above. A separately
    # constructed optimizer or scheduler has no effect unless it is passed to
    # Trainer, and per-epoch hooks belong on a TrainerCallback, not on a
    # Trainer subclass.
    class WeightedTrainer(Trainer):
        """Trainer whose loss is cross-entropy weighted by ``class_weights``."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            # Move the weights to wherever the logits live, so this also works
            # if Trainer placed the model on a device other than `device`.
            loss = torch.nn.CrossEntropyLoss(
                weight=class_weights.to(logits.device)
            )(logits, labels)
            return (loss, outputs) if return_outputs else loss

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_subset,
        eval_dataset=val_subset,
    )

    trainer.train()
    return model

In [10]:
def pseudo_label(model, tokenizer, unlabeled_texts, confidence_threshold=0.95,
                 batch_size=32, max_length=512):
    """Predict labels for unlabeled texts and keep only confident predictions.

    Args:
        model: Classification model; calling it with the tokenizer output must
            return an object exposing ``.logits`` of shape (batch, classes).
        tokenizer: Callable that turns a list of strings into model inputs and
            whose result supports ``.to(device)``.
        unlabeled_texts: Sequence of strings to label.
        confidence_threshold: A prediction is kept only if its softmax
            probability is strictly greater than this value.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer (512 matches the
            value used when tokenizing the training data).

    Returns:
        ``(pseudo_texts, pseudo_labels)``: the accepted texts in their input
        order, and the predicted class id (``int``) for each of them.
    """
    model.eval()

    # Run inference on whatever device the model lives on, rather than
    # assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    pseudo_texts = []
    pseudo_labels = []

    for start in range(0, len(unlabeled_texts), batch_size):
        batch_texts = list(unlabeled_texts[start:start + batch_size])
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            probs = softmax(outputs.logits, dim=-1)
            confidences, preds = torch.max(probs, dim=-1)

        # Bring the results to the host once per batch; indexing a device
        # tensor element by element would synchronize on every element.
        keep_flags = (confidences > confidence_threshold).cpu().tolist()
        predicted = preds.cpu().tolist()

        for text, label, keep in zip(batch_texts, predicted, keep_flags):
            if keep:
                pseudo_texts.append(text)
                pseudo_labels.append(int(label))

    return pseudo_texts, pseudo_labels

In [None]:
if torch.cuda.is_available():
    # Load labeled data
    labeled_df = pd.read_csv("labeled_data.csv")

    # Text preprocessing for labeled data. fillna/astype first: a missing
    # title or outlet would otherwise turn the concatenated text into NaN.
    labeled_df["text"] = (
        labeled_df["JUDUL"].fillna("").astype(str)
        + " "
        + labeled_df["OUTLET BERITA"].fillna("").astype(str)
    )
    # Derive the integer label column when the CSV does not already provide
    # one: numeric "ISI BERITA", with missing values imputed as class 2.
    # NOTE(review): imputed rows are trained on as if they were real labels;
    # confirm this is intended.
    if "label" not in labeled_df.columns:
        labeled_df["label"] = (
            pd.to_numeric(labeled_df["ISI BERITA"], errors="coerce").fillna(2).astype(int)
        )
    labeled_df["source"] = "original"
    labeled_df["original_index"] = -1  # Mark as original data

    # Load unlabeled data
    unlabeled_df = pd.read_csv("unlabeled_data.csv")
    unlabeled_df["text"] = (
        unlabeled_df["title"].fillna("").astype(str)
        + " "
        + unlabeled_df["content"].fillna("").astype(str)
    )
    unlabeled_df["source"] = "unlabeled"
    unlabeled_df["original_index"] = unlabeled_df.index  # Track original indices

    # Everything the loop needs is defined here, so the cell runs on a fresh
    # kernel without relying on variables left behind by another cell.
    current_texts = labeled_df["text"].tolist()
    current_labels = labeled_df["label"].tolist()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Track all pseudo-labeled indices
    pseudo_indices = []
    pseudo_labels = []

    # Iterative bootstrapping
    max_iterations = 3
    iteration = 0
    batch_size = 80
    # `unlabeled_df` stays intact for the final lookup; only these two
    # work lists shrink as batches are consumed.
    unlabeled_texts = unlabeled_df["text"].tolist()
    unlabeled_indices = unlabeled_df.index.tolist()

    while len(unlabeled_texts) > 0 and iteration < max_iterations:
        print(f"\n=== Iteration {iteration + 1} ===")

        # Train model
        model = fine_tune_bert(current_texts, current_labels)

        # Process batch
        batch_texts = unlabeled_texts[:batch_size]
        original_indices = unlabeled_indices[:batch_size]

        # Pseudo-labeling
        new_texts, new_labels = pseudo_label(model, tokenizer, batch_texts)

        # pseudo_label returns the accepted texts in input order, so they form
        # a subsequence of batch_texts. Walk the batch to recover which row
        # each accepted text came from. Duplicate texts resolve to the
        # earliest unused match, which carries the same text and label.
        position = 0
        for text in new_texts:
            position = batch_texts.index(text, position)
            pseudo_indices.append(original_indices[position])
            position += 1
        pseudo_labels.extend(new_labels)

        # Feed the accepted pseudo-labels back into the training set so the
        # next iteration trains on more data than the previous one.
        current_texts.extend(new_texts)
        current_labels.extend(new_labels)

        # Remove processed texts (preserve original indices)
        unlabeled_texts = unlabeled_texts[batch_size:]
        unlabeled_indices = unlabeled_indices[batch_size:]

        iteration += 1

    # Build final ordered dataset. A stable sort keeps the original rows
    # (all keyed -1) in their file order.
    final_ordered = pd.concat([
        labeled_df[["original_index", "text", "label", "source"]],
        unlabeled_df.loc[pseudo_indices].assign(
            label=pseudo_labels,
            source="pseudo"
        )
    ]).sort_values("original_index", ascending=True, kind="mergesort")

    # Save results
    final_ordered.to_csv("ordered_final_dataset.csv", index=False)
    print("\n=== Results Saved ===")
    print("Final ordered dataset: 'ordered_final_dataset.csv'")

Label distribution after imputation:
label
0    36
1    35
2    33
3    17
Name: count, dtype: int64

=== Iteration 1 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.4542,1.36289
2,1.3533,1.440123
3,1.1917,1.513061
4,1.1696,1.453206
5,0.9803,1.43696



=== Iteration 2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.4003,1.626087
2,1.3294,1.36321
3,1.192,1.468374
4,1.122,1.33384
5,0.9788,1.35427



=== Iteration 3 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.4003,1.626087
2,1.3294,1.36321
3,1.192,1.468374
4,1.122,1.33384
5,0.9788,1.35427



=== Iteration 4 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.4003,1.626087
2,1.3294,1.36321
3,1.192,1.468374
4,1.122,1.33384
5,0.9788,1.35427



=== Iteration 5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.4003,1.626087
2,1.3294,1.36321
3,1.192,1.468374
4,1.122,1.33384
5,0.9788,1.35427



=== Results Saved ===
1. Final dataset: 'final_labeled_dataset.csv'
2. Model: './final_model' directory
3. Pseudo-labels: 'pseudo_labels_iteration_*.csv'


In [12]:
if torch.cuda.is_available():
    # Load labeled data
    labeled_df = pd.read_csv("labeled_data.csv")

    # Text preprocessing
    labeled_df["JUDUL"] = labeled_df["JUDUL"].fillna("").astype(str)
    labeled_df["OUTLET BERITA"] = labeled_df["OUTLET BERITA"].fillna("").astype(str)
    labeled_df["text"] = labeled_df["JUDUL"] + " " + labeled_df["OUTLET BERITA"]

    # Label preprocessing
    # NOTE(review): rows whose "ISI BERITA" is missing or non-numeric are
    # assigned class 2 and trained on as if genuine; confirm this is intended.
    labeled_df["label"] = pd.to_numeric(labeled_df["ISI BERITA"], errors="coerce")
    labeled_df["label"] = labeled_df["label"].fillna(2)  # Impute missing with 2
    labeled_df["label"] = labeled_df["label"].astype(int)

    # Add metadata for original labeled data
    labeled_df["source"] = "original"
    labeled_df["original_index"] = -1  # Mark as original data

    # Verify labels
    print("Label distribution after imputation:")
    print(labeled_df["label"].value_counts())

    current_texts = labeled_df["text"].tolist()
    current_labels = labeled_df["label"].tolist()

    # Load unlabeled data
    unlabeled_df = pd.read_csv("unlabeled_data.csv")
    unlabeled_df["title"] = unlabeled_df["title"].fillna("").astype(str)
    unlabeled_df["content"] = unlabeled_df["content"].fillna("").astype(str)
    unlabeled_df["text"] = unlabeled_df["title"] + " " + unlabeled_df["content"]

    # Track original indices
    unlabeled_df["original_index"] = unlabeled_df.index
    unlabeled_records = unlabeled_df[["text", "original_index"]].to_records(index=False).tolist()

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    max_iterations = 3
    iteration = 0
    batch_size = 80
    confidence_threshold = 0.95  # Start with high confidence
    pseudo_history = []  # (original_index, text, label) for every accepted sample
    model = None  # Stays None if the loop never runs

    while len(unlabeled_records) > 0 and iteration < max_iterations:
        print(f"\n=== Iteration {iteration + 1} ===")

        # Train model
        model = fine_tune_bert(current_texts, current_labels)

        # Process batch with index tracking
        batch_records = unlabeled_records[:batch_size]
        batch_texts = [r[0] for r in batch_records]
        batch_indices = [r[1] for r in batch_records]

        # Pseudo-labeling
        new_texts, new_labels = pseudo_label(model, tokenizer, batch_texts, confidence_threshold)

        # pseudo_label returns the accepted texts in input order, so they form
        # a subsequence of batch_texts. Walk the batch to recover the original
        # index of each accepted text. Duplicate texts resolve to the earliest
        # unused match, which carries the same text and label.
        accepted_indices = []
        position = 0
        for text in new_texts:
            position = batch_texts.index(text, position)
            accepted_indices.append(batch_indices[position])
            position += 1

        # Save pseudo-labels for this batch (accepted rows only, so all
        # columns have the same length)
        pd.DataFrame({
            "original_index": accepted_indices,
            "text": new_texts,
            "label": new_labels,
            "iteration": iteration + 1
        }).to_csv(f"pseudo_labels_iteration_{iteration + 1}.csv", index=False)

        # Update datasets
        current_texts.extend(new_texts)
        current_labels.extend(new_labels)
        pseudo_history.extend(zip(accepted_indices, new_texts, new_labels))
        unlabeled_records = unlabeled_records[batch_size:]

        # Adjust confidence threshold dynamically. Divide by the actual batch
        # length: the last batch can be shorter than batch_size.
        success_rate = len(new_texts) / len(batch_records)
        if success_rate < 0.3:  # If too few samples pass
            confidence_threshold = max(0.85, confidence_threshold - 0.05)

        iteration += 1

    # Create final ordered dataset from the recorded history; the consumed
    # `unlabeled_records` list no longer holds the processed rows.
    pseudo_df = pd.DataFrame(pseudo_history, columns=["original_index", "text", "label"])
    pseudo_df = pseudo_df.astype({"original_index": int, "label": int})
    pseudo_df["source"] = "pseudo"
    final_df = pd.concat([
        labeled_df[["original_index", "text", "label", "source"]],
        pseudo_df
    ])

    # Sort by original index (original labels first, then pseudo in processing
    # order). A stable sort keeps the -1 rows in their file order.
    final_df = final_df.sort_values("original_index", ascending=True, kind="mergesort")
    final_df.to_csv("ordered_final_dataset.csv", index=False)

    # Persist the last trained model so the summary below is accurate.
    if model is not None:
        model.save_pretrained("./final_model")
        tokenizer.save_pretrained("./final_model")

    print("\n=== Results Saved ===")
    print("1. Final dataset: 'ordered_final_dataset.csv'")
    if model is not None:
        print("2. Model: './final_model' directory")
    else:
        print("2. Model: not saved (no training iteration ran)")
    print("3. Pseudo-labels: 'pseudo_labels_iteration_*.csv'")

Label distribution after imputation:
label
0    36
1    35
2    33
3    17
Name: count, dtype: int64

=== Iteration 1 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 