# Deep learning-based approaches

## RobBERT

In [None]:
!pip install transformers
!pip install imbalanced-learn
!pip install torch
!pip install accelerate -U
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
import torch
import numpy as np
import random
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset
from imblearn.over_sampling import RandomOverSampler

# Make every random number generator used in this notebook deterministic
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode and disables its benchmark mode.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix the global random state once, before any weights are initialised or training starts
set_seed(seed=42)

# Read one CSV file and hand back its text and label columns
def load_dataset(filename):
    """Load a CSV and return its `text` and `labels` columns as a pair of Series."""
    frame = pd.read_csv(filename)
    texts, labels = frame['text'], frame['labels']
    return texts, labels

# Normalise a raw label to 0/1/2; anything else is flagged as -1 (no check is raised here)
def map_labels(label):
    """Return 0 (negative), 1 (neutral) or 2 (positive); any other value maps to -1 (unknown)."""
    for known in (0, 1, 2):  # 0 = negative, 1 = neutral, 2 = positive
        if label == known:
            return known
    return -1  # unknown

# Tokenize a batch of examples with the module-level tokenizer
def tokenize_function(examples):
    """Encode `examples['text']` with truncation enabled and padding='max_length'.

    Relies on the global `tokenizer` being defined before this is called.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "pdelobelle/robbert-v2-dutch-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Three output classes: negative / neutral / positive
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

# One CSV file per decade
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Class ids and display names, fixed so every report/matrix has the same 3x3 layout
class_ids = [0, 1, 2]
class_names = ["negative", "neutral", "positive"]

# Fine-tune and evaluate one freshly initialised model per dataset
for dataset_path in dataset_paths:
    dataset_name = dataset_path.split(".")[0]
    print(f"Processing {dataset_name}...")

    X, y = load_dataset(dataset_path)

    # Normalise labels; map_labels marks anything outside {0, 1, 2} as -1
    y = y.apply(map_labels)

    # Fail fast on unknown labels: a -1 target is not a valid class for the
    # 3-class head and would otherwise only surface as an error during training.
    n_unknown = int((y == -1).sum())
    if n_unknown:
        raise ValueError(
            f"{dataset_path}: {n_unknown} row(s) carry a label outside {{0, 1, 2}}"
        )

    df = pd.DataFrame({'text': X, 'label': y})

    # Stratified 70/15/15 split: 15% test first, then 0.1765 of the remaining
    # 85% (0.15 / 0.85) so the validation set is also ~15% of the full data.
    train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
    train_df, val_df = train_test_split(train_val_df, test_size=0.1765, random_state=42, stratify=train_val_df['label'])

    # Oversample the training split only; validation and test keep their
    # original class distribution.
    oversampler = RandomOverSampler(random_state=42)
    train_df_resampled, train_labels_resampled = oversampler.fit_resample(train_df[['text']], train_df['label'])
    train_df_resampled['label'] = train_labels_resampled

    # Convert pandas DataFrames to Hugging Face Datasets (drop the shuffled
    # pandas index so it does not become an extra column)
    train_dataset = Dataset.from_pandas(train_df_resampled, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    # Tokenize, then expose only the tensors the model consumes
    model_columns = ['input_ids', 'attention_mask', 'label']
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=model_columns)
    val_dataset.set_format(type='torch', columns=model_columns)
    test_dataset.set_format(type='torch', columns=model_columns)

    # Start every dataset from the pretrained checkpoint. Re-using a single
    # model object across iterations would carry the weights fine-tuned on the
    # previous dataset into the next run, making the per-dataset scores depend
    # on processing order. Reseeding first gives each run the same head init.
    set_seed(42)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # NOTE(review): `evaluation_strategy` is the argument name of the
    # transformers release this notebook was run with; newer releases rename it.
    # NOTE(review): output_dir/logging_dir are the same paths the BERTje section
    # uses, so checkpoints of the two models end up in the same folders.
    training_args = TrainingArguments(
        output_dir=f'./results/{dataset_name}',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f'./logs/{dataset_name}',
        # 4 epochs: matches the recorded output below ('epoch': 4.0) and the
        # BERTje run, so both models are compared under the same budget.
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        logging_steps=10,
        load_best_model_at_end=True,
        learning_rate=1e-4,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Evaluate on the held-out test split
    print(f"Evaluating {dataset_name}...")
    eval_result = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Test Set Evaluation for {dataset_name}:\n", eval_result)

    # Predicted classes and the matching gold labels, both taken from the same
    # prediction output so they are guaranteed to be aligned arrays
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    true_labels = predictions.label_ids

    # Passing `labels` keeps all three classes in the report even if a tiny test
    # set happens to miss one; zero_division=0 reports 0.0 for never-predicted
    # classes without emitting a warning per metric.
    report = classification_report(
        true_labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
    print(f"Classification Report for {dataset_name}:\n", report)

    cm = confusion_matrix(true_labels, preds, labels=class_ids)
    print(f"Confusion Matrix for {dataset_name}:\n", cm)

    print("=" * 50)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/733k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/383k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.12M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing 1960s_gas...


Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.197,1.155904
2,0.6822,1.133945
3,0.3503,1.337193
4,0.0942,1.450952


Evaluating 1960s_gas...


Test Set Evaluation for 1960s_gas:
 {'eval_loss': 1.0511798858642578, 'eval_runtime': 1.9975, 'eval_samples_per_second': 32.541, 'eval_steps_per_second': 4.506, 'epoch': 4.0}
Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.45      0.75      0.56        12
     neutral       0.00      0.00      0.00        20
    positive       0.57      0.76      0.65        33

    accuracy                           0.52        65
   macro avg       0.34      0.50      0.40        65
weighted avg       0.37      0.52      0.43        65

Confusion Matrix for 1960s_gas:
 [[ 9  0  3]
 [ 4  0 16]
 [ 7  1 25]]
Processing 1970s_gas...


Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.091561
2,1.044900,1.457587
3,0.374500,1.907759
4,0.220600,1.756206


Evaluating 1970s_gas...


Test Set Evaluation for 1970s_gas:
 {'eval_loss': 0.8900591135025024, 'eval_runtime': 0.5331, 'eval_samples_per_second': 28.138, 'eval_steps_per_second': 3.752, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for 1970s_gas:
               precision    recall  f1-score   support

    negative       1.00      0.33      0.50         3
     neutral       0.00      0.00      0.00         3
    positive       0.64      1.00      0.78         9

    accuracy                           0.67        15
   macro avg       0.55      0.44      0.43        15
weighted avg       0.59      0.67      0.57        15

Confusion Matrix for 1970s_gas:
 [[1 0 2]
 [0 0 3]
 [0 0 9]]
Processing 1980s_gas...


Map:   0%|          | 0/186 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.1853,1.078205
2,1.0755,1.107207
3,0.9841,1.146729
4,0.8113,1.13191


Evaluating 1980s_gas...


Test Set Evaluation for 1980s_gas:
 {'eval_loss': 1.076548457145691, 'eval_runtime': 1.0467, 'eval_samples_per_second': 31.528, 'eval_steps_per_second': 4.777, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for 1980s_gas:
               precision    recall  f1-score   support

    negative       0.45      0.77      0.57        13
     neutral       0.00      0.00      0.00         7
    positive       0.36      0.31      0.33        13

    accuracy                           0.42        33
   macro avg       0.27      0.36      0.30        33
weighted avg       0.32      0.42      0.36        33

Confusion Matrix for 1980s_gas:
 [[10  0  3]
 [ 3  0  4]
 [ 9  0  4]]
Processing 1990s_gas...


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.083399
2,No log,1.103045
3,1.032700,1.073438
4,1.032700,1.074149


Evaluating 1990s_gas...


Test Set Evaluation for 1990s_gas:
 {'eval_loss': 1.0107314586639404, 'eval_runtime': 0.2832, 'eval_samples_per_second': 28.251, 'eval_steps_per_second': 3.531, 'epoch': 4.0}
Classification Report for 1990s_gas:
               precision    recall  f1-score   support

    negative       0.50      0.50      0.50         4
     neutral       0.50      0.50      0.50         2
    positive       0.00      0.00      0.00         2

    accuracy                           0.38         8
   macro avg       0.33      0.33      0.33         8
weighted avg       0.38      0.38      0.38         8

Confusion Matrix for 1990s_gas:
 [[2 0 2]
 [1 1 0]
 [1 1 0]]


## BERTje

In [None]:
!pip install transformers
!pip install imbalanced-learn
!pip install torch
!pip install accelerate -U
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
import torch
import numpy as np
import random
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset
from imblearn.over_sampling import RandomOverSampler

# Make every random number generator used in this notebook deterministic
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode and disables its benchmark mode.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix the global random state once, before any weights are initialised or training starts
set_seed(seed=42)

# Read one CSV file and hand back its text and label columns
def load_dataset(filename):
    """Load a CSV and return its `text` and `labels` columns as a pair of Series."""
    frame = pd.read_csv(filename)
    texts, labels = frame['text'], frame['labels']
    return texts, labels

# Normalise a raw label to 0/1/2; anything else is flagged as -1 (no check is raised here)
def map_labels(label):
    """Return 0 (negative), 1 (neutral) or 2 (positive); any other value maps to -1 (unknown)."""
    for known in (0, 1, 2):  # 0 = negative, 1 = neutral, 2 = positive
        if label == known:
            return known
    return -1  # unknown

# Tokenize a batch of examples with the module-level tokenizer
def tokenize_function(examples):
    """Encode `examples['text']` with truncation enabled and padding='max_length'.

    Relies on the global `tokenizer` being defined before this is called.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "wietsedv/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Three output classes: negative / neutral / positive
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# One CSV file per decade
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Class ids and display names, fixed so every report/matrix has the same 3x3 layout
class_ids = [0, 1, 2]
class_names = ["negative", "neutral", "positive"]

# Fine-tune and evaluate one freshly initialised model per dataset
for dataset_path in dataset_paths:
    dataset_name = dataset_path.split(".")[0]  # use the name from the CSV files
    print(f"Processing {dataset_name}...")

    # Load dataset
    X, y = load_dataset(dataset_path)

    # Normalise labels; map_labels marks anything outside {0, 1, 2} as -1
    y = y.apply(map_labels)

    # Fail fast on unknown labels: a -1 target is not a valid class for the
    # 3-class head and would otherwise only surface as an error during training.
    n_unknown = int((y == -1).sum())
    if n_unknown:
        raise ValueError(
            f"{dataset_path}: {n_unknown} row(s) carry a label outside {{0, 1, 2}}"
        )

    df = pd.DataFrame({'text': X, 'label': y})

    # Stratified 70/15/15 split: 15% test first, then 0.1765 of the remaining
    # 85% (0.15 / 0.85) so the validation set is also ~15% of the full data.
    train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
    train_df, val_df = train_test_split(train_val_df, test_size=0.1765, random_state=42, stratify=train_val_df['label'])

    # Oversample the training split only; validation and test keep their
    # original class distribution.
    oversampler = RandomOverSampler(random_state=42)
    train_df_resampled, train_labels_resampled = oversampler.fit_resample(train_df[['text']], train_df['label'])
    train_df_resampled['label'] = train_labels_resampled

    # Convert pandas DataFrames to Hugging Face Datasets (drop the shuffled
    # pandas index so it does not become an extra column)
    train_dataset = Dataset.from_pandas(train_df_resampled, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    # Tokenize, then expose only the tensors the model consumes
    model_columns = ['input_ids', 'attention_mask', 'label']
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=model_columns)
    val_dataset.set_format(type='torch', columns=model_columns)
    test_dataset.set_format(type='torch', columns=model_columns)

    # Start every dataset from the pretrained checkpoint. Re-using a single
    # model object across iterations would carry the weights fine-tuned on the
    # previous dataset into the next run, making the per-dataset scores depend
    # on processing order. Reseeding first gives each run the same head init.
    set_seed(42)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # NOTE(review): `evaluation_strategy` is the argument name of the
    # transformers release this notebook was run with; newer releases rename it.
    # NOTE(review): output_dir/logging_dir are the same paths the RobBERT section
    # uses, so checkpoints of the two models end up in the same folders.
    training_args = TrainingArguments(
        output_dir=f'./results/{dataset_name}',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f'./logs/{dataset_name}',
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        logging_steps=10,
        load_best_model_at_end=True,
        learning_rate=1e-4,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Evaluate on the held-out test split
    print(f"Evaluating {dataset_name}...")
    eval_result = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Test Set Evaluation for {dataset_name}:\n", eval_result)

    # Predicted classes and the matching gold labels, both taken from the same
    # prediction output so they are guaranteed to be aligned arrays
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    true_labels = predictions.label_ids

    # Passing `labels` keeps all three classes in the report even if a tiny test
    # set happens to miss one; zero_division=0 reports 0.0 for never-predicted
    # classes without emitting a warning per metric.
    report = classification_report(
        true_labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
    print(f"Classification Report for {dataset_name}:\n", report)

    cm = confusion_matrix(true_labels, preds, labels=class_ids)
    print(f"Confusion Matrix for {dataset_name}:\n", cm)

    print("=" * 50)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing 1960s_gas...


Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.1242,1.128516
2,0.4524,1.013467
3,0.1413,1.315793
4,0.0165,1.564197


Evaluating 1960s_gas...


Test Set Evaluation for 1960s_gas:
 {'eval_loss': 1.3837476968765259, 'eval_runtime': 1.8233, 'eval_samples_per_second': 35.65, 'eval_steps_per_second': 4.936, 'epoch': 4.0}
Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.33      0.17      0.22        12
     neutral       0.00      0.00      0.00        20
    positive       0.51      0.85      0.64        33

    accuracy                           0.46        65
   macro avg       0.28      0.34      0.29        65
weighted avg       0.32      0.46      0.36        65

Confusion Matrix for 1960s_gas:
 [[ 2  1  9]
 [ 2  0 18]
 [ 2  3 28]]
Processing 1970s_gas...


Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.172485
2,1.008000,1.619523
3,0.210600,1.842801
4,0.049500,2.056693


Evaluating 1970s_gas...


Test Set Evaluation for 1970s_gas:
 {'eval_loss': 0.9468934535980225, 'eval_runtime': 0.5143, 'eval_samples_per_second': 29.167, 'eval_steps_per_second': 3.889, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for 1970s_gas:
               precision    recall  f1-score   support

    negative       1.00      0.33      0.50         3
     neutral       0.00      0.00      0.00         3
    positive       0.64      1.00      0.78         9

    accuracy                           0.67        15
   macro avg       0.55      0.44      0.43        15
weighted avg       0.59      0.67      0.57        15

Confusion Matrix for 1970s_gas:
 [[1 0 2]
 [0 0 3]
 [0 0 9]]
Processing 1980s_gas...


Map:   0%|          | 0/186 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.1059,1.20226
2,0.6811,1.394272
3,0.28,1.663344
4,0.0953,1.877661


Evaluating 1980s_gas...


Test Set Evaluation for 1980s_gas:
 {'eval_loss': 1.207517147064209, 'eval_runtime': 0.9812, 'eval_samples_per_second': 33.632, 'eval_steps_per_second': 5.096, 'epoch': 4.0}
Classification Report for 1980s_gas:
               precision    recall  f1-score   support

    negative       0.50      0.23      0.32        13
     neutral       0.29      0.71      0.42         7
    positive       0.40      0.31      0.35        13

    accuracy                           0.36        33
   macro avg       0.40      0.42      0.36        33
weighted avg       0.42      0.36      0.35        33

Confusion Matrix for 1980s_gas:
 [[3 5 5]
 [1 5 1]
 [2 7 4]]
Processing 1990s_gas...


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.259754
2,No log,1.440435
3,0.697200,1.602775
4,0.697200,1.728642


Evaluating 1990s_gas...


Test Set Evaluation for 1990s_gas:
 {'eval_loss': 1.1769102811813354, 'eval_runtime': 0.2808, 'eval_samples_per_second': 28.493, 'eval_steps_per_second': 3.562, 'epoch': 4.0}
Classification Report for 1990s_gas:
               precision    recall  f1-score   support

    negative       0.50      0.50      0.50         4
     neutral       0.00      0.00      0.00         2
    positive       0.25      0.50      0.33         2

    accuracy                           0.38         8
   macro avg       0.25      0.33      0.28         8
weighted avg       0.31      0.38      0.33         8

Confusion Matrix for 1990s_gas:
 [[2 0 2]
 [1 0 1]
 [1 0 1]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
