In [7]:
import torch
import pandas as pd
import re
from string import punctuation
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch import nn, optim
from tqdm import tqdm

# GPU/CPU device setup
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

# Data cleaning function
def clean_text(tweet):
    tweet = re.sub(r"@[A-Za-z0-9_-]+", 'USR', tweet)
    tweet = re.sub(r"http\S+", 'URL', tweet)
    tweet = tweet.replace('\n', ' ').replace('\t', ' ')
    tweet = tweet.translate(str.maketrans('', '', punctuation))
    tweet = tweet.replace('„', '').replace('“', '')
    tweet = re.sub(r'[^\w\s,]', '', tweet)
    # tweet = tweet.strip().lower()
    return tweet

# Dataset class
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=280):
        self.tok = tokenizer
        self.cleaned_tweets = data['text'].apply(lambda x: clean_text(x))
        self.tweets = list(self.cleaned_tweets.apply(self.tok.encode, max_length=max_length, truncation=True))
        self.labels = list(data['label_value'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]
        return tweet, label

# Collate function for DataLoader
def bert_collate(batch):
    batch_size = len(batch)
    tweets = [t for t, _ in batch]
    labels = torch.tensor([l for _, l in batch]).long()
    max_len = max(len(t) for t in tweets)
    tweets_pad = torch.zeros((batch_size, max_len)).long()
    masks_pad = torch.zeros((batch_size, max_len)).long()
    for i, t in enumerate(tweets):
        tweets_pad[i, :len(t)] = torch.tensor(t)
        masks_pad[i, :len(t)] = 1
    return tweets_pad, masks_pad, labels

# BERT Classifier
class BERTClassifier(nn.Module):
    def __init__(self, model_name):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.linear = nn.Linear(768, 2)
        self.dropout = nn.Dropout(0.2)

        # Freeze BERT layers
        for n, p in self.bert.named_parameters():
            p.requires_grad = False

    def forward(self, tweets, masks):
        output_bert = self.bert(tweets, attention_mask=masks)[0].mean(axis=1)
        return self.linear(self.dropout(output_bert))

# Training and validation function
def train_model(model, train_loader, dev_loader, device, optimizer, criterion, epochs=5):
    for epoch_i in range(epochs):
        model.train()
        for i, batch in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            tweets, masks, labels = [t.to(device) for t in batch]
            output = model(tweets, masks)
            loss = criterion(output, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in dev_loader:
                tweets, masks, labels = [t.to(device) for t in batch]
                output = model(tweets, masks)
                max_output = output.argmax(dim=1)
                y_true.extend(labels.tolist())
                y_pred.extend(max_output.tolist())
        print(f"Accuracy after {epoch_i + 1} epoch(s): {accuracy_score(y_true, y_pred)}")

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in test_loader:
            tweets, masks, labels = [t.to(device) for t in batch]
            output = model(tweets, masks)
            max_output = output.argmax(dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(max_output.tolist())
    print('Test accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred)))
    print('\nClassification report: \n', classification_report(y_true, y_pred))

# Function to run the entire process
def run_bert_training(model_name, language):
    device = get_device()

    # Load dataset
    csv_file = "./resampled_multilingual_all_data.csv"
    resampled_df = pd.read_csv(csv_file)
    df = resampled_df[resampled_df['language'] == language]

    # Split data
    train, test = train_test_split(df, test_size=0.2, stratify=df['label_value'])
    train, dev = train_test_split(train, test_size=0.25, stratify=train['label_value'])

    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_dataset = BERTDataset(train, tokenizer)
    dev_dataset = BERTDataset(dev, tokenizer)
    test_dataset = BERTDataset(test, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=100, collate_fn=bert_collate, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=100, collate_fn=bert_collate)
    test_loader = DataLoader(test_dataset, batch_size=100, collate_fn=bert_collate)

    model = BERTClassifier(model_name)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    train_model(model, train_loader, dev_loader, device, optimizer, criterion)
    evaluate_model(model, test_loader, device)

In [8]:
run_bert_training("bert-base-cased", "eng")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


100%|██████████| 56/56 [00:54<00:00,  1.04it/s]


Accuracy after 1 epoch(s): 0.6314943760042849


100%|██████████| 56/56 [00:53<00:00,  1.05it/s]


Accuracy after 2 epoch(s): 0.659346545259775


100%|██████████| 56/56 [00:54<00:00,  1.03it/s]


Accuracy after 3 epoch(s): 0.6400642742367434


100%|██████████| 56/56 [00:55<00:00,  1.01it/s]


Accuracy after 4 epoch(s): 0.6539903588644885


100%|██████████| 56/56 [00:54<00:00,  1.02it/s]


Accuracy after 5 epoch(s): 0.64542046063203
Test accuracy: 0.65

Classification report: 
               precision    recall  f1-score   support

           0       0.65      0.68      0.66       933
           1       0.66      0.63      0.64       934

    accuracy                           0.65      1867
   macro avg       0.65      0.65      0.65      1867
weighted avg       0.65      0.65      0.65      1867



In [9]:
run_bert_training("bert-base-german-cased", "ger")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

100%|██████████| 56/56 [00:42<00:00,  1.32it/s]


Accuracy after 1 epoch(s): 0.629887520085699


100%|██████████| 56/56 [00:41<00:00,  1.35it/s]


Accuracy after 2 epoch(s): 0.6791644349223352


100%|██████████| 56/56 [00:39<00:00,  1.41it/s]


Accuracy after 3 epoch(s): 0.6705945366898768


100%|██████████| 56/56 [00:40<00:00,  1.39it/s]


Accuracy after 4 epoch(s): 0.6823781467595073


100%|██████████| 56/56 [00:41<00:00,  1.34it/s]


Accuracy after 5 epoch(s): 0.6963042313872523
Test accuracy: 0.70

Classification report: 
               precision    recall  f1-score   support

           0       0.76      0.58      0.65       934
           1       0.66      0.82      0.73       933

    accuracy                           0.70      1867
   macro avg       0.71      0.70      0.69      1867
weighted avg       0.71      0.70      0.69      1867



In [10]:
run_bert_training("bert-base-multilingual-cased", "eng")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

100%|██████████| 56/56 [00:58<00:00,  1.04s/it]


Accuracy after 1 epoch(s): 0.5966791644349223


100%|██████████| 56/56 [00:56<00:00,  1.00s/it]


Accuracy after 2 epoch(s): 0.6063202999464381


100%|██████████| 56/56 [00:56<00:00,  1.02s/it]


Accuracy after 3 epoch(s): 0.6202463845741831


100%|██████████| 56/56 [00:54<00:00,  1.03it/s]


Accuracy after 4 epoch(s): 0.5891805034815212


100%|██████████| 56/56 [00:56<00:00,  1.00s/it]


Accuracy after 5 epoch(s): 0.6207820032137118
Test accuracy: 0.62

Classification report: 
               precision    recall  f1-score   support

           0       0.62      0.65      0.63       933
           1       0.63      0.60      0.62       934

    accuracy                           0.62      1867
   macro avg       0.62      0.62      0.62      1867
weighted avg       0.62      0.62      0.62      1867



In [11]:
run_bert_training("bert-base-multilingual-cased", "ger")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


100%|██████████| 56/56 [00:47<00:00,  1.17it/s]


Accuracy after 1 epoch(s): 0.6572040707016604


100%|██████████| 56/56 [00:47<00:00,  1.19it/s]


Accuracy after 2 epoch(s): 0.6416711301553294


100%|██████████| 56/56 [00:46<00:00,  1.21it/s]


Accuracy after 3 epoch(s): 0.6775575790037494


100%|██████████| 56/56 [00:47<00:00,  1.19it/s]


Accuracy after 4 epoch(s): 0.678093197643278


100%|██████████| 56/56 [00:46<00:00,  1.21it/s]


Accuracy after 5 epoch(s): 0.6759507230851634
Test accuracy: 0.67

Classification report: 
               precision    recall  f1-score   support

           0       0.72      0.57      0.64       933
           1       0.64      0.78      0.71       934

    accuracy                           0.67      1867
   macro avg       0.68      0.67      0.67      1867
weighted avg       0.68      0.67      0.67      1867



In [12]:
run_bert_training("bert-base-uncased", "eng")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 56/56 [00:53<00:00,  1.05it/s]


Accuracy after 1 epoch(s): 0.6325656132833423


100%|██████████| 56/56 [00:53<00:00,  1.04it/s]


Accuracy after 2 epoch(s): 0.6561328334226031


100%|██████████| 56/56 [00:53<00:00,  1.04it/s]


Accuracy after 3 epoch(s): 0.6539903588644885


100%|██████████| 56/56 [00:53<00:00,  1.06it/s]


Accuracy after 4 epoch(s): 0.6497054097482592


100%|██████████| 56/56 [00:52<00:00,  1.07it/s]


Accuracy after 5 epoch(s): 0.6588109266202464
Test accuracy: 0.67

Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.61      0.65       934
           1       0.65      0.72      0.68       933

    accuracy                           0.67      1867
   macro avg       0.67      0.67      0.66      1867
weighted avg       0.67      0.67      0.66      1867



In [13]:
run_bert_training("bert-base-german-cased", "eng")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


100%|██████████| 56/56 [01:16<00:00,  1.37s/it]


Accuracy after 1 epoch(s): 0.576861274772362


100%|██████████| 56/56 [01:17<00:00,  1.38s/it]


Accuracy after 2 epoch(s): 0.5634708087841457


100%|██████████| 56/56 [01:19<00:00,  1.42s/it]


Accuracy after 3 epoch(s): 0.5913229780396357


100%|██████████| 56/56 [01:20<00:00,  1.44s/it]


Accuracy after 4 epoch(s): 0.5865024102838778


100%|██████████| 56/56 [01:18<00:00,  1.40s/it]


Accuracy after 5 epoch(s): 0.550615961435458
Test accuracy: 0.55

Classification report: 
               precision    recall  f1-score   support

           0       0.65      0.22      0.33       933
           1       0.53      0.88      0.66       934

    accuracy                           0.55      1867
   macro avg       0.59      0.55      0.50      1867
weighted avg       0.59      0.55      0.50      1867



In [14]:
run_bert_training("bert-base-cased", "ger")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


100%|██████████| 56/56 [01:14<00:00,  1.34s/it]


Accuracy after 1 epoch(s): 0.5656132833422604


100%|██████████| 56/56 [01:13<00:00,  1.32s/it]


Accuracy after 2 epoch(s): 0.5607927155865025


100%|██████████| 56/56 [01:12<00:00,  1.29s/it]


Accuracy after 3 epoch(s): 0.6009641135511515


100%|██████████| 56/56 [01:12<00:00,  1.29s/it]


Accuracy after 4 epoch(s): 0.5372254954472415


100%|██████████| 56/56 [01:14<00:00,  1.33s/it]


Accuracy after 5 epoch(s): 0.5773968934118907
Test accuracy: 0.58

Classification report: 
               precision    recall  f1-score   support

           0       0.55      0.86      0.67       933
           1       0.68      0.30      0.42       934

    accuracy                           0.58      1867
   macro avg       0.61      0.58      0.54      1867
weighted avg       0.61      0.58      0.54      1867

