In [3]:

from torch import nn
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

In [35]:
### Create new dataset with topics and comments

class Dataset(torch.utils.data.Dataset):

    def __init__(self,dataframe):
        self.labels = [label_to_id[label] for label in dataframe["Topic"]]
        self.texts = [tokenizer(txt, padding ="max_length", max_length = 1024, truncation=True, return_tensors="pt") for txt in dataframe["Comment"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_texts, batch_labels

label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"])
dataset_topic = dataset_topic[["Comment", "Topic_comment"]]
dataset_topic = dataset_topic.loc[dataset_topic["Topic_comment"].isin(label_to_id.keys())]
dataset_topic.columns = ["Comment", "Topic"]
dataset_topic["Comment"]
df_train, df_test = np.split(dataset_topic.sample(frac=1, random_state=77), [int(.8*len(dataset_topic))])

Dataset(df_train)

loader = torch.utils.data.DataLoader(Dataset(df_train), batch_size=2, shuffle=True)


[{'input_ids': tensor([[[  101, 10236, 10221,  ...,     0,     0,     0]],

        [[  101, 14427, 16082,  ...,     0,     0,     0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])}, tensor([1, 4], dtype=torch.int32)]
[{'input_ids': tensor([[[  101, 21023, 12382,  ...,     0,     0,     0]],

        [[  101, 14619, 15826,  ...,     0,     0,     0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])}, tensor([1, 2], dtype=torch.int32)]
[{'input_ids': tensor([[[  101,   148, 11945,  ...,     0,     0,     0]],

        [[  101, 70061, 10760,  ...,     0,     0,     0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 

In [4]:
data_text = []
data_labels = []
text_train = []
labels_train = []
text_test = np.array([])
labels_test = np.array([])

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
id_to_label = {0:"politik", 1:"maßnahmen", 2:"infektion", 3: "impfung", 4: "lockdown", 5:"wirtschaft", 6:"lockerung"}

### Here we create a training set that can be used to compare across different sizes of training data

mein_dataframe1 = pd.read_csv() 
mein_dataframe1.columns = ["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment",
                          "topic_comment", "Topic_article", "Comment", "Method"]

display(len(mein_dataframe1))
# split_index = int(len(mein_dataframe1)*0.8)
for index, row in mein_dataframe1.iterrows():
    if row["Sentiment"] in label_to_id.keys():
        data_text.append(row["Comment"])
        data_labels.append(row["Sentiment"])

print(len(data_labels), len(mein_dataframe1.iterrows()))


class Dataset(torch.utils.data.Dataset):

    def __init__(self,dataframe):
        self.labels = [label_to_id[label] for label in dataframe["topic_comment"] if label in ["politik", "maßnahmen", "infektion", "impfung", "lockdown", "wirtschaft", "lockerung"]]
        self.texts = [tokenizer(txt, padding ="max_length", max_length = 1024, truncation=True, return_tensors="pt") for txt in dataframe["Comment"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_texts, batch_labels
# ### Define test and train set
# text_test = text_train[split_index+1:]
# text_train = text_train[:split_index]
# labels_test = labels_train[split_index+1:]
# labels_train = labels_train[:split_index]


### Replace label as int
for idx, labels in enumerate(data_labels):
    data_labels[idx] = label_to_id[labels]



    
    
def preprocess_function(examples):
    return tokenizer(examples["texts"], padding = "max_length", max_length=120, truncation=True)




### Convert to numpy array
data_text = np.array([i for i in data_text])
data_labels = np.array([i for i in data_labels])

class ClassifierText(nn.Module):
    def __init__(self, dropout = 0.5):
        super(ClassifierText, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 7)
        self.relu = nn.ReLU()
    
    def forward_pass(self, input_id, mask):
        _, pooled_output = self.bert(input_id = input_id, attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer




accuracies = []
hello = []
# folds = StratifiedKFold(n_splits=5)
# for train_index, test_index in folds.split(data_text, data_labels):
#     text_train, labels_train  = data_text[train_index], data_labels[train_index]
#     text_test, labels_test  = data_text[test_index], data_labels[test_index]

#     train_dict = {"texts": text_train, "labels" : labels_train}
#     test_dict = {"texts": text_test, "labels" : labels_test}

    # print([len(z) for z in [text_train, text_test]])


    ### Remove newline characters
    # for idx, text in enumerate(train_dict["texts"]):
    #     train_dict["texts"][idx] = text.rstrip("\n")

    # for idx2, text2 in enumerate(test_dict["texts"]):
    #     test_dict["texts"][idx2] = text2.rstrip("\n")
        
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # meine_dataset_train = Dataset.from_dict(train_dict)

    # mein_dataset_test = Dataset.from_dict(test_dict)

    # tokenized_dataset_train = meine_dataset_train.map(preprocess_function, batched= True)
    # tokenized_dataset_test = mein_dataset_test.map(preprocess_function, batched=True)


Downloading: 100%|██████████| 996k/996k [00:01<00:00, 568kB/s]  
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 19.5kB/s]
Downloading: 100%|██████████| 1.96M/1.96M [00:03<00:00, 588kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 432kB/s]


TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [None]:
def train(model, train_data, learning_rate, epochs):

    train = data_text 

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f}'
            )    
                  
EPOCHS = 5
model = ClassifierText()
LR = 1e-6
for train_index, test_index in folds.split(data_text, data_labels):
    text_train, labels_train  = data_text[train_index], data_labels[train_index]
    text_test, labels_test  = data_text[test_index], data_labels[test_index]

    train_dict = {"texts": text_train, "labels" : labels_train}
    test_dict = {"texts": text_test, "labels" : labels_test}              
    train(model, text_train, LR, EPOCHS)

In [10]:
pd.set_option("display.max_rows", None)
meine_daten = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment", "Method"], encoding="utf-8-sig", header=None)
print(meine_daten["topic_comment"].value_counts())
meine_daten.loc[meine_daten["topic_comment"] == "fußball", "topic_comment"] = "sport"
meine_daten.loc[meine_daten["topic_comment"] == "schulschliessung", "topic_comment"] = "lockdown"
meine_daten.loc[meine_daten["topic_comment"] == "homeoffice", "topic_comment"] = "arbeit"
meine_daten.loc[meine_daten["topic_comment"] == "quarantäne", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "warnapp", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "reiseverbot", "topic_comment"] = "reise"
meine_daten.loc[meine_daten["topic_comment"] == "schnelltests", "topic_comment"] = "tests"
meine_daten.loc[meine_daten["topic_comment"] == "inzidenz", "topic_comment"] = "infektion"
meine_daten.loc[meine_daten["topic_comment"] == "schulschließung", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "intensivstation", "topic_comment"] = "krankenhaus"
meine_daten.loc[meine_daten["topic_comment"] == "intensivstationen", "topic_comment"] = "krankenhaus"
meine_daten.loc[meine_daten["topic_comment"] == "demo", "topic_comment"] = "demos"
meine_daten.loc[meine_daten["topic_comment"] == "testen", "topic_comment"] = "tests"
meine_daten.loc[meine_daten["topic_comment"] == "konjunktur", "topic_comment"] = "wirtschaft"
meine_daten.loc[meine_daten["topic_comment"] == "maske", "topic_comment"] = "maskenpflicht"
print(meine_daten["topic_comment"].value_counts())
meine_daten.to_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", header=None, index=None)
meine_daten.loc[meine_daten["topic_comment"] == "neutral", :]

politik          535
maßnahmen        440
impfung          389
infektion        298
lockdown         268
wirtschaft       232
lockerung        104
usa               96
reise             87
demos             84
maskenpflicht     71
virus             69
tests             63
arbeit            57
krankenhaus       29
sport             23
china             22
Name: topic_comment, dtype: int64
politik          535
maßnahmen        440
impfung          389
infektion        298
lockdown         268
wirtschaft       232
lockerung        104
usa               96
reise             87
demos             84
maskenpflicht     71
virus             69
tests             63
arbeit            57
krankenhaus       29
sport             23
china             22
Name: topic_comment, dtype: int64


Unnamed: 0,ID,Date,Time,Comment Level,Username,Opinion,Sentiment,topic_comment,Topic_article,Comment,Method


In [62]:
meine_daten["topic_comment"] == "fußball"

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
30      False
31      False
32      False
33      False
34      False
35      False
36      False
37      False
38      False
39      False
40      False
41      False
42      False
43      False
44      False
45      False
46      False
47      False
48      False
49      False
50      False
51      False
52      False
53      False
54      False
55      False
56      False
57      False
58      False
59      False
60      False
61      False
62      False
63      False
64      False
65      False
66       True
67      False
68      False
69      False
70      False
71    