In [4]:

from torch import nn
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel, AutoTokenizer
from torch.optim import Adam
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import requests
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [25]:
torch.cuda.empty_cache()
pd.set_option('display.max_rows', None)

In [31]:

label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"])
dataset_topic = dataset_topic.loc[dataset_topic["Topic_comment"].isin(label_to_id.keys())]
dataset_topic = dataset_topic[["Comment", "Topic_comment"]]
dataset_topic = dataset_topic.loc[dataset_topic["Topic_comment"].isin(label_to_id.keys())]
dataset_topic.columns = ["Comment", "Topic"]
dataset_topic["Comment"]
df_train, df_test = np.split(dataset_topic.sample(frac=1, random_state=77), [int(.8*len(dataset_topic))])


dataset_topic["Topic"].value_counts()
len(dataset_topic)
dataset_topic.reset_index(drop=True, inplace=True)
dataset_topic["Comment"]

0       Die Stiko stiftet wieder nur Verwirrung, das i...
1       Es werden nie 85% der Erwachsenen voll geimpft...
2       Laumann: Ja, aber viele Menschen fragen uns Po...
3       Wer bricht denn da zusammen - jetzt, kurz vor ...
4       Draußen ist das Risiko doch nun wirklich minim...
5       erneut vermischt SPON den nicht benötigten so ...
6       Wenn nicht bald ordentlich Geld fließt, werden...
7       Den Regierungen wird hier vorgeworfen, mit dem...
8       Eine Impfpflicht für bestimmte Berufsgruppen, ...
9       Frage an die, die Angst (schon mal die Bedeutu...
10      Gähn, das alte Spielchen, Merkel ist unzufried...
11      Ich bin fassungslos das so ein Typ im Bundesta...
12      Manche Foristen sind sich nicht zu Schade und ...
13      Ich finde man sollte alle Tankstellen schließe...
14      Es kann schon kompliziert sein auf der Welt.  ...
15      Wie gut oder schlecht das bei uns läuft kann m...
16      Zitat "ein bisweilen erratisches Krisenmanagem...
17      Also m

In [None]:
label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"])
dataset_topic = dataset_topic.loc[dataset_topic["Topic_comment"].isin(label_to_id.keys())]
dataset_topic.reset_index(drop=True, inplace=True)
for x in range(len(dataset_topic)):
    print(x)
    request = requests.get("https://www.spiegel.de/wissenschaft/medizin/corona-news-am-samstag-die-wichtigsten-entwicklungen-zu-sars-cov-2-und-covid-19-a-" +dataset_topic.loc[int(x), "ID"])
    soup = BeautifulSoup(request.content, "html.parser")
    title = soup.find("title").text
    subtitle = soup.find("meta", property="og:description")["content"]

    dataset_topic.loc[x, "Comment"] = title + " " + subtitle

dataset_topic


In [38]:
dataset_topic["Comment"][0]
dataset_topic.to_csv("annotated_data/annotated_data_with_users_and_al_cleaned_article.csv", encoding="utf-8-sig", index=False, header=False)

In [3]:
data_text = []
data_labels = []
text_train = []
labels_train = []
text_test = np.array([])
labels_test = np.array([])


tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")




label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
id_to_label = {0:"politik", 1:"maßnahmen", 2:"infektion", 3: "impfung", 4: "lockdown", 5:"wirtschaft", 6:"lockerung"}

### Here we create a training set that can be used to compare across different sizes of training data



class Dataset(torch.utils.data.Dataset):

    def __init__(self,dataframe):
        self.labels = [label_to_id[label] for label in dataframe["Topic"] if label in ["politik", "maßnahmen", "infektion", "impfung", "lockdown", "wirtschaft", "lockerung"]]
        self.texts = [tokenizer(txt, padding ="max_length", max_length = 512, truncation=True, return_tensors="pt") for txt in dataframe["Comment"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_texts, batch_labels
# ### Define test and train set
# text_test = text_train[split_index+1:]
# text_train = text_train[:split_index]
# labels_test = labels_train[split_index+1:]
# labels_train = labels_train[:split_index]


### Replace label as int
for idx, labels in enumerate(data_labels):
    data_labels[idx] = label_to_id[labels]


class ClassifierText(nn.Module):
    def __init__(self, dropout = 0.5):
        super(ClassifierText, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-german-cased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 7)
        self.relu = nn.ReLU()
    
    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer




accuracies = []
hello = []
# folds = StratifiedKFold(n_splits=5)
# for train_index, test_index in folds.split(data_text, data_labels):
#     text_train, labels_train  = data_text[train_index], data_labels[train_index]
#     text_test, labels_test  = data_text[test_index], data_labels[test_index]

#     train_dict = {"texts": text_train, "labels" : labels_train}
#     test_dict = {"texts": text_test, "labels" : labels_test}

    # print([len(z) for z in [text_train, text_test]])


    ### Remove newline characters
    # for idx, text in enumerate(train_dict["texts"]):
    #     train_dict["texts"][idx] = text.rstrip("\n")

    # for idx2, text2 in enumerate(test_dict["texts"]):
    #     test_dict["texts"][idx2] = text2.rstrip("\n")
        
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # meine_dataset_train = Dataset.from_dict(train_dict)

    # mein_dataset_test = Dataset.from_dict(test_dict)

    # tokenized_dataset_train = meine_dataset_train.map(preprocess_function, batched= True)
    # tokenized_dataset_test = mein_dataset_test.map(preprocess_function, batched=True)


In [4]:
def train(model, train_data, learning_rate, epochs):

    train = Dataset(train_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True)


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f}'
        )    
                  
EPOCHS = 20
model = ClassifierText()
LR = 1e-6
train(model, df_train, LR, EPOCHS)
# for train_index, test_index in folds.split(data_text, data_labels):
#     text_train, labels_train  = data_text[train_index], data_labels[train_index]
#     text_test, labels_test  = data_text[test_index], data_labels[test_index]

#     train_dict = {"texts": text_train, "labels" : labels_train}
#     test_dict = {"texts": text_test, "labels" : labels_test}              
#     train(model, text_train, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [5]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

# evaluate(model, df_test)

# torch.save(model, "mein_model.pt")

In [28]:
model = ClassifierText()
model.load_state_dict(torch.load("state_dict"))


Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [29]:
geladenes_model = model
evaluate(geladenes_model, df_test)

Test Accuracy:  0.652


In [30]:
from numpy.ma.core import argmax
from transformers import TextClassificationPipeline

use_cuda = torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")


if use_cuda:

    mein_model = geladenes_model.cuda()

correct_per_topic = {"politik" : 0, "maßnahmen" : 0, "infektion" : 0, "impfung": 0, "lockdown": 0, "wirtschaft":0, "lockerung":0}
true = 0
with torch.no_grad():


  for _, sample in df_test.iterrows():

    input_ids = tokenizer(sample["Comment"], return_tensors="pt")["input_ids"].to(device)

    masks = tokenizer(sample["Comment"], return_tensors="pt")["attention_mask"].to(device)

    prediction = model( input_ids, masks)

 
    class_pred = np.argmax(prediction.cpu()).item()

    topic = label_to_id[sample["Topic"]]

    #print(class_pred, topic)
    
    
    if class_pred == topic:
      true+= 1
      correct_per_topic[id_to_label[topic]] += 1


print(true/len(df_test))

for topic in df_test["Topic"].unique():
  correct_per_topic[topic] /= df_test["Topic"].value_counts()[topic]


    # prediction = model(mask=masks, input_id=input_ids)

    # topic = np.argmax(prediction)

    # topic
print(correct_per_topic)
print(df_train["Topic"].value_counts())

0.6606334841628959
{'politik': 0.7757009345794392, 'maßnahmen': 0.6075949367088608, 'infektion': 0.6507936507936508, 'impfung': 0.8421052631578947, 'lockdown': 0.43103448275862066, 'wirtschaft': 0.6888888888888889, 'lockerung': 0.0}
politik       433
maßnahmen     325
impfung       320
infektion     227
lockdown      199
wirtschaft    175
lockerung      89
Name: Topic, dtype: int64


In [169]:
df_test.iloc[0]["Topic"]

'politik'

In [None]:
pd.set_option("display.max_rows", None)
meine_daten = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment", "Method"], encoding="utf-8-sig", header=None)
print(meine_daten["topic_comment"].value_counts())
meine_daten.loc[meine_daten["topic_comment"] == "fußball", "topic_comment"] = "sport"
meine_daten.loc[meine_daten["topic_comment"] == "schulschliessung", "topic_comment"] = "lockdown"
meine_daten.loc[meine_daten["topic_comment"] == "homeoffice", "topic_comment"] = "arbeit"
meine_daten.loc[meine_daten["topic_comment"] == "quarantäne", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "warnapp", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "reiseverbot", "topic_comment"] = "reise"
meine_daten.loc[meine_daten["topic_comment"] == "schnelltests", "topic_comment"] = "tests"
meine_daten.loc[meine_daten["topic_comment"] == "inzidenz", "topic_comment"] = "infektion"
meine_daten.loc[meine_daten["topic_comment"] == "schulschließung", "topic_comment"] = "maßnahmen"
meine_daten.loc[meine_daten["topic_comment"] == "intensivstation", "topic_comment"] = "krankenhaus"
meine_daten.loc[meine_daten["topic_comment"] == "intensivstationen", "topic_comment"] = "krankenhaus"
meine_daten.loc[meine_daten["topic_comment"] == "demo", "topic_comment"] = "demos"
meine_daten.loc[meine_daten["topic_comment"] == "testen", "topic_comment"] = "tests"
meine_daten.loc[meine_daten["topic_comment"] == "konjunktur", "topic_comment"] = "wirtschaft"
meine_daten.loc[meine_daten["topic_comment"] == "maske", "topic_comment"] = "maskenpflicht"
print(meine_daten["topic_comment"].value_counts())
meine_daten.to_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", header=None, index=None)
meine_daten.loc[meine_daten["topic_comment"] == "neutral", :]

politik          535
maßnahmen        440
impfung          389
infektion        298
lockdown         268
wirtschaft       232
lockerung        104
usa               96
reise             87
demos             84
maskenpflicht     71
virus             69
tests             63
arbeit            57
krankenhaus       29
sport             23
china             22
Name: topic_comment, dtype: int64
politik          535
maßnahmen        440
impfung          389
infektion        298
lockdown         268
wirtschaft       232
lockerung        104
usa               96
reise             87
demos             84
maskenpflicht     71
virus             69
tests             63
arbeit            57
krankenhaus       29
sport             23
china             22
Name: topic_comment, dtype: int64


Unnamed: 0,ID,Date,Time,Comment Level,Username,Opinion,Sentiment,topic_comment,Topic_article,Comment,Method


In [16]:
pd.set_option("max_rows", None)
meine_daten = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment", "Method"], encoding="utf-8-sig", header=None)
meine_daten.loc[meine_daten["Topic_article"] == "intensivstation", "Topic_article"] = "krankenhaus"
meine_daten.loc[meine_daten["Topic_article"] == "homeoffice", "Topic_article"] = "arbeit"
meine_daten.loc[meine_daten["Topic_article"] == "schule", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "reiseverbot", "Topic_article"] = "reise"
meine_daten.loc[meine_daten["Topic_article"] == "quarantäne", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "schulschließung", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "demo", "Topic_article"] = "demos"
meine_daten.loc[meine_daten["Topic_article"] == "warnapp", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "intensivstationen", "Topic_article"] = "krankenhaus"
meine_daten.loc[meine_daten["Topic_article"] == "lockerungen", "Topic_article"] = "lockerung"
meine_daten.loc[meine_daten["Topic_article"] == "konjunktur", "Topic_article"] = "wirtschaft"
meine_daten.loc[meine_daten["Topic_article"] == "schulschliessung", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "inzidenz", "Topic_article"] = "infektion"
meine_daten.loc[meine_daten["Topic_article"] == "kontaktbeschränkung", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "schnelltests", "Topic_article"] = "tests"
meine_daten.loc[meine_daten["Topic_article"] == "verschärfungen", "Topic_article"] = "maßnahmen"
meine_daten.loc[meine_daten["Topic_article"] == "intesivstation", "Topic_article"] = "krankenhaus"
meine_daten.loc[meine_daten["Topic_article"] == "maskenplficht", "Topic_article"] = "maskenpflicht"
meine_daten.loc[meine_daten["Topic_article"] == "fußball", "Topic_article"] = "sport"
meine_daten = meine_daten[meine_daten.Topic_article != "XXX"]
meine_daten.to_csv("annotated_data/annotated_data_with_users_and_al_cleaned2x.csv", header=None, index=None)
meine_daten["Topic_article"].value_counts()

überblick        691
maßnahmen        313
infektion        269
wirtschaft       251
lockdown         238
impfung          224
politik          180
lockerung        112
usa               93
reise             80
demos             71
virus             60
maskenpflicht     47
arbeit            47
tests             35
krankenhaus       32
sport             26
china             10
Name: Topic_article, dtype: int64

In [7]:
import tensorflow as tf
tf.test.gpu_device_name()

''