In [2]:

from torch import nn
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel, AutoTokenizer
from torch.optim import Adam
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import requests
from bs4 import BeautifulSoup
import os
import sys


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 13.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 68.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [None]:
torch.cuda.empty_cache() # I used this sometimes to clear the cache for OutOfMemoryErrors



In [None]:
# Choose specific GPU for training, if all GPUs are available no need to run this
# torch.cuda.set_device(0)
# torch.cuda.current_device()

0

The following script lets you choose the model you want to train. It can also be used to evaluate: specify the model you want to evaluate (Sentiment, Opinion, etc.), intiate the evaluate function and give the model you want to load in further down

In [10]:

to_be_trained = input("What do you want to train/evaluate? Choose \n Sentiment \n Opinion \n TopicComment \n TopicArticle \n AugmentedSentiment").lower()
### Here we set the labels of the topics, in this case we have 7 labels for comments and 8 for articles
if to_be_trained in ["sentiment", "opinion", "topiccomment", "augmentedsentiment"]:
    dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2check_final_topics.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"], encoding="ISO-8859-1")
elif to_be_trained == "topicarticle":
    dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2check_article.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"], encoding="utf-8")


input_to_column = {"sentiment": "Sentiment", "opinion" : "Opinion", "topiccomment" : "Topic_comment", "augmentedsentiment" : "Sentiment", "topicarticle" : "Topic_article"}
column = input_to_column[to_be_trained]

label_to_id = {x:y for y,x in enumerate(dataset_topic[column].unique()) }

#label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
# label_to_id = {"positive":0, "negative" : 1, "neutral": 2}
print(label_to_id)



dataset_topic = dataset_topic.loc[dataset_topic[column].isin(label_to_id.keys())]


print(dataset_topic)
# dataset_topic = dataset_topic[["Comment", "Topic_comment"]]
dataset_topic = dataset_topic[["Comment", column]]
# dataset_topic = dataset_topic[["Comment", "Topic_article"]]

# dataset_topic = dataset_topic.loc[dataset_topic["Topic_article"].isin(label_to_id.keys())]
# dataset_topic = dataset_topic.loc[dataset_topic["Topic_comment"].isin(label_to_id.keys())]
dataset_topic = dataset_topic.loc[dataset_topic[column].isin(label_to_id.keys())]

# dataset_topic.columns = ["Comment", "Topic"]
dataset_topic.columns = ["Comment", column]
print(len(dataset_topic))
df_train, df_test = np.split(dataset_topic.sample(frac=1, random_state=77), [int(.8*len(dataset_topic))])


##### Concats 400 positive samples
if to_be_trained == "augmentedsentiment":
    augmented_400  =pd.read_csv("annotated_data/data_augment_400_positive.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Comment"], encoding="utf-8", header = None)
    augmented_400  = augmented_400.sample(frac=1, random_state=77).reset_index(drop=True)
    augmented_400.insert(4, 'Sentiment', ["positive" for i in range(len(augmented_400))])
    augmented_400 = augmented_400[["Comment", "Sentiment"]]
    df_train = pd.concat([df_train, augmented_400])
    df_train = df_train.sample(frac=1, random_state=77).reset_index(drop=True)

len(dataset_topic)
dataset_topic.reset_index(drop=True, inplace=True)

df_train.drop(df_train.loc[df_train["Comment"].isnull()].index, axis = 0, inplace = True)
# for x in df_train["Comment"]:
#     if type(x) != str:
#         print(type(x), x)
df_train


{'negative': 0, 'positive': 1, 'neutral': 2}
                                        ID        Date      Time  \
0     40a7c181-1ab8-4d4b-bca5-bf8f4ace27ea  30.04.2020  06:59:00   
1     4736f7ec-6396-4f5f-991b-1f83e5f5df66  07.03.2021  17:19:00   
2     5994b2b8-4e42-49a7-b82c-c78e084df7f7  18.03.2021  13:40:00   
3     15557a4c-0e90-412e-8bd5-03ae3f6a7854  13.02.2021  17:57:00   
4     aecb14c3-12f5-4f76-aab8-c3620a709fa4  22.12.2020  10:18:00   
...                                    ...         ...       ...   
3991  855c1ae5-aa3b-4497-bf62-a2b450564320  18.03.2021  16:23:00   
3992  3d7819e6-f105-4d41-a2c4-1d06b6f51dc1  22.05.2020  07:15:00   
3993  2fbfe25d-a788-409a-844d-fb7bd80bcf44  09.01.2021  13:26:00   
3994  55863de1-bf74-49d1-95fe-08d2fbd51970  15.04.2021  11:50:00   
3995  bd0465c0-4fc7-4762-9731-a0c98baa0ae0  05.05.2021  19:05:00   

      Comment Level              Username   Opinion Sentiment Topic_comment  \
0                 0     Karsten-QGRAZB3WR  negative  negati

Unnamed: 0,Comment,Sentiment
401,Die Einigung für Schulen ist fatal. Frau Eisen...,negative
478,"Vernünftige Entscheidung, mit der Impfung entf...",positive
2488,Wir sind doch nicht in einer (Standard-) Schul...,negative
843,"Wie schön, es geht voran. Bis Ende Mai sind ...",positive
344,"Nun gut, wenn sarkastische Kommentare nicht me...",neutral
...,...,...
1066,Übrigens: das RKI prognostizierte am 12. März ...,neutral
3901,Donny will nach 8 Monaten beginnen Menschen zu...,negative
1393,De lockdau mi sine positive eigeschaft für d'u...,negative
3027,"Deutschlands ""Intelligentia"" regelt ...",negative


In [None]:
# This cell scrapes the articles for their header content, (4000 articles) only needed to run if text is needed

label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "wirtschaft":4, "lockerung":5, "demos":6, "überblick":7}
dataset_topic = pd.read_csv("annotated_data/annotated_data_with_users_and_al_cleaned2check_final_topics.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"], encoding="ISO-8859-1")
dataset_topic = dataset_topic.loc[dataset_topic["Topic_article"].isin(label_to_id.keys())]
dataset_topic.reset_index(drop=True, inplace=True)
for x in range(len(dataset_topic)):
    print(x)

    request = requests.get("https://www.spiegel.de/wissenschaft/medizin/corona-news-am-samstag-die-wichtigsten-entwicklungen-zu-sars-cov-2-und-covid-19-a-" +dataset_topic.loc[int(x), "ID"])
    soup = BeautifulSoup(request.content, "html.parser")
    title = soup.find("title").text
    subtitle = soup.find("meta", property="og:description")["content"]

    dataset_topic.loc[x, "Comment"] = title + " " + subtitle

dataset_topic


In [3]:
### Saves the dataframe with comment cells filled with article text. Sadly not all articles contrained the
### tag searched with soup, so there are some missing articles in the training data set
dataset_topic.to_csv("annotated_data/annotated_data_with_users_and_al_cleaned2check_article.csv", encoding="utf-8", index=False, header=False)

In [8]:
## training loop, classifier arch and dataset class from repo https://gist.github.com/marcellusruben/e4de016da784f4b2debb6ea6f674867d, modified to fit data set and added tensorboard logs
from cProfile import label


data_text = []
data_labels = []
text_train = []
labels_train = []
text_test = np.array([])
labels_test = np.array([])


tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")



# For Topics Article
# label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6, "überblick":7 }
# id_to_label = {0:"politik", 1:"maßnahmen", 2:"infektion", 3: "impfung", 4: "lockdown", 5:"wirtschaft", 6:"lockerung", 7:"überblick"}


# For Topics Comment
# label_to_id = {"politik" : 0, "maßnahmen" : 1, "infektion" : 2, "impfung": 3, "lockdown": 4, "wirtschaft":5, "lockerung":6}
# id_to_label = {0:"politik", 1:"maßnahmen", 2:"infektion", 3: "impfung", 4: "lockdown", 5:"wirtschaft", 6:"lockerung"}

# For Sentiment
# label_to_id = {"positive":0, "negative" : 1, "neutral": 2}
# id_to_label = {0:"positive", 1:"negative", 2 : "neutral"}


label_to_id = {x:y for y,x in enumerate(dataset_topic[column].unique()) }
id_to_label = {x:y for x,y in enumerate(dataset_topic[column].unique()) }
print(label_to_id)

### Here we create a training set that can be used to compare across different sizes of training data



class Dataset(torch.utils.data.Dataset):

    def __init__(self,dataframe):
        self.labels = [label_to_id[label] for label in dataframe[column] if label in label_to_id.keys()]
        self.texts = [tokenizer(txt, padding ="max_length", max_length = 512, truncation=True, return_tensors="pt") for txt in dataframe["Comment"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_texts, batch_labels
# ### Define test and train set
# text_test = text_train[split_index+1:]
# text_train = text_train[:split_index]
# labels_test = labels_train[split_index+1:]
# labels_train = labels_train[:split_index]


### Replace label as int
for idx, labels in enumerate(data_labels):
    data_labels[idx] = label_to_id[labels]


class ClassifierText(nn.Module):
    def __init__(self, dropout = 0.7):
        super(ClassifierText, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-german-cased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(dataset_topic[column].unique()))
        print(self.linear)
        self.relu = nn.ReLU()
    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer




{'lockerung': 0, 'überblick': 1, 'politik': 2, 'wirtschaft': 3, 'impfung': 4, 'demos': 5, 'infektion': 6, 'maßnahmen': 7}


In [None]:
torch.cuda.empty_cache()


In [None]:

def train(model, train_data, test_data, learning_rate, epochs):

    train, test = Dataset(train_data), Dataset(test_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu") #"cuda:3" if use_cuda else "cpu"
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

                for test_input, test_label in test_dataloader:

                    test_label = test_label.to(device)
                    mask = test_input['attention_mask'].to(device)
                    input_id = test_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, test_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == test_label).sum().item()
                    total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(test_data): .3f} \
            | Val Accuracy: {total_acc_val / len(test_data): .3f}')
        writer.add_scalar("Loss/Train", total_loss_train, epoch_num) 
        writer.add_scalar("Loss/Test", total_loss_val, epoch_num)
        writer.add_scalar("Accuracy/Train", total_acc_train, epoch_num)
        writer.add_scalar("Accuracy/Test", total_acc_val, epoch_num)
            
EPOCHS = 4
model = ClassifierText()
LR = 1e-5

train(model, df_train, df_test, LR, EPOCHS)
# for train_index, test_index in folds.split(data_text, data_labels):
#     text_train, labels_train  = data_text[train_index], data_labels[train_index]
#     text_test, labels_test  = data_text[test_index], data_labels[test_index]

#     train_dict = {"texts": text_train, "labels" : labels_train}
#     test_dict = {"texts": text_test, "labels" : labels_test}              
#     train(model, text_train, LR, EPOCHS)

In [12]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
df_test.drop(df_test.loc[df_test["Comment"].isnull()].index, axis = 0, inplace = True)




In [49]:
# Modelname zum Speichern angeben
torch.save(model.state_dict(),"Model_OpinionFULL.pt")

In [14]:
# Load model here
model = ClassifierText()
model.load_state_dict(torch.load("mein_model/Model_SentimentFULL.pt"))
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Linear(in_features=768, out_features=3, bias=True)


In [15]:
# evaluate here
geladenes_model = model
evaluate(geladenes_model, df_test)

Test Accuracy:  0.693


In [47]:
# This can be used to get the accuracy per class
from numpy.ma.core import argmax
from transformers import TextClassificationPipeline

use_cuda = torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")

if use_cuda:

    mein_model = geladenes_model.cuda()

correct_per_topic = {x:0 for x in label_to_id.keys()}
true = 0
with torch.no_grad():

  for _, sample in df_test.iterrows():
    input_ids = tokenizer(sample["Comment"], return_tensors="pt", truncation=True)["input_ids"].to(device)

    masks = tokenizer(sample["Comment"], return_tensors="pt", truncation=True)["attention_mask"].to(device)
    
    prediction = model( input_ids, masks)
 
    class_pred = np.argmax(prediction.cpu()).item()
    
    topic = label_to_id[sample[column]]
#     topic = label_to_id[sample["Sentiment"]]

    #print(class_pred, topic)
    
    
    if class_pred == topic:
      true+= 1
      correct_per_topic[id_to_label[topic]] += 1


print(true/len(df_test))

for topic in df_test[column].unique():
    correct_per_topic[topic] /= df_test[column].value_counts()[topic]

    
# for sentiment in df_test["Sentiment"].unique():
#   correct_per_topic[sentiment] /= df_test["Sentiment"].value_counts()[sentiment]

#     prediction = model(mask=masks, input_id=input_ids)

#     topic = np.argmax(prediction)

#     topic
print(correct_per_topic)
# print(df_train["Topic"].value_counts())
print(df_test[column].value_counts())

0.515
{'negative': 0.7025316455696202, 'positive': 0.0, 'neutral': 0.6148867313915858}
negative    316
neutral     309
positive    175
Name: Opinion, dtype: int64
