In [1]:
import os.path

import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from datasets import load_dataset, Dataset, load_metric
from IPython.display import display
from scipy.stats import entropy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

In [2]:
# Trainer configuration: results go to ./results, evaluation is requested per
# epoch, and a log entry is requested every 20 steps.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_steps=20,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Accuracy metric object consumed by compute_metrics_accuracy.
metric = load_metric("accuracy")

In [3]:
def compute_metrics_accuracy(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            sample is the argmax of ``logits`` along the last axis.

    Returns:
        Whatever ``metric.compute(predictions=..., references=...)`` returns.

    Side effects:
        Prints the predicted classes and the reference labels.
    """
    logits, labels = eval_pred
    # Relies on the file-level ``import numpy as np``; without it this line
    # raised NameError on the first evaluation.
    predictions = np.argmax(logits, axis=-1)
    print("Predictions", predictions, "Labels", labels)
    return metric.compute(predictions=predictions, references=labels)
        
# Tokenizer and classification model are both loaded from the same
# pretrained checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert"),
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_accuracy,
    # No datasets or collator are attached at construction time.
    train_dataset=None,
    eval_dataset=None,
    data_collator=None,
)

In [None]:

# trainer = Trainer(model= AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert"),
#                 args=training_args,
#                 train_dataset=None,
#                 eval_dataset=tokenized_dataset_test,
#                 tokenizer=tokenizer,
#                 data_collator=data_collator,
#                 compute_metrics = compute_metrics_accuracy)

In [4]:
# Remove pandas' cap on the number of rows rendered when a frame is displayed.
pd.options.display.max_rows = None
def calculate_entropy(logits):
    """Return the entropy of the softmax distribution of every row of ``logits``.

    Args:
        logits: NumPy array of raw scores; the softmax is taken along
            dimension 1.

    Returns:
        A torch tensor built from the array that ``scipy.stats.entropy``
        produces, one value per row of ``logits``.
    """
    class_probs = torch.softmax(torch.from_numpy(logits), dim=1)
    # scipy's entropy reduces along axis 0 by default, so the class axis is
    # moved to the front before the call.
    per_sample = entropy(class_probs.transpose(0, 1).cpu())
    return torch.from_numpy(per_sample)

def preprocess_function(examples):
    """Tokenize the ``"texts"`` entry of ``examples`` with the module-level tokenizer.

    Padding and truncation are both switched on; the tokenizer's return value
    is passed back unchanged.
    """
    batch_texts = examples["texts"]
    encoded = tokenizer(batch_texts, truncation=True, padding=True)
    return encoded

def _count_nonblank_lines(path):
    """Return how many lines of the UTF-8 text file ``path`` are not a bare newline."""
    # Plain read mode: the file is only counted, never written.
    with open(path, encoding="utf-8", mode="r") as handle:
        return sum(1 for line in handle if line != "\n")


def _prompt_choice(prompt, allowed):
    """Call ``input(prompt)`` repeatedly until the answer is one of ``allowed``."""
    answer = input(prompt)
    while answer not in allowed:
        answer = input(prompt)
    return answer


def _fetch_article_headline(article_id):
    """Return ``(title, og:description)`` of the article page for ``article_id``.

    Any piece that cannot be retrieved is returned as an empty string so that
    a network or markup problem does not abort a running annotation session.
    """
    url = ("https://www.spiegel.de/wissenschaft/medizin/corona-news-am-samstag-die-wichtigsten-"
           "entwicklungen-zu-sars-cov-2-und-covid-19-a-" + article_id)
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Could not fetch article {article_id}: {error}")
        return "", ""

    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("title")
    description_tag = soup.find("meta", property="og:description")
    title = title_tag.text if title_tag is not None else ""
    subtitle = description_tag.get("content", "") if description_tag is not None else ""
    return title, subtitle


def get_new_sample_active_learning(number_of_comments):
    """Interactively annotate the most uncertain comments of the next unseen pool.

    Steps:
      1. Count the comments annotated so far: the non-blank lines of
         ``annotated_data/annotated_data_training.txt`` plus the rows of
         ``annotated_data/active_learning_comments.csv`` (if that file exists).
      2. Read the next ``number_of_comments`` rows of
         ``shuffled_corona_relevante_kommentare.txt`` after skipping that many
         rows, and drop rows whose text is already in the active-learning CSV.
      3. Score the remaining texts with ``trainer.predict`` and
         ``calculate_entropy`` and keep the ``int(number_of_comments / 10)``
         highest-entropy ones (fewer if the pool is smaller).
      4. Ask for opinion, sentiment, article topic and comment topic for each
         of them via ``input()``. Answering ``exit`` to the opinion prompt
         stops the session; everything annotated up to that point is kept.
      5. Append the annotated rows to the active-learning CSV.

    Args:
        number_of_comments: Size of the candidate pool read from the scraped
            comments file.

    Returns:
        None. Results are displayed and appended to the CSV.
    """
    raw_columns = ["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment",
                   "topic_comment", "Topic_article", "Comment"]
    # Layout of the rows written out; the comment text moves to "Comment" and
    # the label to "Opinion".
    output_columns = ["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment",
                      "Topic_comment", "Topic_article", "Comment"]
    active_learning_csv = "annotated_data/active_learning_comments.csv"

    already_present_data = _count_nonblank_lines("annotated_data/annotated_data_training.txt")

    ### Texts that were already annotated through active learning. On the very
    ### first run the CSV does not exist yet, which simply means "none so far".
    if os.path.isfile(active_learning_csv):
        progress_csv = pd.read_csv(active_learning_csv, names=raw_columns,
                                   encoding="utf-8-sig", header=None)
        bereits_comments = progress_csv["Comment"].tolist()
    else:
        bereits_comments = []

    ### How many comments (manual random sampled + active learn sampled) are annotated
    progress = already_present_data + len(bereits_comments)

    ### Load the scraped comments, sliced on [progress: progress + number_of_comments]
    alle_kommentare = pd.read_csv("shuffled_corona_relevante_kommentare.txt", names=raw_columns,
                                  delimiter="\t", index_col=False, skiprows=progress,
                                  nrows=number_of_comments)

    ### Drop rows already present in the active-learning CSV so nothing is annotated twice.
    ### In the raw file the comment text lives in the "Opinion" column.
    alle_kommentare = alle_kommentare.loc[~alle_kommentare["Opinion"].isin(bereits_comments)]
    display(alle_kommentare)
    alle_kommentare = alle_kommentare.reset_index(drop=True)

    if alle_kommentare.empty:
        print("No new comments to annotate.")
        return

    ### Tokenize texts, compute entropies and take the top k
    texte = {"texts": alle_kommentare["Opinion"].tolist()}
    texte_ds = Dataset.from_dict(texte)
    tokenized_text = texte_ds.map(preprocess_function, batched=True)
    entropies = calculate_entropy(trainer.predict(tokenized_text).predictions)
    # torch.topk raises when k exceeds the number of elements, which happens
    # whenever de-duplication shrinks the pool below number_of_comments / 10.
    k = min(int(number_of_comments / 10), len(entropies))
    indexes = torch.topk(entropies, k).indices.tolist()

    records = []
    for x in indexes:
        # Positional index == label index because the frame was re-indexed above;
        # looking the row up by index (not by text) is unambiguous for duplicates.
        source_row = alle_kommentare.loc[x]
        satz = texte["texts"][x]

        opinion = _prompt_choice(f"Opinion --- {satz}", ["neutral", "positive", "negative", "exit"])
        if opinion == "exit":
            break
        sentiment = _prompt_choice(f"Sentiment --- {satz}", ["neutral", "positive", "negative"])

        article_id = str(source_row["ID"])
        title, subtitle = _fetch_article_headline(article_id)
        topic_article = input("Topic Article ------" + title + "\n" + subtitle + " " + article_id)
        topic_comment = input("Topic comment")

        records.append({
            "ID": source_row["ID"],
            "Date": source_row["Date"],
            "Time": source_row["Time"],
            "Comment Level": source_row["Comment Level"],
            "Username": source_row["Username"],
            "Opinion": opinion,
            "Sentiment": sentiment,
            "Topic_comment": topic_comment,
            "Topic_article": topic_article,
            "Comment": satz,
        })

        print(opinion, sentiment, topic_article, topic_comment)

    # Build the frame once from the collected records instead of growing it
    # row by row (DataFrame.append no longer exists in pandas 2.x).
    newdf = pd.DataFrame(records, columns=output_columns)
    display(newdf)
    newdf.to_csv(active_learning_csv, mode="a", encoding="utf-8-sig", index=False, header=False)
        



In [6]:
### Read a pool of up to n = 300 not-yet-annotated comments and interactively
### annotate the (up to) n/10 = 30 of them with the highest prediction entropy.

get_new_sample_active_learning(300)

Unnamed: 0,ID,Date,Time,Comment Level,Username,Opinion,Sentiment,topic_comment,Topic_article,Comment
0,90bfa506-9c7a-4444-bcf5-7a6159b4e1ae,2021-03-23,07:14,0,Monty_Burns,Nur mal so: Meine persönlichen Beobachtungen e...,,,,
2,382f315a-911f-418e-be65-643f3131ce01,2020-10-30,04:14,0,Cyman,Sofern China das Ganze wirklich im Griff haben...,,,,
3,738ebbdb-e130-4072-b63d-6123383731b4,2020-10-14,21:41,0,Joachim-sQqsHGxZg,Können wir in Deutschland eigentlich nur noch ...,,,,
7,b0d01e43-2eb0-4055-b13f-8ce94bb3289e,2020-04-05,18:04,0,Juan-D_XOANjZR,Dann möge Spanien aber bitte auch eine Restruk...,,,,
9,ca95ef8d-0ce4-4480-8362-224454c34fa5,2020-11-02,13:54,0,Calenberger,"Eine Covid19-Infektion, nur um sein Idol zu se...",,,,
10,83d0e51b-1c02-4e3d-b94d-fc750248d0f8,2021-01-05,18:41,0,Christian-5_xitdVGg,Das ganze verkommt zur Realsatire. Schließe mi...,,,,
13,1cfcdc0b-d820-4567-88e4-81e3059c5dd3,2020-10-15,10:45,0,HaBe,"Wer ist auf dem Bild ohne Maske? Der Scheuer, ...",,,,
15,33994960-591d-4e0b-b87d-9ab51e0286be,2020-04-11,04:53,0,Ma-ztbrxJXZR,Das Ergebnis der Versuch steht bereits heute f...,,,,
16,529f745f-c1a5-47ad-8be5-7121e770bf66,2021-05-01,22:55,0,Pet2020,Die Deutschen (Foristen) sind schon ein komisc...,,,,
17,3f98f45a-95e0-4eea-95d2-5bf053d2cc41,2020-05-10,06:35,0,Laureus,"Lieber Herr Kurbjuweit, ein „Zentrum der Mach...",,,,


100%|██████████| 1/1 [00:00<00:00,  7.58ba/s]
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: texts.
***** Running Prediction *****
  Num examples = 210
  Batch size = 1
443it [1:51:32, 19.66it/s]                       

neutral neutral wirtschaft wirtschaft
positive neutral überblick impfung
negative negative usa usa
negative negative politik politik
neutral neutral lockdown lockdown
negative negative überblick maßnahmen
neutral neutral überblick maßnahmen
negative negative lockdown lockdown
neutral neutral impfung impfung
positive positive überblick impfung
negative negative infektion infektion
neutral neutral lockdown lockdown
neutral neutral lockdown schnelltests
negative negative politik politik
neutral negative infektion maßnahmen
negative negative lockdown politik
negative negative infektion wirtschaft
positive positive wirtschaft politik
neutral negative infektion infektion
neutral neutral reise reise
neutral negative impfung impfung
positive positive impfung politik
negative negative überblick politik
positive neutral reise reise
neutral neutral lockdown lockdown
neutral neutral überblick infektion
negative negative reise reise
neutral neutral intesivstationen intensivstationen
neutral neutral

Unnamed: 0,ID,Date,Time,Comment Level,Username,Opinion,Sentiment,Topic_comment,Topic_article,Comment
0,10f3286b-4390-4552-b8b2-8cdf8d05a27a,2020-02-20,16:40,0,Pat-Npc2hFPZR,neutral,neutral,wirtschaft,wirtschaft,Hat AirFrance eigentlich jemals Geld verdient ...
1,dd32ef96-caff-4fe5-bdaf-2751afc6dfe1,2021-07-10,07:55,0,Water-lhEZCt-GR,positive,neutral,impfung,überblick,Ich verstehe nicht warum man nicht noch 2 Woch...
2,ef9043f8-4c02-4345-ae32-3a68e86a5cbc,2020-04-29,16:10,0,Bernd-UTQujGGWg,negative,negative,usa,usa,"Laßt uns einfach mal so vorgehen, wie es uns d..."
3,fdfabd58-1a99-47be-93a8-cbdb22dd0e43,2020-04-22,18:56,0,Klaus-RfLFFIEWR,negative,negative,politik,politik,"Endlich wieder Opposition Habe den Eindruck,..."
4,51db8bf3-1660-4689-a132-10b9004362ad,2021-03-24,09:20,0,ebi800,neutral,neutral,lockdown,lockdown,Man wird wahrscheinlich über Test- und Impfstr...
5,aeab9367-d355-4de3-8f11-2f21a0c34939,2021-01-24,13:36,0,Anna-cezlbqjZg,negative,negative,maßnahmen,überblick,In den Niederlanden lassen sich also Menschen ...
6,7abfd5e9-0b6f-43f0-8f5a-2c73ca8f45a5,2021-05-20,09:55,0,Kamillo,neutral,neutral,maßnahmen,überblick,"Die Übersicht ist nett gemeint, aber jetzt mus..."
7,ec18ed82-88ea-4ea0-b1f7-fd0754cc2f4a,2021-04-15,11:06,0,Guido-HQtLHPgGR,negative,negative,lockdown,lockdown,LoL Wieder ein An griff auf berufstätige Elter...
8,2a8d6123-56ad-456a-9de2-a57b6e9aeff5,2021-01-11,21:35,0,Evolutionary_Road,neutral,neutral,impfung,impfung,Ich werfe mal ein Unterthema in den Raum. Mal ...
9,32bcd751-f7aa-4c17-bf5a-ea168fdec71b,2021-06-22,11:01,0,Hopfezupfe,positive,positive,impfung,überblick,"Primrose Hill, bestes Wetter, die Londoner fei..."


In [85]:
# NOTE(review): leftover inspection cell. `x` is not assigned at top level in
# any visible cell and the execution count is out of sequence, so this likely
# fails after Restart Kernel -> Run All; confirm and remove.
x

1448