In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
from IPython.display import display
from scipy.stats import entropy
from datasets import load_dataset, Dataset, load_metric
import os.path

In [4]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=20,
    evaluation_strategy="epoch"
    )
metric = load_metric("accuracy")

In [5]:
def compute_metrics_accuracy(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        print("Predictions", predictions, "Labels", labels)
        return metric.compute(predictions=predictions, references=labels)
        
tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")
trainer = Trainer(model= AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert"),
                args=training_args,
                train_dataset=None,
                eval_dataset=None,
                tokenizer=tokenizer,
                data_collator=None,
                compute_metrics = compute_metrics_accuracy)

In [None]:

# trainer = Trainer(model= AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert"),
#                 args=training_args,
#                 train_dataset=None,
#                 eval_dataset=tokenized_dataset_test,
#                 tokenizer=tokenizer,
#                 data_collator=data_collator,
#                 compute_metrics = compute_metrics_accuracy)

In [14]:
def calculate_entropy(logits):
    probas = torch.nn.Softmax(dim=1)(torch.from_numpy(logits))
    samples_entropy = entropy(probas.transpose(0, 1).cpu())
    samples_entropy = torch.from_numpy(samples_entropy)
    return samples_entropy

def preprocess_function(examples):
    return tokenizer(examples["texts"], padding = True, truncation=True)

def get_new_sample_active_learning(number_of_comments):

    ### We read in the csv that stores all of our already annotated data
    df = pd.read_csv("annotated_data/annotated_data_with_users.csv", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment",
                     "topic_comment", "Topic_article", "Comment", "Method"])


    ### Check if we already have active learned comments that we annotated
    if os.path.isfile('annotated_data/active_learning_comments.csv'):
        progress_csv =pd.read_csv("annotated_data/active_learning_comments.csv",names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment"], encoding="utf-8-sig", header=None)


    ### Stores comments as string that we already have in active learning csv
    bereits_comments = []
    for comment in progress_csv.Comment:
        bereits_comments.append(comment)

    ### Int which shows how many comments (manual random sampled + active learn sampled) we already have annotated
    progress = len(df) + len(bereits_comments)


    ### Load in all scraped comments from web in csv, sliced on [progress: progress + number_of_comments]
    alle_kommentare = pd.read_csv("shuffled_corona_relevante_kommentare.txt", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment"], 
                                    delimiter="\t", index_col=False, skiprows=progress, nrows=number_of_comments )
    


    ### Drop all rows that are already present in the active learned csv to we don't annotate twice
    print(~alle_kommentare["Opinion"].isin(bereits_comments))
    alle_kommentare = alle_kommentare.loc[~alle_kommentare["Opinion"].isin(bereits_comments)]
    display(alle_kommentare)

    ### Tokenize texts and get entropy, take topk 
    texte = {"texts" : [x for x in alle_kommentare["Opinion"]]}
    texte_ds= Dataset.from_dict(texte)
    tokenized_text = texte_ds.map(preprocess_function, batched=True)
    entropies = calculate_entropy(trainer.predict(tokenized_text).predictions)
    indexes = torch.topk(entropies, int(number_of_comments/10)).indices
    
    # if os.path.isfile('annotated_data/active_learning_comments.csv'):
    #     progress_csv =pd.read_csv("annotated_data/active_learning_comments.csv",names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment"], encoding="utf-8-sig", header=None)
    #     print(type(progress_csv))
    #     display(progress_csv)
    #     for row, index in progress_csv.iterrows():
    #         print(row, index)
    newdf = pd.DataFrame(columns=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "Topic_comment", "Topic_article", "Comment", "Method"])
    for nummer, x in enumerate(indexes):
        print(x, nummer)
        satz = texte["texts"][x]
        opinion = input(f"Opinion --- {satz}")
        while opinion not in ["neutral", "positive", "negative", "exit"]:
            opinion = input(f"Opinion --- {satz}")    
        sentiment = input(f"Sentiment --- {satz}")
        while sentiment not in ["neutral", "positive", "negative"]:
            sentiment = input(f"Sentiment --- {satz}")
        topic_comment = input("Topic comment")
        topic_article = input("Topic article")


        row = [alle_kommentare.loc[alle_kommentare.Opinion == satz]]
        newdf = newdf.append(row)
        newdf.reset_index(inplace=True, drop=True)
        print(topic_article)
        newdf.iloc[nummer,alle_kommentare.columns.get_loc("Topic_article")] = topic_article
        newdf.iloc[nummer,alle_kommentare.columns.get_loc("topic_comment")] = topic_comment
        newdf.iloc[nummer,alle_kommentare.columns.get_loc("Sentiment")] = sentiment
        newdf.iloc[nummer,alle_kommentare.columns.get_loc("Comment")] = satz
        newdf.iloc[nummer,alle_kommentare.columns.get_loc("Opinion")] = opinion

        
        

    newdf = newdf.drop(columns=list(newdf.columns[-2:]))
    display(newdf)
    newdf.to_csv("annotated_data/active_learning_comments.csv", mode="a", encoding="utf-8-sig", index=False, header=False)
            # df.append(line.split("\t")[0] + "\t" + line.split("\t")[1] + "\t" + line.split("\t")[2] + "\t" + line.split("\t")[3] + "\t" + line.split("\t")[4] + "\t" + opinion + "\t"
            #                       + sentiment + "\t" + klasse1 + "\t" +  klasse2 + "\t" + kommentar + "\n")
        



In [22]:
alle_kommentare = pd.read_csv("shuffled_corona_relevante_kommentare.txt", names=["ID", "Date", "Time", "Comment Level", "Username", "Opinion", "Sentiment", "topic_comment", "Topic_article", "Comment"], 
                                    delimiter="\t", index_col=False, skiprows=1100, nrows=20 )
    
alle_kommentare.columns.get_loc("Topic_article")

for x in alle_kommentare.columns[-2:]:
    print (x)

Topic_article
Comment


In [15]:
get_new_sample_active_learning(20)

['Wenn ich mir diesen Trump so betrachte und sehe dass die Zustimmung  für Donald Trump wächst, kann man sich nur wünschen, dass er und seine Befürworter an covid 19 langsam krepieren. Dann hat der Rest der USA und die Welt ein riesiges Problem weniger.', 'Alle jubeln, Söder hat einen Fehler gemacht. Die CDU ist froh einen lästigen Kanzlerkandidaten los zu sein.', 'Wenn ich mir diesen Trump so betrachte und sehe dass die Zustimmung  für Donald Trump wächst, kann man sich nur wünschen, dass er und seine Befürworter an covid 19 langsam krepieren. Dann hat der Rest der USA und die Welt ein riesiges Problem weniger.', 'Alle jubeln, Söder hat einen Fehler gemacht. Die CDU ist froh einen lästigen Kanzlerkandidaten los zu sein.']
0      True
1      True
2      True
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16    False
17     True
18     True
19     True
Name: Opinion, dtype: bool


Unnamed: 0,ID,Date,Time,Comment Level,Username,Opinion,Sentiment,topic_comment,Topic_article,Comment
0,b83f12db-f415-4eaf-a960-2c1c2f484791,2020-03-30,11:02,0,dirk-a7uDe1XWR,"Gechlossene Wissenslücke Nun wissen es alle, w...",,,,
1,ef3ed5b3-f09a-418e-80ee-dbc95df97cb0,2021-04-29,16:49,0,Nelchen,„ Erleichterungen gerade für ältere Menschen E...,,,,
2,98736fba-91ef-4b60-9b97-87695b410138,2020-12-25,12:22,0,shivamich,"Wenn ich sowas lese, frage ich mich umso mehr,...",,,,
4,02fdea13-f70c-4c73-8970-97db86b98840,2020-03-28,22:16,0,Reiner-7pPeyMPZg,"Da die US-Amerikaner sehr auf ihren Rechten, s...",,,,
5,f9736a44-470c-4b06-9688-78f690927615,2020-12-03,17:01,0,René-aezmU-QWg,Wann legen wir die selbstherrlichen Regionalfü...,,,,
6,c449c319-bd29-47f8-87fb-9ec76d99c967,2021-03-15,14:39,0,Sonic,Auch da wieder: Warum braucht Deutschland mehr...,,,,
7,c6b4377d-e8e6-40d1-a967-fd9ada3b1453,2020-08-25,11:33,0,Wilfried-zvyAH0EZg,Schulleitungsvereinigung. Was es nicht alles g...,,,,
8,8ac89f4f-ac0b-4063-8dd7-db8002fa4f30,2021-03-28,21:36,0,Wilfried-8dD6JHiMg,Die Dame hatte die Wahl: - als Heldin: den P...,,,,
9,76aab31b-ad5f-4b85-9055-7d82a53831f5,2021-05-15,09:51,0,MitStaunenUndZittern,"Ich finde es mutig, von der Politik zu fordern...",,,,
10,c2bca4d7-c8b7-49a6-9cb0-6db073f7ee12,2021-02-16,18:06,0,Ratzeputz,Ich kann nicht mehr vor Lachen. Wir haben in S...,,,,


100%|██████████| 1/1 [00:00<00:00, 201.63ba/s]
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: texts.
***** Running Prediction *****
  Num examples = 18
  Batch size = 1
91it [04:17,  1.12it/s]

tensor(2) 0


92it [04:29,  1.12it/s]

KeyboardInterrupt: Interrupted by user