# Validation

In this part we will validate our topic models and sentiment analysis based on Reading Tea Leaves: How Humans Interpret Topic Models 

In [326]:
from bertopic import BERTopic

import pickle

from sklearn.metrics import cohen_kappa_score

from functools import partial

import pandas as pd

import ipywidgets as widgets
from IPython.display import clear_output
from ipywidgets import IntProgress

import random
import numpy as np

from tqdm.notebook import tqdm
tqdm.pandas()

* Einlesen des Output von gensim
* Umformen in Struktur für Ausgabe
* Darstellen des Outputs
* Sammeln der Daten
* Berechnen des Scores

# 1. Load model

In [2]:
topic_model = BERTopic.load("../models/bertopic_tweets")

In [151]:
speeches_processed = pickle.load(open( "../data/processed/speeches_processed.p", "rb" ))

# 2. Word intrusion

## 2.1 Prepare word intrusion

In [None]:
def choose_random_document(index):
    rand_document = random.randrange(-1, 99)
    if rand_document != index:
        return rand_document 
    else:
        return choose_random_document(index)

In [None]:
for i in range(100):
    if choose_random_document(i) == None:
        print("None value in index: " + str(i))

In [None]:
records_list = []
for i in range(100): 
    word_list = []
    for j in range(5):
        word_list.append(topic_model.get_topic(i)[j][0])
    intruder_word = topic_model.get_topic(choose_random_document(i))[0][0]
    intruder_position = random.randrange(4)
    word_list.insert(intruder_position, intruder_word)
    word_list.append(intruder_word)
    word_list.append(intruder_position)
    records_list.append(word_list)
df = pd.DataFrame.from_records(records_list)
df.columns = ["word_0", "word_1", "word_2", "word_3", "word_4", "word_5", "intruder_word", "intruder_index"]

In [None]:
def generate_annotator_set(df, number_label, number_iaa, name_1, name_2):
    length = df.shape[0]
    if 2*number_label + number_iaa > length:
        print("Too many labels for the size of the dataframe")
    df_shuffeled = df.sample(frac=1).reset_index(drop=True)
    df_shuffeled[name_1] = [1] * (number_label+number_iaa) + [0] * (length-number_label-number_iaa)
    df_shuffeled[name_2] = [0] * (number_label) + [1] * (number_label+number_iaa) + [0] * (length-2*number_label-number_iaa)
    df_shuffeled["iaa_flag"] = [0] * number_label + [1] * number_iaa + [0] * (length-number_label-number_iaa)
    df_shuffeled["wis_label"] = [1] * number_label + [0] * number_iaa + [1] * (length-number_label-number_iaa)
    return df_shuffeled

In [None]:
word_df = generate_annotator_set(df, 10, 5, "Jakob", "Stjepan")

In [None]:
def word_intrusion_test(word_df, name):
    intrusion_df = word_df[word_df[name] == 1].reset_index(drop = True)
    
    max_count = intrusion_df.shape[0]
    global i
    i = 0
    
    button_0 = widgets.Button(description = intrusion_df.word_0[i])
    button_1 = widgets.Button(description = intrusion_df.word_1[i])
    button_2 = widgets.Button(description = intrusion_df.word_2[i])
    button_3 = widgets.Button(description = intrusion_df.word_3[i])
    button_4 = widgets.Button(description = intrusion_df.word_4[i])
    button_5 = widgets.Button(description = intrusion_df.word_5[i])


    chosen_words = []
    chosen_positions= []

    display("Word Intrusion Test")

    f = IntProgress(min=0, max=max_count)
    display(f)

    display(button_0)
    display(button_1)
    display(button_2)
    display(button_3)
    display(button_4)
    display(button_5)


    def btn_eventhandler(position, obj):
        global i 
        i += 1
        
        
        clear_output(wait=True)
        
        display("Word Intrusion Text")
        display(f)
        f.value += 1
        
        choosen_text = obj.description
        chosen_words.append(choosen_text)
        
        chosen_positions.append(position)
        
        if i < max_count:

            button_0 = widgets.Button(description = intrusion_df.word_0[i])
            button_1 = widgets.Button(description = intrusion_df.word_1[i])
            button_2 = widgets.Button(description = intrusion_df.word_2[i])
            button_3 = widgets.Button(description = intrusion_df.word_3[i])
            button_4 = widgets.Button(description = intrusion_df.word_4[i])
            button_5 = widgets.Button(description = intrusion_df.word_5[i])
            
            display(button_0)
            display(button_1)
            display(button_2)
            display(button_3)
            display(button_4)
            display(button_5)
            
            button_0.on_click(partial(btn_eventhandler,0))
            button_1.on_click(partial(btn_eventhandler,1))
            button_2.on_click(partial(btn_eventhandler,2))
            button_3.on_click(partial(btn_eventhandler,3))
            button_4.on_click(partial(btn_eventhandler,4))
            button_5.on_click(partial(btn_eventhandler,5))
        else:
            print ("Thanks " + name + " you finished all the work!")
            intrusion_df["chosen_word"] = chosen_words
            intrusion_df["chosen_position"] = chosen_positions
            intrusion_df.to_csv("../data/processed/word_intrusion_test_" + name + ".csv", index = False)



    button_0.on_click(partial(btn_eventhandler,0))
    button_1.on_click(partial(btn_eventhandler,1))
    button_2.on_click(partial(btn_eventhandler,2))
    button_3.on_click(partial(btn_eventhandler,3))
    button_4.on_click(partial(btn_eventhandler,4))
    button_5.on_click(partial(btn_eventhandler,5))
    
    return intrusion_df

## 2.2 Execute word intrusion

In [None]:
df_word_intrusion_jakob = word_intrusion_test(word_df, "Jakob")

In [None]:
df_word_intrusion_stjepan = word_intrusion_test(word_df, "Stjepan")

## 2.3 Evaluate word intrusion

In [None]:
def calculate_word_intrusion(name_1, name_2):
    df_word_intrusion_1 = pd.read_csv("../data/processed/word_intrusion_test_" + name_1 + ".csv")
    df_word_intrusion_2 = pd.read_csv("../data/processed/word_intrusion_test_" + name_2 + ".csv")
    iaa_values_1 = df_word_intrusion_1[df_word_intrusion_1.iaa_flag == 1].chosen_position.values
    iaa_values_2 = df_word_intrusion_2[df_word_intrusion_2.iaa_flag == 1].chosen_position.values
    kappa = cohen_kappa_score(iaa_values_1, iaa_values_2)
    df_word_intrusion = df_word_intrusion_1.append(df_word_intrusion_2)
    df_word = df_word_intrusion[df_word_intrusion["wis_label"] == 1]
    df_word["intruder_chosen"] = df_word["intruder_word"] == df_word["chosen_word"]
    return  df_word["intruder_chosen"].mean(), kappa

In [None]:
intrusion_score, kappa = calculate_word_intrusion("Jakob", "Stjepan")

In [None]:
intrusion_score

In [None]:
kappa

# 3. Topic Intrusion

## 3.1 Prepare topic intrusion

In [15]:
with open('../data/processed/probabilities_speeches.pickle', 'rb') as handle:
    probabilities_speeches = pickle.load(handle)

In [127]:
def choose_random_topic(index):
    rand_document = random.randrange(-1, 99)
    if rand_document != index:
        return rand_document 
    else:
        return choose_random_document(index)

In [80]:
def create_topic_string(topic_info):
    word_list = []
    for i in range(8):
        word_list.append(topic_info[i][0])
    return ", ".join(word_list)

In [251]:
records_list = []
for i in range(100): 
    topic_list = []
    high_probability_documents = sorted(zip(probabilities_speeches[i].tolist(), list(range(25))), reverse=True)[:3]
    low_probability_documents = sorted(zip(probabilities_speeches[i].tolist(), list(range(25))), reverse=True)[3:]
    for j in range(3):
        topic_index = high_probability_documents[j][1]
        topic_list.append(create_topic_string(topic_model.get_topic(topic_index)))
    intruder_document = low_probability_documents[random.randrange(22)]
    intruder_topic = create_topic_string(topic_model.get_topic(intruder_document[1]))
    intruder_position = random.randrange(4)
    topic_list.insert(intruder_position, intruder_topic)
    for j in range(3):
        topic_index = high_probability_documents[j][1]
        topic_list.append(high_probability_documents[j][0])
    topic_list.insert(intruder_position+4, intruder_document[0])
    topic_list.append(intruder_topic)
    topic_list.append(intruder_document[0])
    topic_list.append(intruder_position)
    topic_list.append(speeches_processed["text_preprocessed_sentence"][i])
    records_list.append(topic_list)
df = pd.DataFrame.from_records(records_list)
df.columns = ["topic_0", "topic_1", "topic_2", "topic_3","probability_topic_0","probability_topic_1",
              "probability_topic_2","probability_topic_3", "intruder_topic", "intruder_topic_probability",
              "intruder_index", "text"]

In [253]:
def generate_annotator_set(df, number_label, number_iaa, name_1, name_2):
    length = df.shape[0]
    if 2*number_label + number_iaa > length:
        print("Too many labels for the size of the dataframe")
    df_shuffeled = df.sample(frac=1).reset_index(drop=True)
    df_shuffeled[name_1] = [1] * (number_label+number_iaa) + [0] * (length-number_label-number_iaa)
    df_shuffeled[name_2] = [0] * (number_label) + [1] * (number_label+number_iaa) + [0] * (length-2*number_label-number_iaa)
    df_shuffeled["iaa_flag"] = [0] * number_label + [1] * number_iaa + [0] * (length-number_label-number_iaa)
    df_shuffeled["wis_label"] = [1] * number_label + [0] * number_iaa + [1] * (length-number_label-number_iaa)
    return df_shuffeled

In [268]:
topic_df = generate_annotator_set(df, 10, 5, "Jakob", "Stjepan")

In [269]:
topic_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,probability_topic_0,probability_topic_1,probability_topic_2,probability_topic_3,intruder_topic,intruder_topic_probability,intruder_index,text,Jakob,Stjepan,iaa_flag,wis_label
0,"wahl, kandidat, glückwunsch, herzliche, wählen...","tweet, twitter, tweets, twittern, ironie, lese...","polizei, polizist, polizistin, bundespolizei, ...","deutschland, deutsch, außenpolitik, deutsche, ...",0.454786,0.145002,0.034918,0.022909,"deutschland, deutsch, außenpolitik, deutsche, ...",0.022909,3,dame herr moment warten herzlich willkommen ge...,1,0,0,1
1,"antisemitismus, israel, jude, jüdisch, antisem...","tweet, twitter, tweets, twittern, ironie, lese...","tb, hoffentlich, abwarten, warten, finden, war...","digitalisierung, digital, digitale, digitalpak...",0.197229,0.138609,0.004080,0.044783,"tb, hoffentlich, abwarten, warten, finden, war...",0.004080,2,sorge netzwerke angst bußgeldern löschen könne...,1,0,0,1
2,"tweet, twitter, tweets, twittern, ironie, lese...","steuer, einkommen, soli, steuerzahler, zahlen,...","migration, einwanderungsgesetz, migranten, ein...","antisemitismus, israel, jude, jüdisch, antisem...",0.086161,0.015146,0.012761,0.006689,"antisemitismus, israel, jude, jüdisch, antisem...",0.006689,3,herr präsident herr minister lage nordostasien...,1,0,0,1
3,"glückwunsch, herzliche, erfolg, zusammenarbeit...","tweet, twitter, tweets, twittern, ironie, lese...","verfassungsschutz, bundesverfassungsgericht, v...","kaffee, deutsch, tee, kommune, fein, fertigen,...",0.663484,0.059342,0.009290,0.018660,"verfassungsschutz, bundesverfassungsgericht, v...",0.009290,2,glauben reden parlament reichen stelle ausdrüc...,1,0,0,1
4,"polizei, polizist, polizistin, bundespolizei, ...","tweet, twitter, tweets, twittern, ironie, lese...","frage, beantworten, fragen, mail, antworten, v...","wahl, kandidat, glückwunsch, herzliche, wählen...",0.181578,0.175177,0.037731,0.025974,"wahl, kandidat, glückwunsch, herzliche, wählen...",0.025974,3,geehrt frau präsidentin geehrt dame herr liebe...,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"tweet, twitter, tweets, twittern, ironie, lese...","rassismus, rassistisch, hanau, rassisten, biel...","deutschland, deutsch, außenpolitik, deutsche, ...","wahl, kandidat, glückwunsch, herzliche, wählen...",0.335955,0.000956,0.003171,0.002163,"rassismus, rassistisch, hanau, rassisten, biel...",0.000956,1,bitte,0,0,0,1
96,"tweet, twitter, tweets, twittern, ironie, lese...","digitalisierung, digital, digitale, digitalpak...","deutschland, deutsch, außenpolitik, deutsche, ...","polizei, polizist, polizistin, bundespolizei, ...",0.172338,0.021082,0.024131,0.023502,"digitalisierung, digital, digitale, digitalpak...",0.021082,1,geehrt herr präsident lieben kollegin kollege ...,0,0,0,1
97,"tweet, twitter, tweets, twittern, ironie, lese...","schule, lehrer, schüler, kind, unterricht, bil...","wahl, kandidat, glückwunsch, herzliche, wählen...","verfassungsschutz, bundesverfassungsgericht, v...",0.160264,0.091347,0.013546,0.046975,"wahl, kandidat, glückwunsch, herzliche, wählen...",0.013546,2,verehrter herr präsident verehren dame herr vo...,0,0,0,1
98,"europa, europawahl, europäisch, europäische, e...","tweet, twitter, tweets, twittern, ironie, lese...","deutschland, deutsch, außenpolitik, deutsche, ...","arbeit, homeoffice, job, arbeitsmarkt, arbeite...",0.197568,0.195617,0.087688,0.014569,"arbeit, homeoffice, job, arbeitsmarkt, arbeite...",0.014569,3,frau präsidentin verehren dame herr gemeinsame...,0,0,0,1


In [332]:
def topic_intrusion_test(intrusion_df, name):
    intrusion_df = intrusion_df[intrusion_df[name] == 1].reset_index(drop = True)
    
    max_count = intrusion_df.shape[0]
    global i
    i = 0

    button_0 = widgets.Button(description = intrusion_df.topic_0[i])
    button_1 = widgets.Button(description = intrusion_df.topic_1[i])
    button_2 = widgets.Button(description = intrusion_df.topic_2[i])
    button_3 = widgets.Button(description = intrusion_df.topic_3[i])
    
    chosen_elements = []
    chosen_positions = []
    chosen_probabilities = []

    display("Topic Intrusion Test")

    f = IntProgress(min=0, max=max_count)
    display(f)
    
    display(intrusion_df.text[i][0:1000])

    display(button_0)
    display(button_1)
    display(button_2)
    display(button_3)


    def btn_eventhandler(position, column, obj):
        
        global i
        
        clear_output(wait=True)
        
        display("Topic Intrusion Text")
        display(f)
        f.value += 1
                
        choosen_text = obj.description
        chosen_elements.append(choosen_text)
        chosen_positions.append(position)
        chosen_probabilities.append(intrusion_df[column][i])
        
        i += 1
        
        if i < max_count:

            button_0 = widgets.Button(description = intrusion_df.topic_0[i])
            button_1 = widgets.Button(description = intrusion_df.topic_1[i])
            button_2 = widgets.Button(description = intrusion_df.topic_2[i])
            button_3 = widgets.Button(description = intrusion_df.topic_3[i])
            
            display(intrusion_df.text[i][0:1000])
            
            display(button_0)
            display(button_1)
            display(button_2)
            display(button_3)
            
            button_0.on_click(partial(btn_eventhandler,0,"probability_topic_0"))
            button_1.on_click(partial(btn_eventhandler,1,"probability_topic_1"))
            button_2.on_click(partial(btn_eventhandler,2,"probability_topic_2"))
            button_3.on_click(partial(btn_eventhandler,3,"probability_topic_3"))
        else:
            print ("Thanks " + name + " you finished all the work!")
            intrusion_df["chosen_topic"] = chosen_elements
            intrusion_df["chosen_position"] = chosen_positions
            intrusion_df["chosen_topic_probability"] = chosen_probabilities
            intrusion_df.to_csv("../data/processed/topic_intrusion_test_" + name + ".csv", index = False)



    button_0.on_click(partial(btn_eventhandler,0,"probability_topic_0"))
    button_1.on_click(partial(btn_eventhandler,1,"probability_topic_1"))
    button_2.on_click(partial(btn_eventhandler,2,"probability_topic_2"))
    button_3.on_click(partial(btn_eventhandler,3,"probability_topic_3"))
    
    return intrusion_df

## 3.2 Execute topic intrusion

In [333]:
df_topic_intrusion_jakob = topic_intrusion_test(topic_df, "Jakob")

'Topic Intrusion Text'

IntProgress(value=14, max=15)

Thanks Jakob you finished all the work!


In [334]:
df_topic_intrusion_stjepan = topic_intrusion_test(topic_df, "Stjepan")

'Topic Intrusion Text'

IntProgress(value=14, max=15)

Thanks Stjepan you finished all the work!


## 3.3 Calculate topic intrusion

In [335]:
def calculate_topic_intrusion(name_1, name_2):
    df_topic_intrusion_1 = pd.read_csv("../data/processed/topic_intrusion_test_" + name_1 + ".csv")
    df_topic_intrusion_2 = pd.read_csv("../data/processed/topic_intrusion_test_" + name_2 + ".csv")
    iaa_values_1 = df_topic_intrusion_1[df_topic_intrusion_1.iaa_flag == 1].chosen_position.values
    iaa_values_2 = df_topic_intrusion_2[df_topic_intrusion_2.iaa_flag == 1].chosen_position.values
    kappa = cohen_kappa_score(iaa_values_1, iaa_values_2)
    df_topic_intrusion = df_topic_intrusion_1.append(df_topic_intrusion_2)
    df_topic = df_topic_intrusion[df_topic_intrusion["wis_label"] == 1]
    df_topic["intruder_score"] = np.log(df_topic["intruder_topic_probability"]) - np.log(df_topic["chosen_topic_probability"])
    return  df_topic["intruder_score"].mean(), kappa

In [336]:
calculate_topic_intrusion("Jakob", "Stjepan")

(-37.489787225464866, 1.0)

# 4. Sentiment Analyis Gold Standard

In [None]:
def sentiment_gold_dictionary(sentiment_df, name):
    max_count = sentiment_df.shape[0]
    global i
    i = 0

    button_0 = widgets.Button(description = "Positive")
    button_1 = widgets.Button(description = "Neutral")
    button_2 = widgets.Button(description = "Negative")
    
    chosen_elements = []

    display("Sentiment Gold Standard")

    f = IntProgress(min=0, max=max_count)
    display(f)
    
    display(sentiment_df.text[i])

    display(button_0)
    display(button_1)
    display(button_2)


    def btn_eventhandler(obj):
        global i 
        i += 1
        
        clear_output(wait=True)
        
        display("Sentiment Gold Standard")
        display(f)
        f.value += 1
                
        choosen_text = obj.description
        chosen_elements.append(choosen_text)
        
        if i < max_count:
            
            display(sentiment_df.text[i])
            
            display(button_0)
            display(button_1)
            display(button_2)
            
            button_0.on_click(btn_eventhandler)
            button_1.on_click(btn_eventhandler)
            button_2.on_click(btn_eventhandler)
            
        else:
            print ("Thanks " + name + " you finished all the work!")
            sentiment_df["choosen_sentiment"] = chosen_elements
            sentiment_df.to_csv("../data/processed/sentiment_gold_standard_" + name + ".csv", index = False)

    button_0.on_click(btn_eventhandler)
    button_1.on_click(btn_eventhandler)
    button_2.on_click(btn_eventhandler)
    
    return sentiment_df

In [None]:
sentiment_dic = {"text": ["Ich liebe dich", "Ich hasse dich", "Ich neutrale dich"]}

In [None]:
sentiment_df = pd.DataFrame(sentiment_dic)

In [None]:
sentiment_df = sentiment_gold_dictionary(sentiment_df, "Jakob")