# Validation

In this part we validate our topic models and our sentiment analysis, following the human evaluation tasks (word intrusion and topic intrusion) from "Reading Tea Leaves: How Humans Interpret Topic Models" (Chang et al., 2009).

In [6]:
from bertopic import BERTopic

from sklearn.metrics import cohen_kappa_score

from functools import partial

import pandas as pd

import ipywidgets as widgets
from IPython.display import clear_output
from ipywidgets import IntProgress

from tqdm.notebook import tqdm
tqdm.pandas()

* Einlesen des Output von gensim
* Umformen in Struktur für Ausgabe
* Darstellen des Outputs
* Sammeln der Daten
* Berechnen des Scores

# 1. Load model

In [7]:
# Load the BERTopic model stored under ../models/bertopic_tweets.
# NOTE(review): topic_model is not referenced by the cells visible below — confirm it is still needed.
topic_model = BERTopic.load("../models/bertopic_tweets")

# 2. Import word intrusion

# 3. Word intrusion test

In [3]:
# Input DataFrame with positions and words

In [8]:
# Toy input for the word-intrusion test. Position k of every list belongs to
# the same item: word_0..word_5 are the six candidate words shown as buttons,
# intruder_word is the word that is later compared with the annotator's choice.
test_dic = {"word_0": ["essen","deutschland","europa", "year"],
            "word_1": ["hunger","außenpolitik","europäer", "lol"],
            "word_2": ["currywurst","außenpolitisch","gemeinsam", "nice"],
            "word_3": ["kochen","land","handlungsfähig", "weak"],
            "word_4": ["pizza","brauchen","handlungsfähig", "hehe"],
            "word_5": ["salat","veränderung","handlungsfähig", "abcd"],
            "intruder_word": ["essen","deutschland","europa", "abcd"]}

In [9]:
# One row per word set, one column per key of test_dic.
df = pd.DataFrame(test_dic)

In [11]:
# Number of rows in the toy frame.
# NOTE(review): the name is misspelled ("lenght") and no later visible cell reads it.
lenght = df.shape[0]

In [12]:
def generate_annotator_set(df, number_label, number_iaa, name_1, name_2, random_state=None):
    """Shuffle ``df`` and assign its rows to two annotators.

    After shuffling, the rows are split into consecutive segments:

    1. ``number_label`` rows annotated only by ``name_1``,
    2. ``number_iaa`` rows annotated by both (``iaa_flag == 1``),
    3. ``number_label`` rows annotated only by ``name_2``,
    4. all remaining rows, annotated by nobody.

    ``wis_label`` is 0 on the shared segment and 1 on every other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Items to distribute.
    number_label : int
        Number of rows each annotator labels exclusively.
    number_iaa : int
        Number of rows both annotators label (inter-annotator agreement).
    name_1, name_2 : str
        Names of the 0/1 assignment columns, one per annotator.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for a reproducible
        shuffle. The default ``None`` keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``df`` with a fresh index and the columns
        ``name_1``, ``name_2``, ``iaa_flag`` and ``wis_label`` added.

    Raises
    ------
    ValueError
        If a count is negative, the two names are identical, or ``df`` has
        fewer than ``2 * number_label + number_iaa`` rows.
    """
    length = df.shape[0]

    if number_label < 0 or number_iaa < 0:
        raise ValueError("number_label and number_iaa must be non-negative")
    if name_1 == name_2:
        # The second assignment column would silently overwrite the first.
        raise ValueError("name_1 and name_2 must be different column names")

    required = 2 * number_label + number_iaa
    if required > length:
        raise ValueError(
            "df has " + str(length) + " rows but 2 * number_label + number_iaa = "
            + str(required) + " rows are required"
        )
    unassigned = length - required

    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled[name_1] = [1] * (number_label + number_iaa) + [0] * (number_label + unassigned)
    shuffled[name_2] = [0] * number_label + [1] * (number_iaa + number_label) + [0] * unassigned
    shuffled["iaa_flag"] = [0] * number_label + [1] * number_iaa + [0] * (number_label + unassigned)
    shuffled["wis_label"] = [1] * number_label + [0] * number_iaa + [1] * (number_label + unassigned)
    return shuffled

In [13]:
# One exclusive item per annotator plus two shared items for inter-annotator agreement.
word_df = generate_annotator_set(df, 1, 2, "Jakob", "Stjepan")

In [14]:
# Inspect the shuffled assignment table.
word_df

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,intruder_word,Jakob,Stjepan,iaa_flag,wis_label
0,essen,hunger,currywurst,kochen,pizza,salat,essen,1,0,0,1
1,year,lol,nice,weak,hehe,abcd,abcd,1,1,1,0
2,europa,europäer,gemeinsam,handlungsfähig,handlungsfähig,handlungsfähig,europa,1,1,1,0
3,deutschland,außenpolitik,außenpolitisch,land,brauchen,veränderung,deutschland,0,1,0,1


In [15]:
def word_intrusion_test(word_df, name):
    """Run an interactive word-intrusion annotation session for one annotator.

    The rows of ``word_df`` whose ``name`` column equals 1 are shown one at a
    time as six buttons (columns ``word_0`` .. ``word_5``). Each click records
    the clicked word and its button position and advances to the next row.
    After the last row, the answers are stored in the columns ``chosen_word``
    and ``chosen_position`` and the frame is written to
    ``../data/processed/word_intrusion_test_<name>.csv``.

    Parameters
    ----------
    word_df : pandas.DataFrame
        Frame with the columns ``word_0`` .. ``word_5`` and a 0/1 column
        named ``name``.
    name : str
        Annotator name; selects the assignment column and the output file.

    Returns
    -------
    pandas.DataFrame
        The annotator's rows with a fresh index. The frame is returned
        immediately; the answer columns are added to this same object once
        the last item has been answered.
    """
    intrusion_df = word_df[word_df[name] == 1].reset_index(drop=True)
    max_count = intrusion_df.shape[0]
    word_columns = ["word_" + str(k) for k in range(6)]

    # Session state lives in this closure (not in a module-level global), so
    # several sessions can be open at once without resetting each other.
    chosen_words = []
    chosen_positions = []

    progress = IntProgress(min=0, max=max_count)

    def show_header():
        display("Word Intrusion Test")
        display(progress)

    def show_item(index):
        """Render the header and the six word buttons for row ``index``."""
        show_header()
        for position, column in enumerate(word_columns):
            button = widgets.Button(description=intrusion_df[column].iloc[index])
            button.on_click(partial(btn_eventhandler, index, position))
            display(button)

    def btn_eventhandler(index, position, obj):
        # Ignore clicks on an item that was already answered (e.g. a double
        # click before the output is refreshed); otherwise the answer lists
        # would grow longer than the frame.
        if index != len(chosen_words):
            return

        chosen_words.append(obj.description)
        chosen_positions.append(position)
        progress.value += 1

        clear_output(wait=True)

        if len(chosen_words) < max_count:
            show_item(len(chosen_words))
        else:
            show_header()
            print("Thanks " + name + " you finished all the work!")
            intrusion_df["chosen_word"] = chosen_words
            intrusion_df["chosen_position"] = chosen_positions
            intrusion_df.to_csv("../data/processed/word_intrusion_test_" + name + ".csv", index=False)

    if max_count == 0:
        # Nothing is assigned to this annotator: there is no first row to show.
        show_header()
        print("No items to annotate for " + name + ".")
        return intrusion_df

    show_item(0)
    return intrusion_df

In [16]:
# Start Jakob's session. The returned frame only receives the answer columns
# after the last item has been clicked.
df_word_intrusion_jakob = word_intrusion_test(word_df, "Jakob")

'Word Intrusion Text'

IntProgress(value=2, max=3)

Thanks Jakob you finished all the work!


In [17]:
# Start Stjepan's session. The returned frame only receives the answer columns
# after the last item has been clicked.
df_word_intrusion_stjepan = word_intrusion_test(word_df, "Stjepan")

'Word Intrusion Text'

IntProgress(value=2, max=3)

Thanks Stjepan you finished all the work!


In [18]:
def calculate_word_intrusion(name_1, name_2):
    """Compute the word-intrusion score and the inter-annotator agreement.

    Reads ``../data/processed/word_intrusion_test_<name>.csv`` for both
    annotators.

    * Agreement: Cohen's kappa between the ``chosen_position`` values of the
      rows with ``iaa_flag == 1``. The two files are paired by row order, so
      the shared rows must appear in the same order in both.
    * Score: over the rows of both files with ``wis_label == 1``, the share
      of rows where ``chosen_word`` equals ``intruder_word``.

    Parameters
    ----------
    name_1, name_2 : str
        Annotator names used in the CSV file names.

    Returns
    -------
    tuple
        ``(intrusion_score, kappa)``.
    """
    annotations_1 = pd.read_csv("../data/processed/word_intrusion_test_" + name_1 + ".csv")
    annotations_2 = pd.read_csv("../data/processed/word_intrusion_test_" + name_2 + ".csv")

    shared_1 = annotations_1.loc[annotations_1["iaa_flag"] == 1, "chosen_position"].to_numpy()
    shared_2 = annotations_2.loc[annotations_2["iaa_flag"] == 1, "chosen_position"].to_numpy()
    kappa = cohen_kappa_score(shared_1, shared_2)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    combined = pd.concat([annotations_1, annotations_2], ignore_index=True)
    scored = combined[combined["wis_label"] == 1]
    # Compare into a standalone Series rather than writing a new column onto
    # the filtered slice, which triggers SettingWithCopyWarning.
    intruder_chosen = scored["intruder_word"] == scored["chosen_word"]
    return intruder_chosen.mean(), kappa

In [19]:
# Reads both annotators' CSV files written by the word-intrusion sessions.
intrusion_score, kappa = calculate_word_intrusion("Jakob", "Stjepan")

In [20]:
# Share of wis_label == 1 items where the chosen word equals the intruder word.
intrusion_score

0.5

In [21]:
# Cohen's kappa between the two annotators on the iaa_flag == 1 items.
kappa

-0.33333333333333326

# 4. Topic Intrusion

In [None]:
# Toy input for the topic-intrusion test. Position k of every list belongs to
# the same item: "text" is shown to the annotator and topic_0..topic_5 are the
# six button labels.
# NOTE(review): intruder_topic is not read by any code visible in this notebook yet.
topic_dic = {"topic_0": ["essen","deutschland","europa"],
            "topic_1": ["hunger","außenpolitik","europäer"],
            "topic_2": ["currywurst","außenpolitisch","gemeinsam"],
            "topic_3": ["kochen","land","handlungsfähig"],
            "topic_4": ["pizza","brauchen","handlungsfähig"],
            "topic_5": ["salat","veränderung","handlungsfähig"],
            "text": ["Text ist cool", "for school", "hello"],
            "intruder_topic": ["essen","deutschland","europa"]}

In [None]:
# One row per text, one column per key of topic_dic.
topic_df = pd.DataFrame(topic_dic)

In [None]:
# Inspect the toy topic-intrusion frame.
topic_df

In [None]:
def topic_intrusion_test(topic_df, name):
    """Run an interactive topic-intrusion annotation session for one annotator.

    Every row of ``topic_df`` is shown in turn: its ``text`` value followed
    by six buttons labelled with ``topic_0`` .. ``topic_5``. Each click
    records the clicked label and advances to the next row. After the last
    row, the answers are stored in the column ``choosen_topic`` and the frame
    is written to ``../data/processed/topic_intrusion_test_<name>.csv``.

    Parameters
    ----------
    topic_df : pandas.DataFrame
        Frame with the columns ``text`` and ``topic_0`` .. ``topic_5``.
        It is modified in place when the session finishes.
    name : str
        Annotator name; used in the closing message and the output file.

    Returns
    -------
    pandas.DataFrame
        ``topic_df`` itself. It is returned immediately; the answer column
        is added once the last item has been answered.
    """
    # Count the rows of the frame being annotated, not of an unrelated
    # notebook-level frame: otherwise the session runs past the last row.
    max_count = topic_df.shape[0]
    topic_columns = ["topic_" + str(k) for k in range(6)]

    # Session state lives in this closure (not in a module-level global), so
    # several sessions can be open at once without resetting each other.
    chosen_elements = []

    progress = IntProgress(min=0, max=max_count)

    def show_header():
        display("Topic Intrusion Test")
        display(progress)

    def show_item(index):
        """Render the header, the text and the six topic buttons for row ``index``."""
        show_header()
        display(topic_df["text"].iloc[index])
        for column in topic_columns:
            button = widgets.Button(description=topic_df[column].iloc[index])
            button.on_click(partial(btn_eventhandler, index))
            display(button)

    def btn_eventhandler(index, obj):
        # Ignore clicks on an item that was already answered (e.g. a double
        # click before the output is refreshed).
        if index != len(chosen_elements):
            return

        chosen_elements.append(obj.description)
        progress.value += 1

        clear_output(wait=True)

        if len(chosen_elements) < max_count:
            show_item(len(chosen_elements))
        else:
            show_header()
            print("Thanks " + name + " you finished all the work!")
            topic_df["choosen_topic"] = chosen_elements
            topic_df.to_csv("../data/processed/topic_intrusion_test_" + name + ".csv", index=False)

    if max_count == 0:
        # Empty frame: there is no first row to show.
        show_header()
        print("No items to annotate for " + name + ".")
        return topic_df

    show_item(0)
    return topic_df

In [None]:
# Start the topic-intrusion session. The returned frame is topic_df itself and
# only receives the answer column after the last item has been clicked.
topic_results = topic_intrusion_test(topic_df, "Jakob")

In [None]:
# Inspect the topic-intrusion frame (run after the session is finished to see the answers).
topic_results

In [None]:
def calculate_topic_intrusion(df_topic_intrusion):
    """Placeholder for the topic-intrusion score.

    Not implemented yet: the argument is returned unchanged.
    """
    # TODO: calculate the mean of the difference between the log likelihood of
    # the true intruder topic and that of the chosen intruder topic.
    return df_topic_intrusion

# 6. Sentiment Analysis Gold Standard

In [None]:
def sentiment_gold_dictionary(sentiment_df, name):
    """Run an interactive sentiment-labelling session for one annotator.

    Every row of ``sentiment_df`` is shown in turn: its ``text`` value
    followed by the buttons "Positive", "Neutral" and "Negative". Each click
    records the clicked label and advances to the next row. After the last
    row, the answers are stored in the column ``choosen_sentiment`` and the
    frame is written to
    ``../data/processed/sentiment_gold_standard_<name>.csv``.

    Parameters
    ----------
    sentiment_df : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place when the
        session finishes.
    name : str
        Annotator name; used in the closing message and the output file.

    Returns
    -------
    pandas.DataFrame
        ``sentiment_df`` itself. It is returned immediately; the answer
        column is added once the last item has been answered.
    """
    # Count the rows of the frame being annotated, not of an unrelated
    # notebook-level frame: otherwise the session length is wrong.
    max_count = sentiment_df.shape[0]
    labels = ["Positive", "Neutral", "Negative"]

    # Session state lives in this closure (not in a module-level global), so
    # several sessions can be open at once without resetting each other.
    chosen_elements = []

    progress = IntProgress(min=0, max=max_count)

    def show_header():
        display("Sentiment Gold Standard")
        display(progress)

    def show_item(index):
        """Render the header, the text and the three label buttons for row ``index``."""
        show_header()
        display(sentiment_df["text"].iloc[index])
        for label in labels:
            button = widgets.Button(description=label)
            button.on_click(partial(btn_eventhandler, index))
            display(button)

    def btn_eventhandler(index, obj):
        # Ignore clicks on an item that was already answered (e.g. a double
        # click before the output is refreshed).
        if index != len(chosen_elements):
            return

        chosen_elements.append(obj.description)
        progress.value += 1

        clear_output(wait=True)

        if len(chosen_elements) < max_count:
            show_item(len(chosen_elements))
        else:
            show_header()
            print("Thanks " + name + " you finished all the work!")
            sentiment_df["choosen_sentiment"] = chosen_elements
            sentiment_df.to_csv("../data/processed/sentiment_gold_standard_" + name + ".csv", index=False)

    if max_count == 0:
        # Empty frame: there is no first row to show.
        show_header()
        print("No items to annotate for " + name + ".")
        return sentiment_df

    show_item(0)
    return sentiment_df

In [None]:
# Toy texts for the sentiment gold-standard session.
sentiment_dic = {"text": ["Ich liebe dich", "Ich hasse dich", "Ich neutrale dich"]}

In [None]:
# One row per text.
sentiment_df = pd.DataFrame(sentiment_dic)

In [None]:
# Start the labelling session. The function returns the frame it was given, so
# this rebinds sentiment_df to the same object; the answer column is added
# after the last item has been clicked.
sentiment_df = sentiment_gold_dictionary(sentiment_df, "Jakob")

In [None]:
# Inspect the labelled frame (run after the session is finished to see the answers).
sentiment_df