In [992]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from summarytools import dfSummary
import json
from germansentiment import SentimentModel

In [993]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [994]:
click_data = pd.read_csv('dataset/click_data.csv')
emotional_events = pd.read_csv('dataset/emotional_events.csv')
messages_data = pd.read_csv('dataset/messages_data.csv')
# task_types = pd.read_csv('dataset/task_types.csv')
user_information = pd.read_csv('dataset/user_information.csv')

In [995]:
# Preprocessing Data Types
emotional_events['valence'] = emotional_events['valence'].replace('AO07', '7').astype(float)
emotional_events['arousal'] = emotional_events['arousal'].replace('AO07', '7').astype(float)

In [996]:
# Filtering Data
ee = emotional_events[emotional_events['task_type'] == 'information_finding'].drop(columns=['task_type'])

valid_interactions = ee[['userId', 'task']].drop_duplicates()

md = pd.merge(messages_data, valid_interactions, on=['userId', 'task'], how='inner')
cd = pd.merge(click_data, valid_interactions, on=['userId', 'task'], how='inner')

print(len(ee))
print(len(md))
print(len(cd))


1155
2447
970


# Create Emotion Quartiles

In [997]:
ee['arousal'].unique()

array([ 6., nan,  5.,  3.,  2.,  7.,  4.,  1.])

In [998]:
ee['valence'].unique()

array([ 5., nan,  1.,  3.,  2.,  6.,  4.])

In [999]:
ee.loc[:,'valence_high'] = np.where(ee['valence'] > 3, 1, 0)
ee.loc[:,'arousal_high'] = np.where(ee['arousal'] > 3, 1, 0)

x: valence
y: arousal

|2|4|
|---|---|
|1|3|

In [1000]:
def get_emotion_score(df):
    if df['valence_high'] == 1 and df['arousal_high'] == 1:
        return 4
    elif df['valence_high'] == 1 and df['arousal_high'] == 0:
        return 3
    elif df['valence_high'] == 0 and df['arousal_high'] == 1:
        return 2
    elif df['valence_high'] == 0 and df['arousal_high'] == 0:
        return 1
    else:
        return np.nan
    
ee.loc[:, 'emotion_quartile'] = ee.apply(get_emotion_score, axis=1)

In [1001]:
ee.emotion_quartile.value_counts()

emotion_quartile
1    552
4    293
2    247
3     63
Name: count, dtype: int64

# Erstellung der neuen Features

In [1002]:
ee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1155 entries, 0 to 1634
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                0 non-null      float64
 1   userId            1155 non-null   int64  
 2   task              1155 non-null   int64  
 3   input_type        1155 non-null   object 
 4   input             1150 non-null   object 
 5   input_assessment  0 non-null      float64
 6   confidence        405 non-null    float64
 7   understanding     405 non-null    float64
 8   valence           758 non-null    float64
 9   arousal           758 non-null    float64
 10  task_start        405 non-null    float64
 11  task_end          1155 non-null   float64
 12  task_time         405 non-null    float64
 13  valence_high      1155 non-null   int64  
 14  arousal_high      1155 non-null   int64  
 15  emotion_quartile  1155 non-null   int64  
dtypes: float64(9), int64(5), object(2)
memory usage

In [1003]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2447 entries, 0 to 2446
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userId          2447 non-null   int64 
 1   task            2447 non-null   int64 
 2   message_type    2447 non-null   object
 3   timestamp       2447 non-null   object
 4   input           2447 non-null   object
 5   timestamp_unix  2447 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 114.8+ KB


In [1004]:
md['agent_output'] = md['input'].apply(lambda x: json.loads(x).get('output'))
md['user_input'] = md['input'].apply(lambda x: json.loads(x).get('input'))
md

Unnamed: 0,userId,task,message_type,timestamp,input,timestamp_unix,agent_output,user_input
0,129,4,human,2024-06-25 09:19:21.478092,"{""input"": ""Wie hoch ist die Kitaquote in der S...",1719307161,,Wie hoch ist die Kitaquote in der Stadt Freibu...
1,129,4,agent_finish,2024-06-25 09:19:36.649222,"{""output"": ""Die Anzahl der Kitapl\u00e4tze bet...",1719307176,Die Anzahl der Kitaplätze beträgt 11.201 und d...,
2,129,4,human,2024-06-25 09:20:27.03288,"{""input"": ""Gebe die Anzahl der Kitapl\u00e4tze...",1719307227,,Gebe die Anzahl der Kitaplätze für die Stadt F...
3,129,4,agent_finish,2024-06-25 09:20:38.416896,"{""output"": ""Die Anzahl der Kitapl\u00e4tze f\u...",1719307238,Die Anzahl der Kitaplätze für die Stadt Freibu...,
4,129,4,human,2024-06-25 09:21:19.762643,"{""input"": ""Wie viele Kinder unter 6 Jahren war...",1719307279,,Wie viele Kinder unter 6 Jahren waren 2020 in ...
...,...,...,...,...,...,...,...,...
2442,238,5,agent_finish,2024-07-09 16:42:55.708772,"{""output"": ""In Oberwiehre gab es 2010 die meis...",1720543375,In Oberwiehre gab es 2010 die meisten Altbauwo...,
2443,238,5,human,2024-07-09 16:44:19.09898,"{""input"": ""wie viele Baugenehmigungen wurden i...",1720543459,,wie viele Baugenehmigungen wurden in Oberwiehr...
2444,238,5,agent_finish,2024-07-09 16:44:25.837833,"{""output"": ""In Oberwiehre wurden insgesamt X B...",1720543465,In Oberwiehre wurden insgesamt X Baugenehmigun...,
2445,238,5,human,2024-07-09 16:45:41.986446,"{""input"": ""wie viele Baugenehmigungen sind in...",1720543541,,wie viele Baugenehmigungen sind in Oberwiehre...


In [1005]:
ee.drop(columns='id', inplace=True)
ee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1155 entries, 0 to 1634
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   userId            1155 non-null   int64  
 1   task              1155 non-null   int64  
 2   input_type        1155 non-null   object 
 3   input             1150 non-null   object 
 4   input_assessment  0 non-null      float64
 5   confidence        405 non-null    float64
 6   understanding     405 non-null    float64
 7   valence           758 non-null    float64
 8   arousal           758 non-null    float64
 9   task_start        405 non-null    float64
 10  task_end          1155 non-null   float64
 11  task_time         405 non-null    float64
 12  valence_high      1155 non-null   int64  
 13  arousal_high      1155 non-null   int64  
 14  emotion_quartile  1155 non-null   int64  
dtypes: float64(8), int64(5), object(2)
memory usage: 144.4+ KB


## Building Features Dataframe

In [1006]:
features_df = ee[ee['input_type'].isin(['final_output', 'llm_answer'])].copy()

# beginOrEndOfInteraction determines whether firstly or lastly requested valence/arousal values are considered
features_df['beginOrEndOfInteraction'] = features_df['input_type'].apply(lambda x: 'END' if x == 'final_output' else 'BEGIN')   

features_df = features_df[['userId', 'task', 'beginOrEndOfInteraction', 'valence', 'arousal', 'emotion_quartile']]

features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 758 entries, 0 to 1634
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   userId                   758 non-null    int64  
 1   task                     758 non-null    int64  
 2   beginOrEndOfInteraction  758 non-null    object 
 3   valence                  758 non-null    float64
 4   arousal                  758 non-null    float64
 5   emotion_quartile         758 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 41.5+ KB


## First & Last Prompts and Answers of the Interactions

In [1007]:
md_human_sorted = md[md['message_type'] == 'human'].sort_values(by=['userId', 'task', 'timestamp_unix'])
md_agent_sorted = md[md['message_type'] == 'agent_finish'].sort_values(by=['userId', 'task', 'timestamp_unix'])

first_prompt_per_interaction = md_human_sorted.groupby(['userId', 'task']).first().reset_index()
first_prompt_per_interaction['beginOrEndOfInteraction'] = 'BEGIN'

first_answer_per_interaction = md_agent_sorted.groupby(['userId', 'task']).first().reset_index()
first_answer_per_interaction['beginOrEndOfInteraction'] = 'BEGIN'


last_prompt_per_interaction = md_human_sorted.groupby(['userId', 'task']).last().reset_index()
last_prompt_per_interaction['beginOrEndOfInteraction'] = 'END'

last_answer_per_interaction = md_agent_sorted.groupby(['userId', 'task']).last().reset_index()
last_answer_per_interaction['beginOrEndOfInteraction'] = 'END'

second_last_answer_per_interaction = md_agent_sorted.groupby(['userId', 'task']).nth(-2).reset_index()
second_last_answer_per_interaction['beginOrEndOfInteraction'] = 'END'


first_prompt_per_interaction.info()
# print('\n')
# first_answer_per_interaction.info()
# print('\n')
# last_prompt_per_interaction.info()
# print('\n')
# last_answer_per_interaction.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   userId                   397 non-null    int64 
 1   task                     397 non-null    int64 
 2   message_type             397 non-null    object
 3   timestamp                397 non-null    object
 4   input                    397 non-null    object
 5   timestamp_unix           397 non-null    int64 
 6   agent_output             0 non-null      object
 7   user_input               397 non-null    object
 8   beginOrEndOfInteraction  397 non-null    object
dtypes: int64(3), object(6)
memory usage: 28.0+ KB


## Length of Prompts (in words)

In [1008]:
# Add length of first prompt to features_df
df = first_prompt_per_interaction.copy()
df['first_prompt_length'] = df['user_input'].apply(lambda x: len(x.split()))

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'first_prompt_length']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

# Add length of last prompt to features_df
df = last_prompt_per_interaction.copy()
df['last_prompt_length'] = df['user_input'].apply(lambda x: len(x.split()))

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'last_prompt_length']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')


# average length of prompt in interaction
df = md_human_sorted.copy()
df['avg_prompt_length'] = df['user_input'].apply(lambda x: len(x.split()))

df = df.groupby(['userId', 'task'])['avg_prompt_length'].mean().reset_index()

df['beginOrEndOfInteraction'] = 'END'

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'avg_prompt_length']],
                       on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,avg_prompt_length
0,124,1,END,5.0,6.0,4,,27.0,27.000000
1,124,2,END,1.0,5.0,2,,16.0,16.000000
2,124,3,END,3.0,5.0,2,,7.0,11.500000
3,124,3,BEGIN,3.0,3.0,1,16.0,,
4,124,4,END,1.0,5.0,2,,3.0,7.500000
...,...,...,...,...,...,...,...,...,...
753,238,3,BEGIN,1.0,4.0,2,9.0,,
754,238,4,END,2.0,5.0,2,,12.0,16.500000
755,238,4,BEGIN,2.0,4.0,2,21.0,,
756,238,5,END,1.0,5.0,2,,13.0,12.333333


## Length of Agent Answers (in words)

In [1009]:
# Add length of first prompt to features_df
df = first_answer_per_interaction.copy()
df['first_answer_length'] = df['agent_output'].apply(lambda x: len(x.split()))

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'first_answer_length']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

# Add length of last prompt to features_df
df = last_answer_per_interaction.copy()
df['last_answer_length'] = df['agent_output'].apply(lambda x: len(x.split()))

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'last_answer_length']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')



# average length of agent answer in interaction
df = md_agent_sorted.copy()
df['avg_agent_answer_length'] = df['agent_output'].apply(lambda x: len(x.split()))

df = df.groupby(['userId', 'task'])['avg_agent_answer_length'].mean().reset_index()

df['beginOrEndOfInteraction'] = 'END'

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'avg_agent_answer_length']],
                       on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,avg_prompt_length,first_answer_length,last_answer_length,avg_agent_answer_length
0,124,1,END,5.0,6.0,4,,27.0,27.0,,46.0,46.0
1,124,2,END,1.0,5.0,2,,16.0,16.0,,,
2,124,3,END,3.0,5.0,2,,7.0,11.5,,6.0,10.0
3,124,3,BEGIN,3.0,3.0,1,16.0,,,14.0,,
4,124,4,END,1.0,5.0,2,,3.0,7.5,,20.0,20.0
5,124,4,BEGIN,2.0,2.0,1,12.0,,,20.0,,
6,124,5,END,1.0,5.0,2,,22.0,22.0,,18.0,18.0
7,125,1,END,5.0,7.0,4,,16.0,39.0,,3.0,6.5
8,125,1,BEGIN,5.0,5.0,4,62.0,,,10.0,,
9,125,2,END,6.0,7.0,4,,7.0,13.0,,11.0,9.0


##  Prompt Formulation Time (in seconds)

In [841]:
# (last prompt timestamp) - (2nd last agent answer timestamp)

df = pd.merge(last_prompt_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], second_last_answer_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='inner')
df['formulation_time'] = df['timestamp_unix_x'] - df['timestamp_unix_y']

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'formulation_time']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,first_answer_length,last_answer_length,formulation_time
0,124,1,END,5.0,6.0,4,,27.0,,46.0,
1,124,2,END,1.0,5.0,2,,16.0,,,
2,124,3,END,3.0,5.0,2,,7.0,,6.0,58.0
3,124,3,BEGIN,3.0,3.0,1,16.0,,14.0,,
4,124,4,END,1.0,5.0,2,,3.0,,20.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...
753,238,3,BEGIN,1.0,4.0,2,9.0,,24.0,,
754,238,4,END,2.0,5.0,2,,12.0,,15.0,109.0
755,238,4,BEGIN,2.0,4.0,2,21.0,,37.0,,
756,238,5,END,1.0,5.0,2,,13.0,,11.0,76.0


## Agent Answer Time (in seconds) 

In [842]:
# (first agent answer timestamp) - (first prompt timestamp)
df = pd.merge(first_answer_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], first_prompt_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='inner')
df['first_agent_answer_time'] = df['timestamp_unix_x'] - df['timestamp_unix_y']

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'first_agent_answer_time']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

# (last agent answer timestamp) - (last prompt timestamp)

df = pd.merge(last_answer_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], last_prompt_per_interaction[['userId', 'task', 'timestamp_unix', 'beginOrEndOfInteraction']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='inner')
df['last_agent_answer_time'] = df['timestamp_unix_x'] - df['timestamp_unix_y']

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'last_agent_answer_time']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,first_answer_length,last_answer_length,formulation_time,first_agent_answer_time,last_agent_answer_time
0,124,1,END,5.0,6.0,4,,27.0,,46.0,,,16.0
1,124,2,END,1.0,5.0,2,,16.0,,,,,
2,124,3,END,3.0,5.0,2,,7.0,,6.0,58.0,,8.0
3,124,3,BEGIN,3.0,3.0,1,16.0,,14.0,,,15.0,
4,124,4,END,1.0,5.0,2,,3.0,,20.0,54.0,,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,238,3,BEGIN,1.0,4.0,2,9.0,,24.0,,,28.0,
754,238,4,END,2.0,5.0,2,,12.0,,15.0,109.0,,4.0
755,238,4,BEGIN,2.0,4.0,2,21.0,,37.0,,,5.0,
756,238,5,END,1.0,5.0,2,,13.0,,11.0,76.0,,7.0


## Prompt Count per Interaction

In [843]:
df = md.copy()

df = df[df['message_type'] == 'human'].groupby(['userId', 'task']).size().reset_index(name='number_of_prompts')
df['beginOrEndOfInteraction'] = 'END'

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'number_of_prompts']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,first_answer_length,last_answer_length,formulation_time,first_agent_answer_time,last_agent_answer_time,number_of_prompts
0,124,1,END,5.0,6.0,4,,27.0,,46.0,,,16.0,1.0
1,124,2,END,1.0,5.0,2,,16.0,,,,,,1.0
2,124,3,END,3.0,5.0,2,,7.0,,6.0,58.0,,8.0,2.0
3,124,3,BEGIN,3.0,3.0,1,16.0,,14.0,,,15.0,,
4,124,4,END,1.0,5.0,2,,3.0,,20.0,54.0,,9.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,238,3,BEGIN,1.0,4.0,2,9.0,,24.0,,,28.0,,
754,238,4,END,2.0,5.0,2,,12.0,,15.0,109.0,,4.0,2.0
755,238,4,BEGIN,2.0,4.0,2,21.0,,37.0,,,5.0,,
756,238,5,END,1.0,5.0,2,,13.0,,11.0,76.0,,7.0,3.0


## Number of Click Events per Interaction

In [844]:
df = cd.copy()
print(f'Click Event Types: {df['event_type'].unique()}')

# Number of Clicks & Maximizations on Tables
df = df[(df['event_type'] == 'click_table') | (df['event_type'] == 'maximize_table')].groupby(['userId', 'task']).size().reset_index(name='number_of_clicks_on_tables')
df['beginOrEndOfInteraction'] = 'END'

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'number_of_clicks_on_tables']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

# Number of Clicks on Steps & Descriptions
df = cd.copy()
df = df[(df['event_type'] == 'click_step') | (df['event_type'] == 'click_description')].groupby(['userId', 'task']).size().reset_index(name='number_of_clicks_on_steps_and_descriptions')
df['beginOrEndOfInteraction'] = 'END'

features_df = pd.merge(features_df, df[['userId', 'task', 'beginOrEndOfInteraction', 'number_of_clicks_on_steps_and_descriptions']], on=['userId', 'task', 'beginOrEndOfInteraction'], how='left')

features_df

Click Event Types: ['click_table' 'maximize_table' 'click_step' 'click_description']


Unnamed: 0,userId,task,beginOrEndOfInteraction,valence,arousal,emotion_quartile,first_prompt_length,last_prompt_length,first_answer_length,last_answer_length,formulation_time,first_agent_answer_time,last_agent_answer_time,number_of_prompts,number_of_clicks_on_tables,number_of_clicks_on_steps_and_descriptions
0,124,1,END,5.0,6.0,4,,27.0,,46.0,,,16.0,1.0,,
1,124,2,END,1.0,5.0,2,,16.0,,,,,,1.0,,
2,124,3,END,3.0,5.0,2,,7.0,,6.0,58.0,,8.0,2.0,,
3,124,3,BEGIN,3.0,3.0,1,16.0,,14.0,,,15.0,,,,
4,124,4,END,1.0,5.0,2,,3.0,,20.0,54.0,,9.0,2.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,238,3,BEGIN,1.0,4.0,2,9.0,,24.0,,,28.0,,,,
754,238,4,END,2.0,5.0,2,,12.0,,15.0,109.0,,4.0,2.0,,
755,238,4,BEGIN,2.0,4.0,2,21.0,,37.0,,,5.0,,,,
756,238,5,END,1.0,5.0,2,,13.0,,11.0,76.0,,7.0,3.0,,


## Technical Error 

## No answer given

In [40]:
# conversation ended in a technical error
df = pd.merge(ee, md, left_on=['userId', 'task'], right_on=['userId', 'task'], how='left')

In [41]:
df['no_answer_given'] = df['agent_output'] == "Leider habe ich keine Antwort auf diese Frage. Versuchen Sie bitte Ihre Frage anders zu formulieren, oder komplexe Fragestellungen in einfachere Fragen zu unterteilen."
print(df.no_answer_given.value_counts())

no_answer_given
False    6812
True      506
Name: count, dtype: int64


## Prompt Sentiment

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7318 entries, 0 to 7317
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                0 non-null      float64
 1   userId            7318 non-null   int64  
 2   task_type         7318 non-null   object 
 3   task              7318 non-null   int64  
 4   input_type        7318 non-null   object 
 5   input_x           7313 non-null   object 
 6   input_assessment  0 non-null      float64
 7   confidence        2434 non-null   float64
 8   understanding     2434 non-null   float64
 9   valence           4871 non-null   float64
 10  arousal           4871 non-null   float64
 11  task_start        2434 non-null   float64
 12  task_end          7318 non-null   float64
 13  task_time         2434 non-null   float64
 14  valence_high      7318 non-null   int64  
 15  arousal_high      7318 non-null   int64  
 16  emotion_quartile  7318 non-null   int64  


In [24]:
model = SentimentModel()
texts = [
    "Mit keinem guten Ergebniss","Das ist gar nicht mal so gut",
    "Total awesome!","nicht so schlecht wie erwartet",
    "Der Test verlief positiv.","Sie fährt ein grünes Auto."]
       
model.predict_sentiment(["Mit keinem guten Ergebniss"])

KeyboardInterrupt: 

In [None]:
df['input_y'].astype(str)

In [None]:
unpacked_messages = pd.json_normalize(messages_data['input'].apply(json.loads).tolist()).values.flatten().tolist()

In [None]:
print(type(unpacked_messages[0]))

In [None]:
df['sentiment'] = model.predict_sentiment(unpacked_messages)

In [None]:
df

In [None]:
cols_for_histplot = user_information.columns[4:]

In [None]:
for col in cols_for_histplot:
    
    sns.histplot(user_information[col])
    plt.title(col)
    plt.savefig(f'{col}_histplot.png')
    plt.show()

In [None]:
# dfSummary(messages_data)
# dfSummary(emotional_events)
# dfSummary(user_information)
# dfSummary(click_data)

In [None]:
user_information['data_analysis_experience_bins'] = pd.cut(user_information['experience_analysis_tools'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_information['data_analysis_experience_bins'].value_counts()

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')
user_info_with_avg_valence

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "5 and higher"]['valence'])
plt.title('Valence for users with 5 and higher experience')
plt.show()

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "3 and lower"]['valence'])
plt.title('Valence for users with 3 and lower experience')
plt.show()

## Age


In [None]:
user_information['age_bins'] = pd.cut(user_information['age'], bins=[0, 30, 50, 200], labels=['0-30', '30-50', 'older'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['age_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['age_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users in {bins} age group')
    plt.savefig(f'valence_{bins}_age.png')
    plt.show()

## General Satisfaction

In [None]:
user_information.columns

In [None]:
user_information['satisfaction_bins'] = pd.cut(user_information['Satisfaction'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['satisfaction_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['satisfaction_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} satisfaction')
    plt.savefig(f'valence_{bins}_satisfaction.png')
    plt.show()

## Trust

In [None]:
user_information['trust_bins'] = pd.cut(user_information['Trust 1'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['trust_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['trust_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} trust (1)')
    plt.savefig(f'valence_for_users_with_{bins}_trust.png')
    plt.show()

## education

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')


In [None]:
for bins in user_info_with_avg_valence['education'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['education'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} education')
    plt.savefig(f'Valence for users with {bins} education.png')
    plt.show()

## Mögliche Ideen
- wie wirkt sich der Assistent auf die Nutzer aus, je nach dem ob sie Vorerfahrung mit Datenanalyse haben oder nicht?
- 