In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from summarytools import dfSummary
import json
from germansentiment import SentimentModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/maximiliangraf/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
click_data = pd.read_csv('dataset/click_data.csv')
emotional_events = pd.read_csv('dataset/emotional_events.csv')
messages_data = pd.read_csv('dataset/messages_data.csv')
# task_types = pd.read_csv('dataset/task_types.csv')
user_information = pd.read_csv('dataset/user_information.csv')

In [4]:
# Preprocessing Data Types
emotional_events['valence'] = emotional_events['valence'].replace('AO07', '7').astype(float)
emotional_events['arousal'] = emotional_events['arousal'].replace('AO07', '7').astype(float)

In [5]:
# Filtering Data

ee = emotional_events[emotional_events['task_type'] == 'information_finding']
len(ee)

1155

# Create Emotion Quartiles

In [6]:
ee['arousal'].unique()

array([ 6., nan,  5.,  3.,  2.,  7.,  4.,  1.])

In [7]:
ee['valence'].unique()

array([ 5., nan,  1.,  3.,  2.,  6.,  4.])

In [8]:
ee['valence_high'] = np.where(ee['valence'] > 3, 1, 0)
ee['arousal_high'] = np.where(ee['arousal'] > 3, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ee['valence_high'] = np.where(ee['valence'] > 3, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ee['arousal_high'] = np.where(ee['arousal'] > 3, 1, 0)


x: valence
y: arousal

|2|4|
|---|---|
|1|3|

In [9]:
def get_emotion_score(df):
    if df['valence_high'] == 1 and df['arousal_high'] == 1:
        return 4
    elif df['valence_high'] == 1 and df['arousal_high'] == 0:
        return 3
    elif df['valence_high'] == 0 and df['arousal_high'] == 1:
        return 2
    elif df['valence_high'] == 0 and df['arousal_high'] == 0:
        return 1
    else:
        return np.nan
    
ee['emotion_quartile'] = ee.apply(get_emotion_score, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ee['emotion_quartile'] = ee.apply(get_emotion_score, axis=1)


In [10]:
ee.emotion_quartile.value_counts()

emotion_quartile
1    552
4    293
2    247
3     63
Name: count, dtype: int64

# Erstellung der neuen Features

In [11]:
ee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1155 entries, 0 to 1634
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                0 non-null      float64
 1   userId            1155 non-null   int64  
 2   task_type         1155 non-null   object 
 3   task              1155 non-null   int64  
 4   input_type        1155 non-null   object 
 5   input             1150 non-null   object 
 6   input_assessment  0 non-null      float64
 7   confidence        405 non-null    float64
 8   understanding     405 non-null    float64
 9   valence           758 non-null    float64
 10  arousal           758 non-null    float64
 11  task_start        405 non-null    float64
 12  task_end          1155 non-null   float64
 13  task_time         405 non-null    float64
 14  valence_high      1155 non-null   int64  
 15  arousal_high      1155 non-null   int64  
 16  emotion_quartile  1155 non-null   int64  
dtype

In [12]:
messages_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2560 entries, 0 to 2559
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userId          2560 non-null   int64 
 1   task            2560 non-null   int64 
 2   message_type    2560 non-null   object
 3   timestamp       2560 non-null   object
 4   input           2560 non-null   object
 5   timestamp_unix  2560 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 120.1+ KB


In [13]:
messages_data['agent_output'] = messages_data['input'].apply(lambda x: json.loads(x).get('output'))
messages_data['user_input'] = messages_data['input'].apply(lambda x: json.loads(x).get('input'))
messages_data

Unnamed: 0,userId,task,message_type,timestamp,input,timestamp_unix,agent_output,user_input
0,127,1,human,2024-06-25 09:01:09.815275,"{""input"": ""Gibt es zu einem oder mehreren Stad...",1719306069,,Gibt es zu einem oder mehreren Stadtbezirken a...
1,127,5,human,2024-06-25 09:05:50.543523,"{""input"": ""In welchem Stadtbezirk gab es 2010 ...",1719306350,,In welchem Stadtbezirk gab es 2010 die meisten...
2,127,5,agent_finish,2024-06-25 09:05:59.495982,"{""output"": ""Der Stadtbezirk mit den meisten Al...",1719306359,Der Stadtbezirk mit den meisten Altbauwohnunge...,
3,127,5,human,2024-06-25 09:10:14.895027,"{""input"": ""Stadtbezirk mit den meisten Wohnung...",1719306614,,Stadtbezirk mit den meisten Wohnungen mit Bauj...
4,129,4,human,2024-06-25 09:19:21.478092,"{""input"": ""Wie hoch ist die Kitaquote in der S...",1719307161,,Wie hoch ist die Kitaquote in der Stadt Freibu...
...,...,...,...,...,...,...,...,...
2555,238,5,agent_finish,2024-07-09 16:42:55.708772,"{""output"": ""In Oberwiehre gab es 2010 die meis...",1720543375,In Oberwiehre gab es 2010 die meisten Altbauwo...,
2556,238,5,human,2024-07-09 16:44:19.09898,"{""input"": ""wie viele Baugenehmigungen wurden i...",1720543459,,wie viele Baugenehmigungen wurden in Oberwiehr...
2557,238,5,agent_finish,2024-07-09 16:44:25.837833,"{""output"": ""In Oberwiehre wurden insgesamt X B...",1720543465,In Oberwiehre wurden insgesamt X Baugenehmigun...,
2558,238,5,human,2024-07-09 16:45:41.986446,"{""input"": ""wie viele Baugenehmigungen sind in...",1720543541,,wie viele Baugenehmigungen sind in Oberwiehre...


In [14]:
ee.drop(columns='id', inplace=True)
ee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1155 entries, 0 to 1634
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   userId            1155 non-null   int64  
 1   task_type         1155 non-null   object 
 2   task              1155 non-null   int64  
 3   input_type        1155 non-null   object 
 4   input             1150 non-null   object 
 5   input_assessment  0 non-null      float64
 6   confidence        405 non-null    float64
 7   understanding     405 non-null    float64
 8   valence           758 non-null    float64
 9   arousal           758 non-null    float64
 10  task_start        405 non-null    float64
 11  task_end          1155 non-null   float64
 12  task_time         405 non-null    float64
 13  valence_high      1155 non-null   int64  
 14  arousal_high      1155 non-null   int64  
 15  emotion_quartile  1155 non-null   int64  
dtypes: float64(8), int64(5), object(3)
memory usage

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ee.drop(columns='id', inplace=True)


## No answer given 

In [15]:
# conversation ended in a technical error
df = pd.merge(ee, messages_data, left_on=['userId', 'task'], right_on=['userId', 'task'], how='left')

In [16]:
df['no_answer_given'] = df['agent_output'] == "Leider habe ich keine Antwort auf diese Frage. Versuchen Sie bitte Ihre Frage anders zu formulieren, oder komplexe Fragestellungen in einfachere Fragen zu unterteilen."
print(df.no_answer_given.value_counts())

no_answer_given
False    6812
True      506
Name: count, dtype: int64


## Prompt Sentiment

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7318 entries, 0 to 7317
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   userId            7318 non-null   int64  
 1   task_type         7318 non-null   object 
 2   task              7318 non-null   int64  
 3   input_type        7318 non-null   object 
 4   input_x           7313 non-null   object 
 5   input_assessment  0 non-null      float64
 6   confidence        2434 non-null   float64
 7   understanding     2434 non-null   float64
 8   valence           4871 non-null   float64
 9   arousal           4871 non-null   float64
 10  task_start        2434 non-null   float64
 11  task_end          7318 non-null   float64
 12  task_time         2434 non-null   float64
 13  valence_high      7318 non-null   int64  
 14  arousal_high      7318 non-null   int64  
 15  emotion_quartile  7318 non-null   int64  
 16  message_type      7298 non-null   object 


In [18]:
model = SentimentModel()
texts = [
    "Mit keinem guten Ergebniss","Das ist gar nicht mal so gut",
    "Total awesome!","nicht so schlecht wie erwartet",
    "Der Test verlief positiv.","Sie fährt ein grünes Auto."]
       
model.predict_sentiment(["Mit keinem guten Ergebniss"])

['negative']

In [23]:
df['input_y'].astype(str)

0       {"input": "Nennen Sie alle Bezirke in einer du...
1       {"output": "Unterwiehre-S\u00fcd, Herdern-Nord...
2       {"input": "Nennen Sie alle Bezirke in einer du...
3       {"output": "Unterwiehre-S\u00fcd, Herdern-Nord...
4       {"input": "Nennen Sie die Einwohnerzahlen die ...
                              ...                        
7313    {"output": "In Oberwiehre gab es 2010 die meis...
7314    {"input": "wie viele Baugenehmigungen wurden i...
7315    {"output": "In Oberwiehre wurden insgesamt X B...
7316    {"input": "wie viele Baugenehmigungen sind  in...
7317    {"output": "In Oberwiehre wurden insgesamt X B...
Name: input_y, Length: 7318, dtype: object

In [29]:
unpacked_messages = pd.json_normalize(messages_data['input'].apply(json.loads).tolist()).values.flatten().tolist()

AttributeError: 'list' object has no attribute 'dtpyes'

In [31]:
print(type(unpacked_messages[0]))

<class 'str'>


In [28]:
df['sentiment'] = model.predict_sentiment(unpacked_messages)

AttributeError: 'float' object has no attribute 'replace'

In [None]:
df

In [None]:
cols_for_histplot = user_information.columns[4:]

In [None]:
for col in cols_for_histplot:
    
    sns.histplot(user_information[col])
    plt.title(col)
    plt.savefig(f'{col}_histplot.png')
    plt.show()

In [None]:
# dfSummary(messages_data)
# dfSummary(emotional_events)
# dfSummary(user_information)
# dfSummary(click_data)

In [None]:
user_information['data_analysis_experience_bins'] = pd.cut(user_information['experience_analysis_tools'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_information['data_analysis_experience_bins'].value_counts()

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')
user_info_with_avg_valence

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "5 and higher"]['valence'])
plt.title('Valence for users with 5 and higher experience')
plt.show()

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "3 and lower"]['valence'])
plt.title('Valence for users with 3 and lower experience')
plt.show()

## Age


In [None]:
user_information['age_bins'] = pd.cut(user_information['age'], bins=[0, 30, 50, 200], labels=['0-30', '30-50', 'older'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['age_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['age_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users in {bins} age group')
    plt.savefig(f'valence_{bins}_age.png')
    plt.show()

## General Satisfaction

In [None]:
user_information.columns

In [None]:
user_information['satisfaction_bins'] = pd.cut(user_information['Satisfaction'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['satisfaction_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['satisfaction_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} satisfaction')
    plt.savefig(f'valence_{bins}_satisfaction.png')
    plt.show()

## Trust

In [None]:
user_information['trust_bins'] = pd.cut(user_information['Trust 1'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['trust_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['trust_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} trust (1)')
    plt.savefig(f'valence_for_users_with_{bins}_trust.png')
    plt.show()

## education

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')


In [None]:
for bins in user_info_with_avg_valence['education'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['education'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} education')
    plt.savefig(f'Valence for users with {bins} education.png')
    plt.show()

## Mögliche Ideen
- wie wirkt sich der Assistent auf die Nutzer aus, je nach dem ob sie Vorerfahrung mit Datenanalyse haben oder nicht?
- 