In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from summarytools import dfSummary

In [2]:
click_data = pd.read_csv('dataset/click_data.csv')
emotional_events = pd.read_csv('dataset/emotional_events.csv')
messages_data = pd.read_csv('dataset/messages_data.csv')
# task_types = pd.read_csv('dataset/task_types.csv')
user_information = pd.read_csv('dataset/user_information.csv')

In [7]:
# Preprocessing Data Types
emotional_events['valence'] = emotional_events['valence'].replace('AO07', '7').astype(float)
emotional_events['arousal'] = emotional_events['arousal'].replace('AO07', '7').astype(float)

# Create Emotion Quartiles

In [11]:
emotional_events['arousal'].unique()

array([ 6., nan,  5.,  3.,  2.,  4.,  7.,  1.])

In [12]:
emotional_events['valence'].unique()

array([ 5., nan,  1.,  3.,  2.,  6.,  7.,  4.])

In [13]:
emotional_events['valence_high'] = np.where(emotional_events['valence'] > 3, 1, 0)
emotional_events['arousal_high'] = np.where(emotional_events['arousal'] > 3, 1, 0)

x: valence
y: arousal

|2|4|
|---|---|
|1|3|

In [15]:
def get_emotion_score(df):
    if df['valence_high'] == 1 and df['arousal_high'] == 1:
        return 4
    elif df['valence_high'] == 1 and df['arousal_high'] == 0:
        return 3
    elif df['valence_high'] == 0 and df['arousal_high'] == 1:
        return 2
    elif df['valence_high'] == 0 and df['arousal_high'] == 0:
        return 1
    else:
        return np.nan
    
emotional_events['emotion_quartile'] = emotional_events.apply(get_emotion_score, axis=1)

In [16]:
emotional_events.emotion_quartile.value_counts()

emotion_quartile
4    620
1    570
2    273
3    178
Name: count, dtype: int64

In [None]:
cols_for_histplot = user_information.columns[4:]

In [None]:
for col in cols_for_histplot:
    
    sns.histplot(user_information[col])
    plt.title(col)
    plt.savefig(f'{col}_histplot.png')
    plt.show()

In [None]:
# dfSummary(messages_data)
# dfSummary(emotional_events)
# dfSummary(user_information)
# dfSummary(click_data)

In [None]:
user_information['data_analysis_experience_bins'] = pd.cut(user_information['experience_analysis_tools'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_information['data_analysis_experience_bins'].value_counts()

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')
user_info_with_avg_valence

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "5 and higher"]['valence'])
plt.title('Valence for users with 5 and higher experience')
plt.show()

In [None]:
sns.histplot(user_info_with_avg_valence[user_info_with_avg_valence['data_analysis_experience_bins'] == "3 and lower"]['valence'])
plt.title('Valence for users with 3 and lower experience')
plt.show()

## Age


In [None]:
user_information['age_bins'] = pd.cut(user_information['age'], bins=[0, 30, 50, 200], labels=['0-30', '30-50', 'older'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['age_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['age_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users in {bins} age group')
    plt.savefig(f'valence_{bins}_age.png')
    plt.show()

## General Satisfaction

In [None]:
user_information.columns

In [None]:
user_information['satisfaction_bins'] = pd.cut(user_information['Satisfaction'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['satisfaction_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['satisfaction_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} satisfaction')
    plt.savefig(f'valence_{bins}_satisfaction.png')
    plt.show()

## Trust

In [None]:
user_information['trust_bins'] = pd.cut(user_information['Trust 1'], bins=[0, 3, 5, 7], labels=['3 and lower', '4', '5 and higher'])
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')

In [None]:
for bins in user_info_with_avg_valence['trust_bins'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['trust_bins'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} trust (1)')
    plt.savefig(f'valence_for_users_with_{bins}_trust.png')
    plt.show()

## education

In [None]:
user_info_with_avg_valence = user_information.merge(emotional_events.groupby('userId')['valence'].mean().reset_index(), left_on='id', right_on='userId', how='left')


In [None]:
for bins in user_info_with_avg_valence['education'].unique():
    subset = user_info_with_avg_valence[user_info_with_avg_valence['education'] == bins]
    sns.histplot(subset['valence'])
    plt.title(f'Valence for users with {bins} education')
    plt.savefig(f'Valence for users with {bins} education.png')
    plt.show()

## Mögliche Ideen
- wie wirkt sich der Assistent auf die Nutzer aus, je nach dem ob sie Vorerfahrung mit Datenanalyse haben oder nicht?
- 