In [None]:
import pandas as pd
import ollama
import matplotlib.pyplot as plt
from datetime import timedelta

In [None]:
def preprocess(df, cutoff='2024-04-24 17:37', user_pattern=r'ths3p[1-6]'):
    """Filter and de-duplicate a questionnaire frame without modifying the input.

    Steps, in order:
      1. keep rows whose ``created_at`` is at or before ``cutoff``;
      2. keep rows whose ``user_name`` matches ``user_pattern`` (case-insensitive);
      3. collapse rows sharing ``session_id``, ``yrActivities`` and ``onYrMind``
         to their first occurrence;
      4. drop rows whose ``session_id`` is longer than 5 characters;
      5. parse ``created_at`` to datetime and, for rows that still share a
         ``session_id``, shift the 2nd, 3rd, ... row (ordered by ``created_at``)
         by +1, +2, ... days;
      6. drop rows with a missing ``yrDay``.

    Args:
        df: DataFrame with the columns ``created_at``, ``user_name``,
            ``session_id``, ``yrActivities``, ``onYrMind`` and ``yrDay``.
        cutoff: upper bound (inclusive) compared against ``created_at``.
        user_pattern: regular expression a ``user_name`` must contain.

    Returns:
        A new DataFrame; ``created_at`` has datetime dtype.
    """
    # Applied to the column as passed in; created_at is only parsed further down.
    df = df[df['created_at'] <= cutoff]

    # na=False: rows without a user name are dropped instead of yielding an
    # NA-containing mask, which pandas refuses to index with.
    df = df[df['user_name'].str.contains(user_pattern, case=False, na=False)]

    # Same session and same answers -> keep only the first row.
    df = df.drop_duplicates(subset=['session_id', 'yrActivities', 'onYrMind'], keep='first')

    # Explicit copy so the assignments below act on an independent frame
    # rather than on a slice of the caller's data.
    df = df[df['session_id'].astype(str).map(len) <= 5].copy()

    df['created_at'] = pd.to_datetime(df['created_at'])

    # Rows that still share a session id have different answers; spread them
    # over consecutive days so each one gets its own date.
    repeated = df[df.duplicated(subset=['session_id'], keep=False)]
    for _, group in repeated.groupby('session_id'):
        ordered = group.sort_values(by='created_at')
        for offset, row_label in enumerate(ordered.index[1:], start=1):
            df.loc[row_label, 'created_at'] += timedelta(days=offset)

    return df.dropna(subset=['yrDay'])

# Load the full dump and the manually labelled evaluation subset.
df = pd.read_csv('data/pepper_dump_19_06_2024.csv')
df_eval_c = pd.read_csv('data/pepper_biopyschosocialenv_manually_labeled_responses_redacted.csv')

# Re-render the evaluation timestamps in one fixed text format; values that
# cannot be parsed are coerced to missing.
df_eval_c['created_at'] = (
    pd.to_datetime(df_eval_c['created_at'], errors='coerce')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

# Bring over every dump column the evaluation frame lacks, joined on FsDaily_id.
additional_columns = df.columns[~df.columns.isin(df_eval_c.columns)].tolist()
df_eval_c = pd.merge(
    df_eval_c,
    df[['FsDaily_id'] + additional_columns],
    on='FsDaily_id',
    how='inner',
)

# Run both frames through the same preprocessing and show the evaluation frame.
df = preprocess(df)
df_eval = preprocess(df_eval_c)
df_eval

In [None]:
def extract_label(response):
    """Return the first known category label mentioned in a model reply.

    The search is a case-insensitive substring match, so replies such as
    "Social" or "'Mental'." are recognised. Labels are tried in the order
    listed below; 'environmental' has to come before 'mental' because the
    former contains the latter as a substring.

    Args:
        response: reply text from the model. Anything that is not a string
            (e.g. None) is treated as "no label found".

    Returns:
        One of 'environmental', 'physical', 'mental', 'social', 'other',
        or None when no label occurs in the reply.
    """
    candidate_labels = ['environmental', 'physical', 'mental', 'social', 'other']

    if not isinstance(response, str):
        return None

    normalized = response.lower()
    return next((label for label in candidate_labels if label in normalized), None)

def analyze_sentiment(df, column):
    """Label every entry of ``df[column]`` with a topic category via llama3.

    Each value (missing values are sent as an empty string) is inserted into a
    German instruction prompt and sent to ``ollama.chat`` with temperature 0.
    The reply is reduced to a label by ``extract_label``.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        column: name of the text column to classify.

    Returns:
        The same ``df`` object, with a new ``<column>_classification`` column.
    """
    user_prompt = ("""
        Bitte analysiere den folgenden Text und ordne ihn einer der vorgegebenen Kategorien zu: 'environmental', 'physical', 'mental', 'social'.
        
        Falls der Text keine Informationen enthält, die eine genaue Zuordnung zulassen, antworte mit 'other'.
        
        - 'social': Sobald im Text von einer anderen Person berichtet wird oder soziale Aktivitäten erwähnt werden. Beispiel: "Ich habe meinen Freund getroffen."
        - 'environmental': Sobald es im Text um das Wetter oder die Umwelt geht. Beispiel: "Heute war es sehr sonnig."
        - 'physical': Sobald im Text über körperliche Beschwerden, wie Schmerzen oder körperliche Aktivitäten, gesprochen wird. Beispiel: "Ich habe heute Sport gemacht."
        - 'mental': Sobald es im Text um das psychische Wohlbefinden geht. Beispiel: "Ich fühle mich heute sehr gestresst."
        - 'other': Sobald es im text um eine Tätigkeit oder ein Thema geht, das nicht in die anderen Kategorien passt. Beispiel: "Das essen war lecker"
        
        Gib NUR das passende Label zurück, ohne zusätzliche Formatierungen oder Erklärungen.
        
        Text: {}
    """)

    system_turn = {
        'role': 'system',
        'content': "Du bist ein Bewerter, der Texte in Kategorien einteilt. Deine Aufgabe ist es, jeden gegebenen Text genau zu analysieren und das passendste Label auszuwählen. Es geht darum, das Thema zu identifizieren, über das die Leute sprechen: physische Belange ihres Körpers, soziale Aktivitäten mit anderen Menschen, psychologische Themen, oder die Umwelt und das Wetter.",
    }

    def classify(text):
        # One chat round-trip per text; the reply is reduced to a bare label.
        reply = ollama.chat(
            model='llama3',
            messages=[
                system_turn,
                {'role': 'user', 'content': user_prompt.format(text)},
            ],
            options={'temperature': 0},
        )
        return extract_label(reply['message']['content'])

    # All texts are classified first; the column is attached only afterwards.
    labels = [classify(text) for text in df[column].fillna('')]
    df[column + '_classification'] = labels

    return df

In [None]:
# Classify the 'yrActivities' answers, then show text and label side by side.
res_yr_activities = analyze_sentiment(df, 'yrActivities')
res_selected_res_yr_activities = res_yr_activities.loc[:, ['yrActivities', 'yrActivities_classification']]
res_selected_res_yr_activities

In [None]:
# Pair each predicted yrActivities label with its manual label; the inner
# join keeps only entries whose FsDaily_id occurs in both frames.
df_merged = res_yr_activities[['FsDaily_id', 'yrActivities_classification']].merge(
    df_eval[['FsDaily_id', 'manual_label_yrActivities']],
    on='FsDaily_id',
    how='inner',
)

def calculate_accuracy(predictions, labels):
    """Return the share of positions where ``predictions`` equals ``labels``."""
    is_correct = predictions == labels
    return is_correct.mean()

# Share of yrActivities entries where the model agrees with the manual label.
acc_yrActivities = calculate_accuracy(
    predictions=df_merged['yrActivities_classification'],
    labels=df_merged['manual_label_yrActivities'],
)

print(f"Acc yrActivities: {acc_yrActivities:.2f}")

In [None]:
# Classify the 'madeYouHappy' answers, then show text and label side by side.
res_madeYouHappy = analyze_sentiment(df, 'madeYouHappy')
res_selected_res_madeYouHappy = res_madeYouHappy.loc[:, ['madeYouHappy', 'madeYouHappy_classification']]
res_selected_res_madeYouHappy

In [None]:
# Pair each predicted madeYouHappy label with its manual label; the inner
# join keeps only entries whose FsDaily_id occurs in both frames.
df_merged = res_madeYouHappy[['FsDaily_id', 'madeYouHappy_classification']].merge(
    df_eval[['FsDaily_id', 'manual_label_madeYouHappy']],
    on='FsDaily_id',
    how='inner',
)

def calculate_accuracy(predictions, labels):
    """Return the share of positions where ``predictions`` equals ``labels``."""
    is_correct = predictions == labels
    return is_correct.mean()

# Share of madeYouHappy entries where the model agrees with the manual label.
# Kept under its own name so the yrDay accuracy computed in a later cell
# does not overwrite it.
acc_madeYouHappy = calculate_accuracy(df_merged['madeYouHappy_classification'], df_merged['manual_label_madeYouHappy'])

print(f"Acc madeYouHappy: {acc_madeYouHappy:.2f}")

In [None]:
# Classify the 'onYrMind' answers, then show text and label side by side.
res_onYrMind = analyze_sentiment(df, 'onYrMind')
res_selected_res_onYrMind = res_onYrMind.loc[:, ['onYrMind', 'onYrMind_classification']]
res_selected_res_onYrMind

In [None]:
# Pair each predicted onYrMind label with its manual label; the inner
# join keeps only entries whose FsDaily_id occurs in both frames.
df_merged = res_onYrMind[['FsDaily_id', 'onYrMind_classification']].merge(
    df_eval[['FsDaily_id', 'manual_label_onYrMind']],
    on='FsDaily_id',
    how='inner',
)

def calculate_accuracy(predictions, labels):
    """Return the share of positions where ``predictions`` equals ``labels``."""
    is_correct = predictions == labels
    return is_correct.mean()

# Share of onYrMind entries where the model agrees with the manual label.
acc_onYrMind = calculate_accuracy(
    predictions=df_merged['onYrMind_classification'],
    labels=df_merged['manual_label_onYrMind'],
)

print(f"Acc onYrMind: {acc_onYrMind:.2f}")

In [None]:
# Classify the 'yrDay' answers, then show text and label side by side.
res_yrDay = analyze_sentiment(df, 'yrDay')
res_selected_res_yrDay = res_yrDay.loc[:, ['yrDay', 'yrDay_classification']]
res_selected_res_yrDay

In [None]:
# Pair each predicted yrDay label with its manual label; the inner
# join keeps only entries whose FsDaily_id occurs in both frames.
df_merged = res_yrDay[['FsDaily_id', 'yrDay_classification']].merge(
    df_eval[['FsDaily_id', 'manual_label_yrDay']],
    on='FsDaily_id',
    how='inner',
)

def calculate_accuracy(predictions, labels):
    """Return the share of positions where ``predictions`` equals ``labels``."""
    is_correct = predictions == labels
    return is_correct.mean()

# Share of yrDay entries where the model agrees with the manual label.
acc_yrDay = calculate_accuracy(
    predictions=df_merged['yrDay_classification'],
    labels=df_merged['manual_label_yrDay'],
)

print(f"Genauigkeit yrDay: {acc_yrDay:.2f}")

In [None]:
# Per user, count how many entries fall into each predicted category.
pivot_yr_activities = res_yr_activities.pivot_table(index='user_name', columns='yrActivities_classification', aggfunc='size', fill_value=0)
pivot_madeYouHappy = res_madeYouHappy.pivot_table(index='user_name', columns='madeYouHappy_classification', aggfunc='size', fill_value=0)
pivot_onYrMind = res_onYrMind.pivot_table(index='user_name', columns='onYrMind_classification', aggfunc='size', fill_value=0)
pivot_yrDay = res_yrDay.pivot_table(index='user_name', columns='yrDay_classification', aggfunc='size', fill_value=0)

# A pivot only has columns for categories that actually occur, and bar colours
# follow column position. Give all four pivots the same columns in the same
# order so a category has one colour everywhere and the single shared legend
# (read from one panel) is valid for every panel.
all_categories = sorted(
    set(pivot_yr_activities.columns)
    | set(pivot_madeYouHappy.columns)
    | set(pivot_onYrMind.columns)
    | set(pivot_yrDay.columns)
)
pivot_yr_activities = pivot_yr_activities.reindex(columns=all_categories, fill_value=0)
pivot_madeYouHappy = pivot_madeYouHappy.reindex(columns=all_categories, fill_value=0)
pivot_onYrMind = pivot_onYrMind.reindex(columns=all_categories, fill_value=0)
pivot_yrDay = pivot_yrDay.reindex(columns=all_categories, fill_value=0)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One stacked-bar panel per question.
panels = [
    (pivot_yr_activities, axes[0, 0], 'yrActivities'),
    (pivot_madeYouHappy, axes[0, 1], 'madeYouHappy'),
    (pivot_onYrMind, axes[1, 0], 'onYrMind'),
    (pivot_yrDay, axes[1, 1], 'yrDay'),
]
for pivot, ax, title in panels:
    pivot.plot(kind='bar', stacked=True, ax=ax, legend=False)
    ax.set_title(title)
    ax.set_xlabel('user_name')
    ax.set_ylabel('Number of Entries')

# All panels now share the same categories, so any panel can supply the legend.
handles, labels = axes[1, 1].get_legend_handles_labels()
fig.legend(handles, labels, title='Categories', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:

import plotly.graph_objects as go

# User whose 'madeYouHappy' classifications are plotted. The trace name and the
# figure title are derived from this value so they always match the filter.
# NOTE(review): the filter previously used 'ths3a6' while the labels said
# 'ths3a1', and preprocess keeps only user names matching 'ths3p[1-6]' --
# confirm which user is intended.
ternary_user = 'ths3a6'

# The three ternary axes, using the labels the classifier actually emits.
ternary_categories = ['physical', 'mental', 'social']

user_df = res_madeYouHappy[res_madeYouHappy['user_name'] == ternary_user]

# Relative frequency of each label for this user; categories that never
# occur get a proportion of 0.
proportions = (
    user_df['madeYouHappy_classification']
    .value_counts(normalize=True)
    .reindex(ternary_categories, fill_value=0)
)
user_classifications = proportions.rename_axis('classification').reset_index(name='proportion')


fig = go.Figure()

fig.add_trace(go.Scatterternary({
    'mode': 'markers',
    'name': ternary_user,
    'a': [proportions['physical']],
    'b': [proportions['mental']],
    'c': [proportions['social']],
    'marker': {'symbol': 100, 'size': 14}
}))

fig.update_layout({
    'ternary': {
        'sum': 1,
        'aaxis': {'title': 'Physisch', 'min': 0.01, 'linewidth': 2, 'ticks': 'outside'},
        'baxis': {'title': 'Psychologisch', 'min': 0.01, 'linewidth': 2, 'ticks': 'outside'},
        'caxis': {'title': 'Sozial', 'min': 0.01, 'linewidth': 2, 'ticks': 'outside'}
    },
    'title': f'Klassifikationen von {ternary_user} in einem ternären Diagramm'
})

fig.show()


In [None]:
# Evaluation rows whose manual yrActivities label is 'other'.
df_eval.loc[df_eval['manual_label_yrActivities'] == 'other']

In [None]:
# Display the preprocessed, manually labelled evaluation frame.
df_eval