In [None]:
import pandas as pd
from collections import Counter
import json

In [None]:
def time2str(timestamp):
    """Return the name of the six-hour part of the day that `timestamp` falls into.

    Args:
        timestamp: any object exposing an `hour` attribute.

    Returns:
        "night" for hours below 6, "morning" below 12, "afternoon" below 18,
        and "evening" for everything else.
    """
    hour = timestamp.hour
    # Exclusive upper bound of each part of the day, in ascending order.
    for upper_bound, label in ((6, "night"), (12, "morning"), (18, "afternoon")):
        if hour < upper_bound:
            return label
    return "evening"


def time2int(timestamp):
    """Return an integer code for the six-hour part of the day of `timestamp`.

    Args:
        timestamp: any object exposing an `hour` attribute.

    Returns:
        0 for hours below 6 (night), 1 below 12 (morning), 2 below 18
        (afternoon) and 3 for everything else (evening).
    """
    hour = timestamp.hour
    # The position of the first exclusive upper bound above `hour` is the code.
    for code, upper_bound in enumerate((6, 12, 18)):
        if hour < upper_bound:
            return code
    return 3

In [None]:
# Load the categorized interaction log, parse the timestamps, derive the
# time-of-day code and order the rows chronologically.
# NOTE(review): absolute, machine-specific path.
df = (
    pd.read_csv("/Users/janpf/projects/hiwi/jörn_alexa/jan/all_data_categorized.csv")
    .assign(datum=lambda frame: pd.to_datetime(frame["datum"]))
    .assign(timeofday=lambda frame: frame["datum"].apply(time2int))
    .sort_values(by="datum")
    # Later cells only read the *_single_antwort columns.
    .drop(columns=["kategorie_single", "funktion_single"])
    # Discard every row that still has a missing value in any remaining column.
    .dropna()
)
df

In [None]:
# Load the per-participant feature spreadsheet and list its column names so
# the columns to keep can be picked in the next cell.
# NOTE(review): absolute, machine-specific path.
feature_df = pd.read_excel(
    io="/Users/janpf/projects/hiwi/jörn_alexa/graphenstuff/kontrollierte_korrekte_daten/Datensatz_LZS (1).xlsx"
)
feature_df.columns.tolist()

In [None]:
# Keep the participant code ("G003_01") plus the feature columns that are
# attached to every walk step. Commented-out entries are alternative features
# that are currently disabled.
# The explicit .copy() makes the column assignment below operate on an
# independent frame instead of a selection of the raw sheet.
feature_df = feature_df[
    [
        "G003_01",
        # "RSQ.Skala_Angst.vor.Nähe_D002",
        # "RSQ.Skala_fehlendes.Vertrauen_D002",
        "lonliness_emotional_D009",
        "lonliness_social_D009",
        "NEOFFI_N_D008",
        "Uncanny.Valley_Humaness_A010_E2",
        "disclosiveness.towardsVA_amount_E006_E2",
        "bailenson.scale_social.presence_A013_E2",
        "PSI_cognitive_A015_E2",
        "Intimate.Friendship_complete.scale_E001_E2",
        # "Intimate.Friendship_Frankness_E001_E2",
        # "Intimate.Friendship_Sensitivity_E001_E2",
        # "Intimate.Friendship_Attachment_E001_E2",
        # "Intimate.Friendship_Exclusiveness_E001_E2",
        # "Intimate.Friendship_Giving_E001_E2",
        # "Intimate.Friendship_Trust_E001_E2",
        # "Intimate.Friendship_complete.scale_E001_E2",
        # "cluster_neu",
    ]
].copy()

# A feature row belongs to the log user whose id starts with the row's
# "G003_01" value (first four characters of the user id). Rows without a
# matching user get the sentinel "empty" and are removed right afterwards.
# The previous element-wise `feature_df["nutzer"][j] = i` was chained
# assignment, which raises SettingWithCopyWarning and has no effect under
# pandas Copy-on-Write; a single vectorized map avoids both problems.
prefix2user = {user[0:4]: user for user in set(df["user"])}
feature_df["nutzer"] = feature_df["G003_01"].map(prefix2user).fillna("empty")

feature_df = feature_df[feature_df["nutzer"] != "empty"].reset_index(drop=True)
# NOTE(review): the row at position 14 (after re-indexing) is removed without a
# documented reason and this raises KeyError when fewer than 15 rows remain —
# confirm which participant this is meant to exclude.
feature_df = feature_df.drop(14).reset_index(drop=True)
feature_df = feature_df.dropna()
if "cluster_neu" in feature_df.columns:
    # Only relevant when "cluster_neu" is enabled in the column list above.
    feature_df = feature_df.assign(cluster=feature_df["cluster_neu"].astype(int)).drop(columns=["cluster_neu"])
# The participant code is no longer needed once "nutzer" holds the full user id.
feature_df = feature_df.drop(columns=["G003_01"])
feature_df

In [None]:
# Integer ids for every function / category label, numbered from 1 in the
# order returned by value_counts(). Id 0 belongs to the artificial "start"
# label that is used as the predecessor of the first step of a walk.
fkt2id = {"start": 0}
fkt2id.update(
    (label, rank) for rank, label in enumerate(df["funktion_single_antwort"].value_counts().index, start=1)
)

kat2id = {"start": 0}
kat2id.update(
    (label, rank) for rank, label in enumerate(df["kategorie_single_antwort"].value_counts().index, start=1)
)

fkt2id, kat2id

In [None]:
# Split every user's chronologically ordered interactions into walks.
# A walk continues as long as the user is active again less than
# MAX_GAP_MINUTES after the previous kept interaction; a longer pause starts a
# new walk. Walks with a single step are discarded.
# Each step is a tuple (user, function, category, attributes), where
# attributes is a dict holding "timeofday", "weekday" and the user's feature
# values (without the "nutzer" id column).
MAX_GAP_MINUTES = 15

# Rows carrying one of these labels are skipped entirely. Skipped rows neither
# become a step nor move the reference time for the gap check.
SKIPPED_FUNCTIONS = [
    # "Gerät missversteht oder kann Befehl nicht ausführen",
    "multiple",
]
SKIPPED_CATEGORIES = [
    # "Verfügbarkeit / Misserfolg",
    "multiple",
]

walks = []
for user in df["user"].value_counts().index:
    user_df = df[df["user"] == user]
    features = feature_df[feature_df["nutzer"] == user]
    if len(features) == 0:
        print(f"no features for user {user}")
        continue
    elif len(features) > 1:
        print(f"more than one feature for user {user}")
        continue
    # The user id is already the first tuple element, so it is left out of the attributes.
    user_features = {key: value for key, value in features.iloc[0].to_dict().items() if key != "nutzer"}

    last_time = None
    current_walk = []
    for _, row in user_df.iterrows():
        if (
            row["funktion_single_antwort"] in SKIPPED_FUNCTIONS
            or row["kategorie_single_antwort"] in SKIPPED_CATEGORIES
        ):
            continue

        # total_seconds() covers the whole gap; Timedelta.seconds would drop
        # the days component and treat e.g. "1 day 5 minutes" as 5 minutes.
        gap_exceeded = (
            last_time is not None and (row["datum"] - last_time).total_seconds() / 60 >= MAX_GAP_MINUTES
        )
        if gap_exceeded:
            if len(current_walk) > 1:
                walks.append(current_walk)
            current_walk = []

        current_walk.append(
            (
                user,
                row["funktion_single_antwort"],
                row["kategorie_single_antwort"],
                {
                    "timeofday": row["timeofday"],
                    "weekday": row["wochentag"],
                    **user_features,
                },
            )
        )
        # Always measure the next gap from the most recent kept interaction,
        # including the one that just opened a new walk.
        last_time = row["datum"]

    # Flush the walk that was still open when the user's rows ran out.
    if len(current_walk) > 1:
        walks.append(current_walk)
walks[0]

In [None]:
# Number of walks and the distribution of walk lengths (length -> count).
# Returned as one tuple so the cell output shows both values; wrapping the
# count in print() made the displayed tuple start with None.
len(walks), Counter(len(walk) for walk in walks)

In [None]:
# Output directory for the exported dataset; it is interpolated into the
# dataset.jsonl path when the dataset is written.
# NOTE(review): absolute, machine-specific path — the ARFF export cells
# hard-code the same directory instead of reusing this variable.
out = "/Users/janpf/projects/deeptrails/data/amz_real_data"

In [None]:
# Bundle the walks together with the label -> id mappings needed to decode them.
dataset = {
    "args": {
        "fkt2id": fkt2id,
        "kat2id": kat2id,
    },
    "annotated_walks": walks,
}

In [None]:
# Write the whole dataset as one JSON document.
# NOTE(review): despite the .jsonl extension the file holds a single JSON
# object, not one record per line.
with open(f"{out}/dataset.jsonl", "w") as handle:
    handle.write(json.dumps(dataset))

In [None]:
# Show the first walk as a reference for the step layout
# (user, function, category, attributes) that the flattening cell indexes into.
walks[0]

In [None]:
# Flatten the walks into one record per step. Every record also carries the
# function and category of the preceding step of the same walk; the first
# step of a walk gets the artificial predecessor "start".
flat_walks = []
for walk in walks:
    # Pair every step with its predecessor (None for the first step).
    predecessors = [None, *walk[:-1]]
    for previous, current in zip(predecessors, walk):
        flat_walks.append(
            {
                "user": current[0],
                "funktion": current[1],
                "previous_funktion": "start" if previous is None else previous[1],
                "kategorie": current[2],
                "previous_kategorie": "start" if previous is None else previous[2],
                **current[3],
            }
        )
len(flat_walks), flat_walks[0]

In [None]:
# Category transitions: one row per step holding the step attributes followed
# by the ids of the previous ("from") and the current ("to") category.
kat_walks = []
attributes = None

for walk in flat_walks:
    # Keep only the attribute columns; the user id and the raw labels are dropped.
    record = {
        key: value
        for key, value in walk.items()
        if key not in ("user", "funktion", "previous_funktion", "kategorie", "previous_kategorie")
    }
    record["from"] = kat2id[walk["previous_kategorie"]]
    record["to"] = kat2id[walk["kategorie"]]

    # Every row must expose the same columns in the same order, because the
    # rows are exported as plain value lists.
    column_names = list(record.keys())
    if attributes is None:
        attributes = column_names
    else:
        assert attributes == column_names
    kat_walks.append(list(record.values()))

print(kat_walks[0])

# NOTE(review): `arff` is not imported in any visible cell — presumably the
# third-party `arff` package; confirm and add the import to the import cell.
# NOTE(review): the target directory duplicates the value of `out`.
arff.dump(
    "/Users/janpf/projects/deeptrails/data/amz_real_data/kat_walks.arff",
    names=attributes,
    row_iterator=kat_walks,
)

In [None]:
# Function transitions: one row per step holding the step attributes followed
# by the ids of the previous ("from") and the current ("to") function.
fkt_walks = []
attributes = None

for walk in flat_walks:
    # Keep only the attribute columns; the user id and the raw labels are dropped.
    record = {
        key: value
        for key, value in walk.items()
        if key not in ("user", "funktion", "previous_funktion", "kategorie", "previous_kategorie")
    }
    record["from"] = fkt2id[walk["previous_funktion"]]
    record["to"] = fkt2id[walk["funktion"]]

    # Every row must expose the same columns in the same order, because the
    # rows are exported as plain value lists.
    column_names = list(record.keys())
    if attributes is None:
        attributes = column_names
    else:
        assert attributes == column_names
    fkt_walks.append(list(record.values()))

print(fkt_walks[0])

# NOTE(review): `arff` is not imported in any visible cell — presumably the
# third-party `arff` package; confirm and add the import to the import cell.
# NOTE(review): the target directory duplicates the value of `out`.
arff.dump(
    "/Users/janpf/projects/deeptrails/data/amz_real_data/fkt_walks.arff",
    names=attributes,
    row_iterator=fkt_walks,
)