# Data Preparation

This file contains data preparation for the PV251 visualization project.

## Import modules

In [106]:
import pandas as pd

## Data

In [107]:
data_days = pd.read_csv("data/days.csv")
time_slept = pd.read_csv("data/timeSlept.csv")

In [108]:
data_days.head()

Unnamed: 0,date,mindState,associations,descriptions,context
0,23-12-2024,2,Family;Friends,Peaceful;Happy;Joyful,Nephew birthday party
1,21-12-2024,2,Family;Friends,Happy;Grateful,Scout's Christmas Event
2,18-12-2024,1,Education;Friends,Calm;Content,Two tests today; one was tough; one easy; deli...


In [109]:
# Update date format to YYYY-MM-DD (format now is DD-MM-YYYY)
data_days["date"] = pd.to_datetime(
    data_days["date"], format="%d-%m-%Y"
).dt.strftime("%Y-%m-%d")

time_slept["date"] = pd.to_datetime(
    time_slept["date"], format="%d-%m-%Y"
).dt.strftime("%Y-%m-%d")

data_days["date"].head()

0    2024-12-23
1    2024-12-21
2    2024-12-18
Name: date, dtype: object

In [110]:
# Remove semicolons from context
data_days["context"] = data_days["context"].str.replace(";", "")

In [None]:
# Map `mindState` column to string values
MIND_STATE_MAP = {
    -3: "Very unpleasant",
    -2: "Unpleasant",
    -1: "Slightly unpleasant",
    0: "Neutral",
    1: "Slightly pleasant",
    2: "Pleasant",
    3: "Very pleasant",
}

INVERSE_MIND_STATE_MAP = {v: k for k, v in MIND_STATE_MAP.items()}

data_days["mindState"] = data_days["mindState"].map(MIND_STATE_MAP)
data_days.head()

Unnamed: 0,date,mindState,associations,descriptions,context
0,2024-12-23,Pleasant,Family;Friends,Peaceful;Happy;Joyful,Nephew birthday party
1,2024-12-21,Pleasant,Family;Friends,Happy;Grateful,Scout's Christmas Event
2,2024-12-18,Slightly pleasant,Education;Friends,Calm;Content,Two tests today one was tough one easy delicio...


### Associations

In [112]:
# Associations are in format: `association1;association2;...`
# Extract all associations into separate rows
date_associations = data_days[["date", "associations", "mindState"]].copy()
date_associations["associations"] = date_associations[
    "associations"
].str.split(";")

date_associations = date_associations.explode("associations")

date_associations.head()

Unnamed: 0,date,associations,mindState
0,2024-12-23,Family,Pleasant
0,2024-12-23,Friends,Pleasant
1,2024-12-21,Family,Pleasant
1,2024-12-21,Friends,Pleasant
2,2024-12-18,Education,Slightly pleasant


### Descriptions

In [113]:
# Do the same with `descriptions` column
date_descriptions = data_days[["date", "descriptions", "mindState"]].copy()
date_descriptions["descriptions"] = date_descriptions[
    "descriptions"
].str.split(";")

date_descriptions = date_descriptions.explode("descriptions")

date_descriptions.head(10)

Unnamed: 0,date,descriptions,mindState
0,2024-12-23,Peaceful,Pleasant
0,2024-12-23,Happy,Pleasant
0,2024-12-23,Joyful,Pleasant
1,2024-12-21,Happy,Pleasant
1,2024-12-21,Grateful,Pleasant
2,2024-12-18,Calm,Slightly pleasant
2,2024-12-18,Content,Slightly pleasant


### Context

In [114]:
# For the context, we split by space and explode
date_context = data_days[["date", "context", "mindState"]].copy()
date_context["context"] = date_context["context"].str.split(" ")

date_context = date_context.explode("context")

# Strip leading and trailing whitespaces and capitalize
date_context["context"] = date_context["context"].str.strip().str.capitalize()

date_context.head(10)

Unnamed: 0,date,context,mindState
0,2024-12-23,Nephew,Pleasant
0,2024-12-23,Birthday,Pleasant
0,2024-12-23,Party,Pleasant
1,2024-12-21,Scout's,Pleasant
1,2024-12-21,Christmas,Pleasant
1,2024-12-21,Event,Pleasant
2,2024-12-18,Two,Slightly pleasant
2,2024-12-18,Tests,Slightly pleasant
2,2024-12-18,Today,Slightly pleasant
2,2024-12-18,One,Slightly pleasant


In [115]:
# TODO: UPDATE WORDS
update_words = {
    # Those with empty string are removed
    "a": "",
    "the": "",
    "and": "",
    "of": "",
    "to": "",
    "was": "",
    "in": "",
    "on": "",
    "with": "",
    "for": "",
    "my": "",
    "at": "",
    "from": "",
    "i": "",
    "by": "",
    "it": "",
    "is": "",
    "that": "",
}

date_context["context"] = date_context["context"].replace(update_words)
# Remove empty strings
date_context = date_context[date_context["context"] != ""]

# Remove `'s` from words
date_context["context"] = date_context["context"].str.replace("'s", "")

date_context.head()

Unnamed: 0,date,context,mindState
0,2024-12-23,Nephew,Pleasant
0,2024-12-23,Birthday,Pleasant
0,2024-12-23,Party,Pleasant
1,2024-12-21,Scout,Pleasant
1,2024-12-21,Christmas,Pleasant


### Combine

In [116]:
# Combine all dataframes into one by concatenating them
date_word = pd.concat(
    [
        date_associations.rename(columns={"associations": "word"}),
        date_descriptions.rename(columns={"descriptions": "word"}),
        date_context.rename(columns={"context": "word"}),
    ]
).reset_index(drop=True)

# Set all words capital
date_word["word"] = date_word["word"].str.capitalize()

date_word.head()

Unnamed: 0,date,word,mindState
0,2024-12-23,Family,Pleasant
1,2024-12-23,Friends,Pleasant
2,2024-12-21,Family,Pleasant
3,2024-12-21,Friends,Pleasant
4,2024-12-18,Education,Slightly pleasant


### Date, Time and Mind State

In [117]:
time_slept.head()

Unnamed: 0,date,sleep
0,2024-12-23,7:46
1,2024-12-21,4:43
2,2024-12-18,7:30


In [118]:
# Sleep time is in format HH:MM
# Transform to hours only
time_slept["sleep"] = (
    time_slept["sleep"]
    .str.split(":")
    .apply(lambda x: int(x[0]) + int(x[1]) / 60)
    .round(2)
)

time_slept.head()

Unnamed: 0,date,sleep
0,2024-12-23,7.77
1,2024-12-21,4.72
2,2024-12-18,7.5


In [119]:
# Merge time slept with data_days to create a single dataframe
# With columns date, timeSlept, mindState
date_mind_sleep = pd.merge(
    data_days[["date", "mindState"]], time_slept, on="date", how="left"
)

# Change mindState to numerical values
date_mind_sleep["mindState"] = date_mind_sleep["mindState"].map(
    INVERSE_MIND_STATE_MAP
)

date_mind_sleep.head()

Unnamed: 0,date,mindState,sleep
0,2024-12-23,2,7.77
1,2024-12-21,2,4.72
2,2024-12-18,1,7.5


## Network Table

### Nodes

In [120]:
nodes = date_word.drop("mindState", axis=1)
print("Shape of nodes:", nodes.shape)
nodes.head()

Shape of nodes: (29, 2)


Unnamed: 0,date,word
0,2024-12-23,Family
1,2024-12-23,Friends
2,2024-12-21,Family
3,2024-12-21,Friends
4,2024-12-18,Education


## Export data

In [121]:
# Export data as CSV
nodes.to_csv("public/nodes.csv", index=False)

date_mind_sleep.to_csv("public/day_stats.csv", index=False)