# Data Preparation

This file contains data preparation for the PV251 visualization project.

## Import modules

In [473]:
import pandas as pd

## Data

In [None]:
data_days = pd.read_csv("data/days.csv", sep=";")
time_slept = pd.read_csv("data/timeSlept.csv", sep=";")


print(data_days["mindState"].value_counts(dropna=False))
data_days.head()

mindState
NaN                                    43
1                                      13
-1                                      7
2                                       5
0                                       5
3                                       1
Friends,family,CurrentEvents,travel     1
Name: count, dtype: int64


Unnamed: 0,date,mindState,associations,descriptions,context
0,23-12-2024,2,"Family,Friends","Peaceful,Happy,Joyful",Nephew birthday party
1,21-12-2024,2,"Family,Friends","Happy,Grateful",Scout's Christmas Event
2,18-12-2024,1,"Education,Friends","Calm,Content","Two tests today, one was tough, one easy, deli..."
3,17-12-2024,1,"Education,Friends",Calm,"Ok day, preparation for statistics"
4,16-12-2024,1,"Friends,Current Events,Travel","Calm,Happy","Travelling day, did something to school, chill..."


In [475]:
print("Nan values in data_days:")
print(data_days.isnull().sum())

Nan values in data_days:
date             0
mindState       43
associations    43
descriptions    43
context         46
dtype: int64


### Drop NaNs

In [476]:
# Drop rows with NaN values
data_days = data_days.dropna()
data_days = data_days.reset_index(drop=True)

print("Nan values in data_days after dropping:")
print(data_days.isnull().sum())

Nan values in data_days after dropping:
date            0
mindState       0
associations    0
descriptions    0
context         0
dtype: int64


In [477]:
print("Max and min dates in data_days:")
print(data_days["date"].max())
print(data_days["date"].min())

Max and min dates in data_days:
30-11-2024
01-12-2024


## Data Preparation

In [478]:
# Update date format to YYYY-MM-DD (format now is DD-MM-YYYY)
data_days["date"] = pd.to_datetime(
    data_days["date"], format="%d-%m-%Y"
).dt.strftime("%Y-%m-%d")

data_days["mindState"] = data_days["mindState"].astype(int)

time_slept["date"] = pd.to_datetime(
    time_slept["date"], format="%d-%m-%Y"
).dt.strftime("%Y-%m-%d")

data_days["date"].head()

0    2024-12-23
1    2024-12-21
2    2024-12-18
3    2024-12-17
4    2024-12-16
Name: date, dtype: object

In [479]:
# Remove semicolons from context
data_days["context"] = data_days["context"].str.replace(";", "")

In [480]:
# Map `mindState` column to string values
MIND_STATE_MAP = {
    -3: "Very unpleasant",
    -2: "Unpleasant",
    -1: "Slightly unpleasant",
    0: "Neutral",
    1: "Slightly pleasant",
    2: "Pleasant",
    3: "Very pleasant",
}

INVERSE_MIND_STATE_MAP = {v: k for k, v in MIND_STATE_MAP.items()}

data_days["mindState"] = data_days["mindState"].map(MIND_STATE_MAP)
data_days.head()

Unnamed: 0,date,mindState,associations,descriptions,context
0,2024-12-23,Pleasant,"Family,Friends","Peaceful,Happy,Joyful",Nephew birthday party
1,2024-12-21,Pleasant,"Family,Friends","Happy,Grateful",Scout's Christmas Event
2,2024-12-18,Slightly pleasant,"Education,Friends","Calm,Content","Two tests today, one was tough, one easy, deli..."
3,2024-12-17,Slightly pleasant,"Education,Friends",Calm,"Ok day, preparation for statistics"
4,2024-12-16,Slightly pleasant,"Friends,Current Events,Travel","Calm,Happy","Travelling day, did something to school, chill..."


### Associations

In [481]:
# Associations are in format: `association1;association2;...`
# Extract all associations into separate rows
date_associations = data_days[["date", "associations", "mindState"]].copy()
date_associations["associations"] = date_associations[
    "associations"
].str.split(",")

date_associations = date_associations.explode("associations")

# Add space in CurrentEvents
date_associations["associations"] = date_associations[
    "associations"
].str.replace("CurrentEvents", "Current events")

# Add type column
date_associations["type"] = "association"

date_associations.head()

Unnamed: 0,date,associations,mindState,type
0,2024-12-23,Family,Pleasant,association
0,2024-12-23,Friends,Pleasant,association
1,2024-12-21,Family,Pleasant,association
1,2024-12-21,Friends,Pleasant,association
2,2024-12-18,Education,Slightly pleasant,association


### Descriptions

In [482]:
# Do the same with `descriptions` column
date_descriptions = data_days[["date", "descriptions", "mindState"]].copy()
date_descriptions["descriptions"] = date_descriptions[
    "descriptions"
].str.split(",")

date_descriptions = date_descriptions.explode("descriptions")

# Add type column
date_descriptions["type"] = "description"

date_descriptions.head(10)

Unnamed: 0,date,descriptions,mindState,type
0,2024-12-23,Peaceful,Pleasant,description
0,2024-12-23,Happy,Pleasant,description
0,2024-12-23,Joyful,Pleasant,description
1,2024-12-21,Happy,Pleasant,description
1,2024-12-21,Grateful,Pleasant,description
2,2024-12-18,Calm,Slightly pleasant,description
2,2024-12-18,Content,Slightly pleasant,description
3,2024-12-17,Calm,Slightly pleasant,description
4,2024-12-16,Calm,Slightly pleasant,description
4,2024-12-16,Happy,Slightly pleasant,description


### Context

In [483]:
# For the context, we split by space and explode
date_context = data_days[["date", "context", "mindState"]].copy()
date_context["context"] = date_context["context"].str.split(" ")

date_context = date_context.explode("context")

# Strip leading and trailing whitespaces and capitalize
date_context["context"] = date_context["context"].str.strip().str.capitalize()

# Add type column
date_context["type"] = "context"

date_context.head(10)

Unnamed: 0,date,context,mindState,type
0,2024-12-23,Nephew,Pleasant,context
0,2024-12-23,Birthday,Pleasant,context
0,2024-12-23,Party,Pleasant,context
1,2024-12-21,Scout's,Pleasant,context
1,2024-12-21,Christmas,Pleasant,context
1,2024-12-21,Event,Pleasant,context
2,2024-12-18,Two,Slightly pleasant,context
2,2024-12-18,Tests,Slightly pleasant,context
2,2024-12-18,"Today,",Slightly pleasant,context
2,2024-12-18,One,Slightly pleasant,context


In [484]:
# TODO: UPDATE WORDS
update_words = {
    # Those with empty string are removed
    "a": "",
    "the": "",
    "and": "",
    "of": "",
    "to": "",
    "was": "",
    "in": "",
    "on": "",
    "with": "",
    "for": "",
    "my": "",
    "at": "",
    "from": "",
    "i": "",
    "by": "",
    "it": "",
    "is": "",
    "that": "",
}
# Add boundaries to the words
update_words = {rf"\b{k}\b": v for k, v in update_words.items()}

date_context["context"] = (
    date_context["context"]
    .str.lower()
    .replace(update_words, regex=True)
    .str.capitalize()
)

# Remove empty strings
date_context = date_context[date_context["context"] != ""]

# Remove `'s` from words
date_context["context"] = date_context["context"].str.replace("'s", "")
# "," after words
date_context["context"] = date_context["context"].str.replace(",", "")
# "." after words
date_context["context"] = date_context["context"].str.replace(".", "")

date_context.head()

Unnamed: 0,date,context,mindState,type
0,2024-12-23,Nephew,Pleasant,context
0,2024-12-23,Birthday,Pleasant,context
0,2024-12-23,Party,Pleasant,context
1,2024-12-21,Scout,Pleasant,context
1,2024-12-21,Christmas,Pleasant,context


### Combine

In [485]:
# Combine all dataframes into one by concatenating them
date_word = pd.concat(
    [
        date_associations.rename(columns={"associations": "word"}),
        date_descriptions.rename(columns={"descriptions": "word"}),
        date_context.rename(columns={"context": "word"}),
    ]
).reset_index(drop=True)

# Set all words capital
date_word["word"] = date_word["word"].str.capitalize()

date_word.head()

Unnamed: 0,date,word,mindState,type
0,2024-12-23,Family,Pleasant,association
1,2024-12-23,Friends,Pleasant,association
2,2024-12-21,Family,Pleasant,association
3,2024-12-21,Friends,Pleasant,association
4,2024-12-18,Education,Slightly pleasant,association


### Date, Time and Mind State

In [486]:
time_slept.head()

Unnamed: 0,date,sleep
0,2024-12-23,7:46
1,2024-12-21,4:43
2,2024-12-18,7:30
3,2024-12-17,8:37
4,2024-12-16,8:09


In [487]:
time_slept["sleep"] = time_slept["sleep"].fillna("1:00")

# Sleep time is in format HH:MM
# Transform to hours only
time_slept["sleep"] = (
    time_slept["sleep"]
    .str.split(":")
    .apply(lambda x: int(x[0]) + int(x[1]) / 60)
    .round(2)
)

time_slept.head()

Unnamed: 0,date,sleep
0,2024-12-23,7.77
1,2024-12-21,4.72
2,2024-12-18,7.5
3,2024-12-17,8.62
4,2024-12-16,8.15


In [488]:
# Merge time slept with data_days to create a single dataframe
# With columns date, timeSlept, mindState
date_mind_sleep = pd.merge(
    data_days[["date", "mindState"]], time_slept, on="date", how="left"
)

# Change mindState to numerical values
date_mind_sleep["mindState"] = date_mind_sleep["mindState"].map(
    INVERSE_MIND_STATE_MAP
)

date_mind_sleep.head()

Unnamed: 0,date,mindState,sleep
0,2024-12-23,2,7.77
1,2024-12-21,2,4.72
2,2024-12-18,1,7.5
3,2024-12-17,1,8.62
4,2024-12-16,1,8.15


## Network Table

### Nodes

In [489]:
nodes = date_word.drop("mindState", axis=1)
print("Shape of nodes:", nodes.shape)
nodes.head()

Shape of nodes: (311, 3)


Unnamed: 0,date,word,type
0,2024-12-23,Family,association
1,2024-12-23,Friends,association
2,2024-12-21,Family,association
3,2024-12-21,Friends,association
4,2024-12-18,Education,association


## Export data

In [490]:
print("Nodes shape:", nodes.shape)
print("Day statistics shape:", date_mind_sleep.shape)

Nodes shape: (311, 3)
Day statistics shape: (29, 3)


In [491]:
# Export data as CSV
nodes.to_csv("public/nodes.csv", index=False)

date_mind_sleep.to_csv("public/day_stats.csv", index=False)