# Exploratory Data Analysis (original dataset)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw mood-tracker exports. The second path is used in place of the earlier
# '../data/raw/10ago2024_23jun2025.csv' export.
first_export = '../data/raw/1ene2024_25jun2024.csv'
second_export = '../data/raw/10ago2024_20dic2025.csv'

df1 = pd.read_csv(first_export)
df2 = pd.read_csv(second_export)

In [3]:
# Preview the first rows of df1 (the first raw export).
df1.head()

Unnamed: 0,full_date,date,weekday,time,mood,activities,note_title,note
0,2024-06-25,June 25,Tuesday,10:18 AM,Bien,Feliz | Relajado | Contento | Cansado | Ocupad...,,
1,2024-06-24,June 24,Monday,8:00 PM,Bien,Contento | Ocupado | Confundido | Buen sueño |...,,
2,2024-06-23,June 23,Sunday,8:00 PM,Bien,Feliz | Relajado | Contento | Motivado | Produ...,,
3,2024-06-22,June 22,Saturday,8:00 PM,Bien,Feliz | Relajado | Contento | Cansado | Motiva...,,
4,2024-06-21,June 21,Friday,8:00 PM,Bien,Relajado | Contento | Ansioso | Decepcionado |...,,


In [4]:
# Preview the first rows of df2 (the second raw export).
df2.head()

Unnamed: 0,full_date,date,weekday,time,mood,activities,note_title,note
0,2025-12-20,20 Dec,Saturday,10:45 p.m.,Bien,Buen sueño | Relajado | Contento | Aburrido | ...,,
1,2025-12-19,19 Dec,Friday,10:45 p.m.,Bien,Buen sueño | Contento | Procrastinar | Ejercic...,,
2,2025-12-18,18 Dec,Thursday,5:06 p.m.,Bien,Buen sueño | Relajado | Contento | Ejercicio |...,,
3,2025-12-17,17 Dec,Wednesday,4:59 p.m.,Bien,Buen sueño | Feliz | Relajado | Contento | Abu...,,
4,2025-12-16,16 Dec,Tuesday,11:42 p.m.,Meh,Buen sueño | Feliz | Relajado | Contento | Eno...,,


Before starting the exploratory data analysis (EDA), the two DataFrames are preprocessed because their formats are inconsistent, particularly in the `date` and `time` columns. The `date` column is dropped (the same information is available in `full_date`), and the `note_title` and `note` columns are dropped because they are not needed.

In [5]:
# Remove the columns that are not used in the analysis from both exports.
unused_columns = ['date', 'note_title', 'note']

df1 = df1.drop(columns=unused_columns)
df2 = df2.drop(columns=unused_columns)

In [6]:
# Convert the 'time' column in df2 to AM - PM format
# (e.g. "10:45 p.m." -> "10:45 PM", the style already used by df1).
time_am_pm = df2['time'].str.replace(r'\.m\.', 'M', regex=True)

# Upper-case the remaining meridiem letter, first 'a' and then 'p'.
for lower, upper in ((r'a', 'A'), (r'p', 'P')):
    time_am_pm = time_am_pm.str.replace(lower, upper, regex=True)

df2['time'] = time_am_pm

In [7]:
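# Optional filter (currently disabled): keep only the 2024 entries of df2.
# The cells below use the full df2, so this line stays commented out.
# df2_filtered = df2[df2['full_date'] <= '2024-12-31']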

In [7]:
# Stack both exports, order the entries by 'full_date' (ascending) and
# rebuild a clean 0..n-1 index.
df = (
    pd.concat([df2, df1])
    .sort_values(by='full_date', ascending=True)
    .reset_index(drop=True)
)

In [8]:
# First rows of the combined frame, after sorting by full_date.
df.head()

Unnamed: 0,full_date,weekday,time,mood,activities
0,2024-01-01,Monday,1:56 PM,Meh,Relajado | Aburrido | Sueño medio
1,2024-01-02,Tuesday,8:17 PM,Bien,Feliz | Relajado | Contento | Sueño medio | Ej...
2,2024-01-03,Wednesday,7:26 PM,Bien,Feliz | Contento | Buen sueño | Ejercicio | Ag...
3,2024-01-04,Thursday,9:36 PM,Bien,Feliz | Relajado | Buen sueño | Ejercicio | Co...
4,2024-01-05,Friday,7:14 PM,Bien,Cansado | Aburrido | Ansioso | Sueño medio | E...


In [9]:
# Last rows of the combined frame.
df.tail()

Unnamed: 0,full_date,weekday,time,mood,activities
540,2025-12-16,Tuesday,11:42 PM,Meh,Buen sueño | Feliz | Relajado | Contento | Eno...
541,2025-12-17,Wednesday,4:59 PM,Bien,Buen sueño | Feliz | Relajado | Contento | Abu...
542,2025-12-18,Thursday,5:06 PM,Bien,Buen sueño | Relajado | Contento | Ejercicio |...
543,2025-12-19,Friday,10:45 PM,Bien,Buen sueño | Contento | Procrastinar | Ejercic...
544,2025-12-20,Saturday,10:45 PM,Bien,Buen sueño | Relajado | Contento | Aburrido | ...


In [10]:
# Column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   full_date   545 non-null    object
 1   weekday     545 non-null    object
 2   time        545 non-null    object
 3   mood        545 non-null    object
 4   activities  545 non-null    object
dtypes: object(5)
memory usage: 21.4+ KB


In [11]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df.describe(include=['O'])

Unnamed: 0,full_date,weekday,time,mood,activities
count,545,545,545,545,545
unique,545,7,256,5,537
top,2024-01-01,Saturday,10:45 PM,Bien,Buen sueño | Relajado | Contento
freq,1,82,138,401,4


# Preprocessing

In [12]:
# Convert mood to number: each label gets its position in this list (0-4).
mood_labels = ['Horrible', 'Mal', 'Meh', 'Bien', 'Increíble']
mood_map = {label: score for score, label in enumerate(mood_labels)}

df['mood_numeric'] = df['mood'].map(mood_map)

In [13]:
# date
# Values that cannot be parsed become NaT (errors='coerce') instead of raising.
df['full_date'] = pd.to_datetime(df['full_date'], errors='coerce')

df['month'] = df['full_date'].dt.month
df['year'] = df['full_date'].dt.year

# weekday to number (Monday = 0 ... Sunday = 6)
df['weekday_numeric'] = df['full_date'].dt.dayofweek

# time
# Replace any run of non-ASCII characters with a plain space, then parse the
# column a single time and reuse the result for both the hour and the minute.
time_ascii = df['time'].astype(str).str.replace(r'[^\x00-\x7F]+', ' ', regex=True)
parsed_time = pd.to_datetime(time_ascii, errors='coerce', format='mixed')

# Time of day as decimal hours (e.g. 1:56 PM -> 13.93), rounded to 2 decimals.
df['time_decimal'] = (parsed_time.dt.hour + parsed_time.dt.minute / 60).round(2)

In [14]:
# Convert activities to a list
# The ' | ' separator is matched literally (regex=False), so the pipe needs no
# escaping and no backslash sequence appears in the string literal.
df['activities'] = df['activities'].str.split(' | ', regex=False)
print(type(df['activities'].iloc[0]))

<class 'list'>


In [15]:
# Flatten the per-entry lists into one Series and collect the distinct labels.
all_activities = df['activities'].explode().dropna()
unique_activities = all_activities.unique()

print(f"Total de actividades únicas: {len(unique_activities)}")
print(unique_activities)

Total de actividades únicas: 66
['Relajado' 'Aburrido' 'Sueño medio' 'Feliz' 'Contento' 'Ejercicio' 'Agua'
 'Estudio' 'Buen sueño' 'Iniciar temprano' 'Enfocado' 'Inglés'
 'Algoritmos' 'IA & Data' 'Clase' 'Comer sano' 'Cansado' 'Ansioso'
 'Lectura' 'Compras' 'Estresado' 'Hacer deberes' 'Inseguro' 'Limpiar'
 'Desesperado' 'Emocionado' 'Triste' 'Sueño malo' 'Enojado' 'Decepcionado'
 'Cocinar' 'Clases completas' 'Tarea' 'Proyecto grupal' 'Viaje'
 'Vacaciones' 'Salida' 'Enamorado' 'Visita' 'Skin care' 'Protector solar'
 'Miedo' 'Motivado' 'Examen' 'Productivo' 'Enfermo' 'Ocupado'
 'Competencia' 'Sin dormir' 'Desmotivado' 'Procrastinar' 'Haircut'
 'Sueño temprano' 'Correr / Cardio' 'Confundido' 'Reunión' 'Evento'
 'Doctor' 'Proyecto' 'Pensativo' 'Preocupado' 'Entretenimiento'
 'Nostálgico' 'Bicicleta' 'Paz' 'Conversación']


In [16]:
# Creation of column sleep_level
# Activity labels that describe how the night's sleep went.
sleep_categories = {
    'Sin dormir', 'Sueño malo', 'Sueño medio', 'Buen sueño', 'Sueño temprano',
}


def get_sleep_level(activities):
    """Return the first sleep label found in ``activities``.

    Falls back to 'Sueño medio' when the entry contains no sleep label.
    """
    return next(
        (label for label in activities if label in sleep_categories),
        'Sueño medio',
    )

# Label every entry with its sleep level.
df['sleep_level'] = df['activities'].map(get_sleep_level)

# sleep_level to ordinal: each label gets its position in this list (0-4).
sleep_order = ['Sin dormir', 'Sueño malo', 'Sueño medio', 'Buen sueño', 'Sueño temprano']
sleep_mapping = dict(zip(sleep_order, range(len(sleep_order))))

df['sleep_level_numeric'] = df['sleep_level'].map(sleep_mapping)

In [17]:
# Number of activities per record (length of each activities list)
df['activities_count'] = df['activities'].map(len)

Because the individual influence of each activity needs to be analyzed, several conversion methods were considered. Multi-Hot Encoding (one 0/1 column per activity) was chosen, as this method prevents information loss.

In addition, new features were created that count, for each record, how many of its activities belong to a given category. The following categories were established:
- emotions
- health
- productivity
- places 
- tasks
- school

Furthermore, emotions were also divided into **positive, neutral, and negative** categories.

In [18]:
# Number of activities per category

# Emotions, split by polarity. The overall `emotions` list is derived from
# these three lists so the polarity split and the overall list cannot drift apart.
positive_emotions = [
    "Feliz", "Emocionado", "Enamorado", "Relajado", "Contento", "Productivo", "Motivado"
]

neutral_emotions = ["Ocupado", "Pensativo", "Cansado", "Nostálgico"]

negative_emotions = [
    "Confundido", "Inseguro", "Aburrido", "Ansioso", "Procrastinar", "Enojado", "Estresado", "Triste",
    "Desesperado", "Decepcionado", "Enfermo", "Desmotivado", "Miedo", "Preocupado"
]

# Categories
emotions = positive_emotions + neutral_emotions + negative_emotions

health = [
    "Ejercicio", "Comer sano", "Agua", "Correr / Cardio", "Doctor", "Haircut", "Skin care", "Protector solar", "Bicicleta"
]

productivity = [
    "Lectura", "Entretenimiento", "Iniciar temprano", "Enfocado", "Inglés", "IA & Data", "Algoritmos",
    "Reunión", "Proyecto"
]

places = ["Visita", "Salida", "Viaje", "Vacaciones", "Competencia", "Evento"]

tasks = ["Compras", "Limpiar", "Cocinar", "Hacer deberes"]

school = ["Clase", "Estudio", "Tarea", "Examen", "Proyecto grupal", "Clases completas"]

# Sets give constant-time membership tests while counting.
categories = {
    'emotions': set(emotions),
    'health': set(health),
    'productivity': set(productivity),
    'places': set(places),
    'tasks': set(tasks),
    'school': set(school),
    'positive_emotions': set(positive_emotions),
    'neutral_emotions': set(neutral_emotions),
    'negative_emotions': set(negative_emotions),
}


def count_in_category(activities, members):
    """Return how many entries of ``activities`` belong to the set ``members``."""
    return sum(1 for activity in activities if activity in members)


# One '<category>_count' column per category.
for cat_name, cat_set in categories.items():
    col_name = f"{cat_name}_count"

    df[col_name] = df['activities'].apply(count_in_category, args=(cat_set,))

# Sanity check: an activity that belongs to no category is left out of every
# *_count column, so report such activities instead of ignoring them silently.
# Sleep labels are excluded because they are handled through `sleep_level`.
# NOTE(review): the unique-activity listing earlier in this notebook includes
# 'Paz' and 'Conversación', which appear in none of the lists above - decide
# which category each belongs to.
observed_activities = set(df['activities'].explode().dropna())
categorized_activities = set().union(*categories.values()) | set(sleep_categories)
uncategorized_activities = sorted(observed_activities - categorized_activities)

if uncategorized_activities:
    print(f"Activities not assigned to any category: {uncategorized_activities}")

In [19]:
# Multi-Hot Encoding for each activity

# explode() yields one row per (entry, activity) and repeats the entry's index;
# taking max() per index value then folds the dummy rows back into a single
# 0/1 row per entry.
activities_encoded = (
    pd.get_dummies(df['activities'].explode(), dtype=int)
    .groupby(level=0, sort=False)
    .max()
)

df = df.join(activities_encoded)

In [20]:
# Quick look at the first two rows after feature engineering and encoding.
df.head(2)

Unnamed: 0,full_date,weekday,time,mood,activities,mood_numeric,month,year,weekday_numeric,time_decimal,...,Sin dormir,Skin care,Sueño malo,Sueño medio,Sueño temprano,Tarea,Triste,Vacaciones,Viaje,Visita
0,2024-01-01,Monday,1:56 PM,Meh,"[Relajado, Aburrido, Sueño medio]",2,1,2024,0,13.93,...,0,0,0,1,0,0,0,0,0,0
1,2024-01-02,Tuesday,8:17 PM,Bien,"[Feliz, Relajado, Contento, Sueño medio, Ejerc...",3,1,2024,1,20.28,...,0,0,0,1,0,0,0,0,0,0


In [21]:
# Dtypes and non-null counts after adding the engineered and encoded columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 88 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   full_date                545 non-null    datetime64[ns]
 1   weekday                  545 non-null    object        
 2   time                     545 non-null    object        
 3   mood                     545 non-null    object        
 4   activities               545 non-null    object        
 5   mood_numeric             545 non-null    int64         
 6   month                    545 non-null    int32         
 7   year                     545 non-null    int32         
 8   weekday_numeric          545 non-null    int32         
 9   time_decimal             545 non-null    float64       
 10  sleep_level              545 non-null    object        
 11  sleep_level_numeric      545 non-null    int64         
 12  activities_count         545 non-nul

In [22]:
# Export DataFrame
# NOTE(review): 'activities' holds Python lists at this point, so it is
# presumably written to the CSV as their text representation - confirm that
# downstream readers expect that.
processed_csv = '../data/processed/moods_2024_2025.csv'
df.to_csv(processed_csv, index=False)