### Imports ###

In [None]:
import pandas as pd
import numpy as np

### Read data ###

In [None]:
# Load only the four columns used in this notebook and convert the 'time'
# column to datetimes so that the .dt accessor is available later on.
df_phone = pd.read_csv(
    'data/dataset_mood_smartphone.csv',
    usecols=['id', 'time', 'variable', 'value'],
).assign(time=lambda frame: pd.to_datetime(frame['time']))
print(df_phone.dtypes)

### Remove rows with NaN values ###

In [None]:
# Collect every row in which at least one column is missing.
has_missing = df_phone.isna().any(axis=1)
nan_stuff = df_phone.loc[has_missing]
nan_stuff.info()

In [None]:
print(f"The length of the entire dataframe:\t{len(df_phone)}\n")
print(f"The number of rows with NaN values:\t{len(nan_stuff)}\n")

# Only a missing 'value' removes a row; the other columns are not checked here.
df_phone = df_phone.dropna(subset=['value'])

print(f"The length of the cleaned dataframe:\t{len(df_phone)}")

### Take a look ###

In [None]:
# Print a concise summary of df_phone (columns, non-null counts, dtypes).
df_phone.info()

In [None]:
# Preview the first 20 rows.
df_phone.head(20)

In [None]:
# Preview the last 20 rows.
df_phone.tail(20)

### Analyze specific variables ###

<ul>
    <li> Mood is usually around 7, i.e., in the range [6, 8].
    <li> Arousal is fairly evenly distributed, centered close to 0, and mostly in [-1, 1].
    <li> Valence is a little higher than arousal, so it tends to be positive, but its std is lower than that of arousal.
    <li> Activity score tends to be low.
    <li> People use their phone mostly for communication, entertainment, office, social and other.
    <li> Screen time and the use of communication, games, office, social, entertainment and built-in apps have a high std and some outliers that score far higher than the majority.
    <li> The highest std and most extreme outliers seem to occur in the use of office apps.
</ul>    

In [None]:
# Print descriptive statistics separately for every variable in the data.
variables = list(df_phone['variable'].unique())
for variable in variables:
    is_variable = df_phone['variable'] == variable
    df_phone_variable = df_phone.loc[is_variable]
    summary = df_phone_variable.describe()
    print(f"Summary for {variable}:\n\n{summary}\n\n\n")

### Reformat the data ###

Each row will contain the data for one day for one user. Each variable will have its own column.

In [None]:
# Build one row per (user, day). Days without any mood record are skipped.
# Variables listed in mean_variables are averaged over the day, all other
# variables are summed, and variables absent on a given day are filled with 0.
users = df_phone['id'].unique()
all_variables = df_phone['variable'].unique()
mean_variables = ['mood', 'circumplex.arousal', 'circumplex.valence', 'activity']
data = []
for user in users:
    df_subset = df_phone.loc[df_phone['id'] == user]
    # Calendar day of every record of this user, computed once per user.
    day_of_row = df_subset['time'].dt.date
    dates = day_of_row.unique()
    for date in dates:
        df_subsubset = df_subset.loc[day_of_row == date]
        variables = df_subsubset['variable'].unique()
        if 'mood' not in variables:
            continue
        features = [user, date]
        for variable in all_variables:
            if variable not in variables:
                features.append(0)
                continue
            values = df_subsubset.loc[df_subsubset['variable'] == variable, 'value']
            if variable in mean_variables:
                features.append(values.mean())
            else:
                features.append(values.sum())
        data.append(features)

# Column order: identifiers first, then variables in order of first appearance.
column_names = ['id', 'date', *all_variables]

df_reformatted = pd.DataFrame(data, columns=column_names)

In [None]:
# Number of rows in the reformatted table, followed by per-column statistics.
print(len(df_reformatted))
df_reformatted.describe()

In [None]:
# Write the reformatted table to disk without the index column.
df_reformatted.to_csv('data/data_reformatted.csv', index=False)

### Split data ###

There are 27 users and we will use a 70-10-20 split, i.e., 19 users for training, 3 for validation and 5 for testing.
We will also split the data we have for each user into sequences of 5 days.

In [None]:
# Users are assigned to a split by their order of first appearance:
# the first 19 go to training, the next 3 to validation, the rest to testing.
users = df_reformatted['id'].unique()
train_end, val_end = 19, 22
train_users = users[:train_end]
val_users = users[train_end:val_end]
test_users = users[val_end:]

df_train, df_val, df_test = (
    df_reformatted.loc[df_reformatted['id'].isin(group)]
    for group in (train_users, val_users, test_users)
)

In [None]:
# Write each split to its own CSV file, without the index column.
split_outputs = {
    'data/dataset_train.csv': df_train,
    'data/dataset_val.csv': df_val,
    'data/dataset_test.csv': df_test,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_csv(split_path, index=False)

### Aggregate data for SVM ###

In [None]:
def get_x(df):
    """Return the column-wise mean of ``df`` as a float array.

    All columns are converted to float first, so every column must hold
    values that can be converted to float.
    """
    return np.mean(df.to_numpy(dtype=float), axis=0)

def get_y(df):
    """Return the ``'mood'`` entry of ``df`` rounded with the built-in ``round``.

    ``df`` only needs to support ``df['mood']``; ``get_x_y`` passes a single
    row here.
    """
    return round(df['mood'])

def get_x_y(df, window_size):
    """Build sliding-window samples from the rows of ``df``.

    Each sample pairs the features aggregated over ``window_size`` consecutive
    rows (computed by ``get_x``) with the target of the row that immediately
    follows that window (computed by ``get_y``).

    Args:
        df: DataFrame holding the rows of a single user. The rows are assumed
            to be in chronological order; this is not checked here.
        window_size: Number of consecutive rows aggregated into one sample.

    Returns:
        A tuple ``(xs, ys)`` of two equally long lists. Both are empty when
        ``df`` has no row left after the first window.
    """
    xs = []
    ys = []
    # A positional slice start:stop excludes row `stop`, so row `stop` is the
    # first row after the window and is the one to predict. Taking the target
    # from `stop + 1` instead would skip a row and drop one sample per user.
    for start in range(len(df) - window_size):
        stop = start + window_size
        xs.append(get_x(df.iloc[start:stop]))
        ys.append(get_y(df.iloc[stop]))
    return xs, ys

def aggregate_df(df, window_size, filename):
    """Create windowed samples per user and write them to two CSV files.

    For every distinct ``id`` in ``df`` the ``id`` and ``date`` columns are
    dropped and the remaining columns are passed to ``get_x_y``. The samples
    of all users are concatenated and written, without header or index, to
    ``data/x<filename>`` (features) and ``data/y<filename>`` (targets).
    """
    features, targets = [], []
    for user in df['id'].unique():
        user_rows = df.loc[df['id'] == user].drop(columns=['id', 'date'])
        user_features, user_targets = get_x_y(user_rows, window_size)
        features += user_features
        targets += user_targets
    pd.DataFrame(features).to_csv('data/x{0}'.format(filename), header=False, index=False)
    pd.DataFrame(targets).to_csv('data/y{0}'.format(filename), header=False, index=False)

In [None]:
# Write feature/target files for every split and every window size (2, 3, 5).
for split_frame, split_name in ((df_train, 'train'), (df_val, 'val'), (df_test, 'test')):
    for window_size in (2, 3, 5):
        aggregate_df(split_frame, window_size, '_{0}_{1}.csv'.format(split_name, window_size))