### Imports ###

In [3]:
import itertools
import pandas as pd

### Read data ###

In [4]:
# Load the id/time/variable/value columns and convert 'time' to datetime in one chain.
df_phone = (
    pd.read_csv('dataset_mood_smartphone.csv', usecols=['id', 'time', 'variable', 'value'])
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
)
print(df_phone.dtypes)

id                  object
time        datetime64[ns]
variable            object
value              float64
dtype: object


In [10]:
# Peek at 50 rows of the raw CSV (positions 10000-10049), with every column included.
pd.read_csv('dataset_mood_smartphone.csv').iloc[10000:10050]


Unnamed: 0.1,Unnamed: 0,id,time,variable,value
10000,10001,AS14.27,2014-04-05 20:00:00.000,circumplex.arousal,-2.0
10001,10002,AS14.27,2014-04-06 09:00:00.000,circumplex.arousal,-1.0
10002,10003,AS14.27,2014-04-06 12:00:00.000,circumplex.arousal,-1.0
10003,10004,AS14.27,2014-04-06 14:00:00.000,circumplex.arousal,0.0
10004,10005,AS14.27,2014-04-06 18:00:00.000,circumplex.arousal,-1.0
10005,10006,AS14.27,2014-04-06 20:00:00.000,circumplex.arousal,-2.0
10006,10007,AS14.27,2014-04-07 09:00:00.000,circumplex.arousal,1.0
10007,10008,AS14.27,2014-04-07 12:00:00.000,circumplex.arousal,1.0
10008,10009,AS14.27,2014-04-07 15:00:00.000,circumplex.arousal,1.0
10009,10010,AS14.27,2014-04-07 18:00:00.000,circumplex.arousal,1.0


### Remove rows with NaN values ###

In [None]:
# Collect every row that has a missing entry in at least one column, then summarise it.
row_has_missing = df_phone.isna().any(axis='columns')
nan_stuff = df_phone.loc[row_has_missing]
nan_stuff.info()

In [None]:
# Report the size of the frame and the number of incomplete rows, then drop those rows.
print("The length of the entire dataframe:\t{0}\n".format(len(df_phone)))
print("The number of rows with NaN values:\t{0}\n".format(len(nan_stuff)))

# nan_stuff holds the rows with a NaN in *any* column, so remove rows on that same
# criterion. Filtering on 'value' alone would keep rows whose id/time/variable is
# missing and make the three printed counts disagree with each other.
# (The counts add up provided nan_stuff was computed from the current df_phone.)
df_phone = df_phone.dropna()

print("The length of the cleaned dataframe:\t{0}".format(len(df_phone)))

### Get some statistics ###

In [None]:
# Print a concise summary of df_phone: columns, dtypes, non-null counts and memory usage.
df_phone.info()

In [None]:
# Display the first 20 rows of df_phone as the cell's rich output.
df_phone.head(n=20)

In [None]:
# Summary statistics for every column, with the datetime column summarised numerically
# (min/mean/max) rather than as a categorical.
try:
    # pandas 1.1-1.5: datetimes are only treated as numeric when asked explicitly.
    phone_summary = df_phone.describe(datetime_is_numeric=True, include='all')
except TypeError:
    # pandas >= 2.0 removed the `datetime_is_numeric` keyword; datetimes are always
    # summarised numerically there, so the plain call gives the same kind of table.
    phone_summary = df_phone.describe(include='all')
phone_summary

### Save cleaned data ###

In [None]:
# Persist df_phone to disk. With to_csv's default settings the row index is written too,
# as the first (unnamed) column of the file.
clean_csv_path = 'dataset_mood_smartphone_clean.csv'
df_phone.to_csv(clean_csv_path)

### Analyze specific variables ###

<ul>
    <li> Mood is usually around 7, i.e. mostly in the range [6, 8].
    <li> Arousal is fairly evenly distributed, centered close to 0, mostly in [-1, 1].
    <li> Valence is a little higher than arousal, so it tends to be positive, but its std is lower than that of arousal.
    <li> Activity score tends to be low.
    <li> People use their phone mostly for communication, entertainment, office, social and other.
    <li> Screen time and use of communication, games, office, social, entertainment and built-in apps have a high std and some outliers that score far higher than the majority.
    <li> The highest std and most extreme outliers seem to occur in the use of office apps.
</ul>    

In [None]:
# Print one describe() table per distinct entry of the 'variable' column,
# in order of first appearance.
variables = df_phone['variable'].unique().tolist()
summary_template = "Summary for {0}:\n\n{1}\n\n\n"
for variable in variables:
    is_current_variable = df_phone['variable'].eq(variable)
    df_phone_variable = df_phone.loc[is_current_variable]
    print(summary_template.format(variable, df_phone_variable.describe()))

### Split data ###

There are 27 users and we will use a 70-10-20 split, i.e., 19 users for training, 3 for validation and 5 for testing.
We will also split the data we have for each user into sequences of 5 days.

In [None]:
# User-level split, taken in order of first appearance in df_phone:
# the first 19 ids train, the next 3 validate, and the remainder tests
# (5 ids when there are 27 users in total).
users = df_phone['id'].unique()
train_end, val_end = 19, 22
train_users, val_users, test_users = users[:train_end], users[train_end:val_end], users[val_end:]

# Each frame keeps all rows belonging to the users assigned to that split.
df_train, df_val, df_test = (
    df_phone[df_phone['id'].isin(split_ids)]
    for split_ids in (train_users, val_users, test_users)
)