### Imports ###

In [None]:
import pandas as pd

### Read data ###

In [None]:
# Load only the four columns used by the analysis and convert the 'time'
# column to datetime values in the same expression.
df_phone = pd.read_csv(
    'dataset_mood_smartphone.csv',
    usecols=['id', 'time', 'variable', 'value'],
).assign(time=lambda frame: pd.to_datetime(frame['time']))
print(df_phone.dtypes)

### Remove rows with NaN values ###

In [None]:
# Select every row that has a missing value in at least one column and
# print a summary of that subset.
rows_with_nan = df_phone.isna().any(axis='columns')
nan_stuff = df_phone.loc[rows_with_nan]
nan_stuff.info()

In [None]:
# Report the size of the frame before cleaning and the size of the NaN subset.
print(f"The length of the entire dataframe:\t{len(df_phone)}\n")
print(f"The number of rows with NaN values:\t{len(nan_stuff)}\n")

# Keep only the rows whose 'value' entry is present.
df_phone = df_phone.dropna(subset=['value'])

print(f"The length of the cleaned dataframe:\t{len(df_phone)}")

### Get some statistics ###

In [None]:
# Print a concise summary of the cleaned frame (columns, dtypes, non-null counts).
df_phone.info()

In [None]:
# Display the first 20 rows of the cleaned frame as a quick sanity check.
df_phone.head(20)

In [None]:
# Summarise every column (numeric, datetime and object alike).
# The `datetime_is_numeric` keyword only exists in pandas 1.1-1.5; it was
# removed in pandas 2.0, where datetime columns are always summarised
# numerically and passing the keyword raises a TypeError. Try the old
# spelling first so older installs keep their behaviour, then fall back.
try:
    describe_all = df_phone.describe(datetime_is_numeric=True, include='all')
except TypeError:
    describe_all = df_phone.describe(include='all')

# Leave the result as the last expression so the notebook renders it.
describe_all

### Save cleaned data ###

In [None]:
# Write the cleaned frame to a new CSV file next to the original dataset.
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is written as an extra leading column — confirm that downstream
# readers expect this.
df_phone.to_csv('dataset_mood_smartphone_clean.csv')

### Analyze specific variables ###

<ul>
    <li> Mood is usually around 7, i.e. mostly in the range [6, 8].
    <li> Arousal pretty evenly distributed, centered close to 0, mostly [-1, 1]
    <li> Valence is a little higher than arousal, so it tends to be positive, but its std is lower than that of arousal.
    <li> Activity score tends to be low.
    <li> People use their phone mostly for communication, entertainment, office, social and other.
    <li> Screen time and use of communication, games, office, social, entertainment and built-in apps have a high std and some outliers that score far higher than the majority.
    <li> The highest std and most extreme outliers seem to occur in the use of office apps.
</ul>    

In [None]:
# Print descriptive statistics separately for each distinct entry of the
# 'variable' column, in order of first appearance.
variables = list(df_phone['variable'].unique())
for variable in variables:
    is_current = df_phone['variable'] == variable
    df_phone_variable = df_phone.loc[is_current]
    print(f"Summary for {variable}:\n\n{df_phone_variable.describe()}\n\n\n")