# Data Cleaning

Cleans raw public data to produce the following two files:

1. '../data/crosscheck_daily_data_cleaned_w_sameday.csv'
2. '../data/studentlife_daily_data_cleaned_w_sameday_03192020.csv'

The original raw datasets will need to be downloaded from the following links to use this code:

* Download the "CrossCheck_Daily_Data.csv" from https://cornell.box.com/s/rkx46bgv36lkmo2eu349ka95senn48gh
* Download the raw StudentLife data from https://studentlife.cs.dartmouth.edu/dataset.html and unzip it

The code below will guide you to place the paths to each of the downloaded files in a variable.

Requires the following code files in the repo:

1. '../src/util.py'
2. '../src/cleaning_util.py'

In [1]:
import sys
import pandas as pd
from datetime import timedelta
import numpy as np

Import local code files

In [None]:
# Put the directory holding the local helper modules at the front of the import path.
# NOTE(review): the header of this notebook lists the helpers under '../src/', but
# '../code/' is added here — confirm which directory is correct.
sys.path.insert(0, '../code/')

In [None]:
import util
import cleaning_util

## CrossCheck Daily Data Feature Prep

In [None]:
# Download the "CrossCheck_Daily_Data.csv" from https://cornell.box.com/s/rkx46bgv36lkmo2eu349ka95senn48gh
# and fill in its local path in the variable below.
# The path is an empty placeholder, so pd.read_csv fails until it is filled in.
crosscheck_daily_data_path = ''
daily_data = pd.read_csv(crosscheck_daily_data_path)

In [None]:
# Prepping: parse the 'day' column (formatted as YYYYMMDD) into a datetime 'date' column
daily_data['date'] = pd.to_datetime(daily_data['day'], format='%Y%m%d')

In [None]:
# Candidate features: every column except the identifier and day/date columns
feature_cols = [
    col for col in daily_data.columns
    if col not in ('study_id', 'eureka_id', 'day', 'date')
]
# Split the candidates by whether the column name contains the substring 'ema'
ema_cols = [col for col in feature_cols if 'ema' in col]
behavior_cols = [col for col in feature_cols if 'ema' not in col]

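Average each behavioral feature over the EMA day and up to two preceding days (a 1-3 day window, depending on how many of those days have data). Same procedure as: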

[1] Rui Wang, Emily A. Scherer, Vincent W. S. Tseng, et al. 2016. CrossCheck: toward passive sensing and detection of mental health changes in people with schizophrenia. Proceedings of the 2016 ACM International Joint Conference on Pervasive and Ubiquitous Computing - UbiComp ’16, ACM Press, 886–897.

In [None]:
# Sort by participant and date so each participant's rows are in chronological
# order; the positional "skip the first EMA" step below depends on this. The
# index is reset so that row labels are unique and shared with crosscheck_df.
daily_data = daily_data.sort_values(['study_id', 'date']).reset_index(drop=True)
# Copy over the identifier and EMA columns
crosscheck_df = daily_data[['study_id', 'eureka_id', 'date'] + ema_cols].copy()
# Behavior columns start out empty and are only filled for the EMA rows
# processed in the loop below
for f in behavior_cols:
    crosscheck_df[f] = None
# Add a column to collect missing days of data
crosscheck_df['missing_days'] = 0

# Rows where every EMA column is populated. Computed once up front because it
# does not depend on which participant is being processed.
complete_ema = daily_data[ema_cols].notnull().all(axis=1)

# Go through each study ID, printing its position as a progress indicator
for curr, s in enumerate(daily_data.study_id.unique()):
    print(curr)
    # Select this participant's rows once instead of re-filtering the whole
    # frame for every EMA. Row labels are preserved by .loc, so `ind` still
    # addresses the matching row of crosscheck_df.
    subject_mask = daily_data.study_id == s
    subject_df = daily_data.loc[subject_mask, :]
    # Go through each EMA date, discarding the first EMA taken
    for ind in daily_data.loc[subject_mask & complete_ema, :].index[1:]:
        # Get date
        d = subject_df.loc[ind, 'date']
        # Window: the EMA day itself plus the two preceding calendar days
        start_date = d - timedelta(days=2)
        end_date = d
        filtered_df = subject_df.loc[
            (subject_df.date >= start_date) & (subject_df.date <= end_date), :
        ]
        # The window normally contains at least the EMA row itself; the guard
        # only matters if the date comparison matches nothing (e.g. a NaT date)
        if filtered_df.shape[0] > 0:
            # Per-column mean over the rows available in the window
            crosscheck_df.loc[ind, behavior_cols] = filtered_df[behavior_cols].mean().values
        # Record how many of the 3 window days had no row at all
        crosscheck_df.loc[ind, 'missing_days'] = 3 - filtered_df.shape[0]

In [None]:
# Drop all rows where there is no EMA data (any EMA column missing)
crosscheck_df_cleaned = crosscheck_df.dropna(subset=ema_cols)
# Keep a row if it has at least one behavioral value, or if its missing_days
# count is not below 3. What gets dropped are rows with no behavioral data and
# no missing data marked; these should be the first EMA of each participant.
crosscheck_df_cleaned = crosscheck_df_cleaned.loc[
    crosscheck_df_cleaned[behavior_cols].notnull().any(axis=1)
    | ~(crosscheck_df_cleaned.missing_days < 3),
    :
]

In [None]:
# Write the cleaned CrossCheck data to disk.
# NOTE(review): index=True also writes the DataFrame index as the first CSV column,
# whereas the StudentLife export at the end of this notebook uses index=False —
# confirm the difference is intended.
crosscheck_df_cleaned.to_csv('../data/crosscheck_daily_data_cleaned_w_sameday.csv', index=True)

## StudentLife Data

In [None]:
# Download the raw StudentLife data from: https://studentlife.cs.dartmouth.edu/dataset.html
# Unzip the file, and put the path to the unzipped directory in the variable below.
# The cells that follow append '/dataset/...' to this path.

studentlife_unzipped = ''

### EMA File Prep

#### Upload EMA data

In [None]:
# Load the Social EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_social_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/Social/',
    file_type='json'
)

In [None]:
# Load the Stress EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_stress_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/Stress/',
    file_type='json'
)

In [None]:
# Load the Sleep EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_sleep_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/Sleep/',
    file_type='json'
)

In [None]:
# Load the Behavior EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_behavior_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/Behavior/',
    file_type='json'
)

In [None]:
# Load the Mood EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_mood_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/Mood/',
    file_type='json'
)

In [None]:
# Load the PAM EMA responses from the unzipped StudentLife dataset.
# file_type='json' is passed to the helper — presumably it restricts loading to
# JSON files; see util.upload_directory to confirm.
ema_pam_files = util.upload_directory(
    studentlife_unzipped + '/dataset/EMA/response/PAM/',
    file_type='json'
)

#### Prep EMA data

In [None]:
# Make one DataFrame per EMA category by passing each set of loaded files
# through cleaning_util.prep_studentlife_df (mood, social, stress, sleep,
# behavior, PAM — processed in that order).
(
    ema_mood_df,
    ema_social_df,
    ema_stress_df,
    ema_sleep_df,
    ema_behavior_df,
    ema_pam_df,
) = [
    cleaning_util.prep_studentlife_df(raw_files)
    for raw_files in (
        ema_mood_files,
        ema_social_files,
        ema_stress_files,
        ema_sleep_files,
        ema_behavior_files,
        ema_pam_files,
    )
]

In [None]:
# Concatenate: hand the six per-category EMA DataFrames to
# cleaning_util.prep_ema_data to obtain a single EMA frame
# (how they are combined is defined in that helper).
studentlife_ema_df = cleaning_util.prep_ema_data(
    [ema_mood_df, ema_social_df, ema_stress_df, ema_sleep_df, ema_behavior_df, ema_pam_df]
)

### Sensor Data Prep

#### Upload sensor data 

In [None]:
# Load the raw activity sensing files from the unzipped StudentLife dataset.
# Note: this cell uses util.upload_directory, while the other sensing cells use
# util.upload_directory_from_magma.
activity_files = util.upload_directory(studentlife_unzipped + '/dataset/sensing/activity/')

In [None]:
# Load the raw conversation sensing files from the unzipped StudentLife dataset
conversation_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/conversation/')

In [None]:
# Load the raw GPS sensing files from the unzipped StudentLife dataset
gps_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/gps/')

In [None]:
# Load the raw phone lock sensing files from the unzipped StudentLife dataset
phone_lock_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/phonelock/')

In [None]:
# Load the raw "dark" sensing files from the unzipped StudentLife dataset
dark_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/dark/')

In [None]:
# Load the raw audio sensing files from the unzipped StudentLife dataset
audio_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/audio/')

In [None]:
# Load the raw phone charge sensing files from the unzipped StudentLife dataset
phone_charge_files = util.upload_directory_from_magma(studentlife_unzipped + '/dataset/sensing/phonecharge/')

#### Prep Sensor Data

##### Activity

In [None]:
# Turn the loaded activity files into a cleaned DataFrame
# (cleaning rules live in cleaning_util.clean_studentlife_activity)
activity_df = cleaning_util.clean_studentlife_activity(activity_files)

##### Conversations

In [None]:
# Turn the loaded conversation files into a cleaned DataFrame
# (cleaning rules live in cleaning_util.clean_studentlife_conversations)
conversation_df = cleaning_util.clean_studentlife_conversations(conversation_files)

##### Phone unlock

In [None]:
# Turn the loaded phone lock files into a cleaned unlock DataFrame
# (cleaning rules live in cleaning_util.clean_studentlife_unlock).
# Note: unlock_df is not in the list of frames merged in the
# "Merge Cleaned StudentLife Data" section.
unlock_df = cleaning_util.clean_studentlife_unlock(phone_lock_files)

##### GPS location

In [None]:
# Turn the loaded GPS files into a cleaned location DataFrame
# (cleaning rules live in cleaning_util.clean_studentlife_location)
gps_df = cleaning_util.clean_studentlife_location(gps_files)

##### Sleep

In [None]:
# Build the sleep DataFrame from the phone lock files, also passing in the
# combined EMA frame.
# NOTE(review): the meaning and units of cutoff_duration=15, start_time=23 and
# correction='median' are defined in cleaning_util.clean_sleep_data — confirm
# there before changing them.
sleep_df = cleaning_util.clean_sleep_data(
    phone_lock_files, cutoff_duration=15, start_time=23, ema_df=studentlife_ema_df,
    correction='median'
)

#### Filter to days with >= 19 hours of data

Same procedure used in CrossCheck data cleaning:

[1] Rui Wang, Emily A. Scherer, Vincent W. S. Tseng, et al. 2016. CrossCheck: toward passive sensing and detection of mental health changes in people with schizophrenia. Proceedings of the 2016 ACM International Joint Conference on Pervasive and Ubiquitous Computing - UbiComp ’16, ACM Press, 886–897.

In [None]:
# Determine the days to keep; the result is later used as the base
# (study_id, day) table for the merge.
# NOTE(review): the raw activity_files (not the cleaned activity_df) are passed
# as `dfs` — confirm this is the input cleaning_util.get_good_days expects.
good_days = cleaning_util.get_good_days(dfs=activity_files)

### Merge Cleaned StudentLife Data

Note: We are only merging the feature files that align and were used in prediction analysis.

In [None]:
# Feature tables to attach to the list of good days
dfs = [studentlife_ema_df, activity_df, conversation_df, gps_df, sleep_df]

# Start from the (study_id, day) pairs in good_days and left-join each feature
# table onto them: only good days are kept, and a day that is absent from a
# given table gets NaN for that table's columns.
merged_df = good_days[['study_id', 'day']].copy()
for df in dfs:
    merged_df = pd.merge(left=merged_df, right=df, on=['study_id', 'day'], how='left')

### StudentLife prep for prediction

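Average each behavioral feature over the EMA day and up to two preceding days (a 1-3 day window, depending on how many of those days have data). Same procedure as: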

[1] Rui Wang, Emily A. Scherer, Vincent W. S. Tseng, et al. 2016. CrossCheck: toward passive sensing and detection of mental health changes in people with schizophrenia. Proceedings of the 2016 ACM International Joint Conference on Pervasive and Ubiquitous Computing - UbiComp ’16, ACM Press, 886–897.

In [None]:
# Work on a copy so merged_df itself is left untouched by the steps below
sl_daily_df = merged_df.copy()

In [None]:
# Convert 'day' to datetime so the date-window comparisons below work on dates
sl_daily_df['day'] = pd.to_datetime(sl_daily_df['day'])

In [None]:
# Candidate features: every column except the participant and day identifiers
sl_feature_cols = [
    col for col in sl_daily_df.columns
    if col not in ('study_id', 'day')
]
# Split the candidates by whether the column name contains the substring 'ema'
sl_ema_cols = [col for col in sl_feature_cols if 'ema' in col]
sl_behavior_cols = [col for col in sl_feature_cols if 'ema' not in col]

In [None]:
# Sort by participant and day so each participant's rows are in chronological
# order; the positional "skip the first EMA" step below depends on this. The
# index is reset so that row labels are unique and shared with sl_df.
sl_daily_df = sl_daily_df.sort_values(['study_id', 'day']).reset_index(drop=True)
# Copy over the identifier and EMA columns
sl_df = sl_daily_df[['study_id', 'day'] + sl_ema_cols].copy()
# Behavior columns start out empty and are only filled for the EMA rows
# processed in the loop below
for f in sl_behavior_cols:
    sl_df[f] = None
# Add a column to collect missing days of data
sl_df['missing_days'] = 0

# Rows where at least one EMA column is populated. Computed once up front
# because it does not depend on which participant is being processed.
any_ema = sl_daily_df[sl_ema_cols].notnull().any(axis=1)

# Go through each study ID, printing its position as a progress indicator
for curr, s in enumerate(sl_daily_df.study_id.unique()):
    print(curr)
    # Select this participant's rows once instead of re-filtering the whole
    # frame for every EMA. Row labels are preserved by .loc, so `ind` still
    # addresses the matching row of sl_df.
    subject_mask = sl_daily_df.study_id == s
    subject_df = sl_daily_df.loc[subject_mask, :]
    # Go through each EMA date, discarding the first EMA taken
    for ind in sl_daily_df.loc[subject_mask & any_ema, :].index[1:]:
        # Get date
        d = subject_df.loc[ind, 'day']
        # Window: the EMA day itself plus the two preceding calendar days
        start_date = d - timedelta(days=2)
        end_date = d
        filtered_df = subject_df.loc[
            (subject_df.day >= start_date) & (subject_df.day <= end_date), :
        ]
        # The window normally contains at least the EMA row itself; the guard
        # only matters if the date comparison matches nothing (e.g. a NaT day)
        if filtered_df.shape[0] > 0:
            # Per-column mean over the rows available in the window
            sl_df.loc[ind, sl_behavior_cols] = filtered_df[sl_behavior_cols].mean().values
        # Record how many of the 3 window days had no row at all
        sl_df.loc[ind, 'missing_days'] = 3 - filtered_df.shape[0]

In [None]:
# Work on a copy of the averaged frame
sl_df_cleaned = sl_df.copy()
# Keep a row if it has at least one behavioral value, or if its missing_days
# count is not below 3. What gets dropped are rows with no behavioral data and
# no missing data marked, i.e. rows the averaging loop never filled in; these
# should be the first EMA of each participant.
sl_df_cleaned = sl_df_cleaned.loc[
    sl_df_cleaned[sl_behavior_cols].notnull().any(axis=1)
    | ~(sl_df_cleaned.missing_days < 3),
    :
]

In [None]:
# Write the cleaned StudentLife data to disk; index=False leaves the DataFrame
# index out of the CSV
sl_df_cleaned.to_csv('../data/studentlife_daily_data_cleaned_w_sameday_03192020.csv', index=False)