# CSCI 3360: Course Project
This notebook cleans the raw survey data and saves the result as a new file (`data/cleaned.csv`) to work with.

**Project Members:** JP Park, Yiren Hou, Martha Sikora, Pragya Bhayana
**Dataset:** [Predicting Depression, Anxiety, and Stress](https://www.kaggle.com/yamqwe/depression-anxiety-stress-scales) from [Kaggle](https://www.kaggle.com) by [Yam Peleg](https://www.kaggle.com/yamqwe)

In [141]:
import pandas as pd
import numpy as np

In [142]:
# The raw export is parsed as tab-delimited text (what read_table does by
# default), even though the file carries a .csv extension.
df = pd.read_csv("data/data.csv", sep="\t")

# Validity Check

In [143]:
# VCL
# Keep a row only when VCL6/VCL9/VCL12 sum to 0 and VCL1/VCL4/VCL10/VCL16
# sum to 4.
# NOTE(review): presumably the VCL items are 0/1 check marks, the first group
# being words nobody should tick and the second words everybody should tick
# - confirm against the dataset codebook.
none_of_these = df[['VCL6', 'VCL9', 'VCL12']].sum(axis=1).eq(0)
all_of_these = df[['VCL1', 'VCL4', 'VCL10', 'VCL16']].sum(axis=1).eq(4)
df = df[none_of_these & all_of_these]

In [144]:
# testelapse
# Trim the extremes: keep only rows whose testelapse lies between the 1st and
# 99th percentile (both ends inclusive).
test_time = df['testelapse']
lower_cut, upper_cut = test_time.quantile([0.01, 0.99])
df = df.loc[(test_time >= lower_cut) & (test_time <= upper_cut)]

In [145]:
# surveyelapse
# Trim the extremes: keep only rows whose surveyelapse lies between the 1st
# and 99th percentile (both ends inclusive).
survey_time = df['surveyelapse']
lower_cut, upper_cut = survey_time.quantile([0.01, 0.99])
df = df.loc[(survey_time >= lower_cut) & (survey_time <= upper_cut)]

# Calculating BIG-FIVE Personality

In [146]:
# One row per derived column: (new column, TIPI item number, reverse-keyed?).
# A reverse-keyed item is stored as 8 - answer.
# NOTE(review): 8 - x presumably mirrors a 1-7 answer scale - confirm against
# the dataset codebook.
tipi_scoring = [
    # Extraversion
    ('EXT1', 1, False),
    ('EXT2', 6, True),
    # Agreeableness
    ('AGR1', 2, True),
    ('AGR2', 7, False),
    # Conscientiousness
    ('CON1', 3, False),
    ('CON2', 8, True),
    # Emotional Stability
    ('EST1', 4, True),
    ('EST2', 9, False),
    # Openness
    ('OPE1', 5, False),
    ('OPE2', 10, True),
]

for new_col, item, reverse_keyed in tipi_scoring:
    answer = df[f'TIPI{item}']
    df[new_col] = 8 - answer if reverse_keyed else answer

# Calculating DASS score and level

In [147]:
# Calculate and create columns for depression, anxiety, and stress scores
# and levels using the official DASS scale.
#
# Each entry is (subscale name, item numbers of its Q#A columns, level edges).
# The edges are the INCLUSIVE upper score of each severity level, e.g. for
# depression: 0-9 normal, 10-13 mild, 14-20 moderate, 21-27 severe,
# 28+ extremely severe. Levels are labelled 0 (normal) to 4 (extremely severe).
for cat, cols, bins in \
        ('depression', '3|5|10|13|16|17|21|24|26|31|34|37|38|42', [0, 9, 13, 20, 27, np.inf]), \
        ('anxiety', '2|4|7|9|15|19|20|23|25|28|30|36|40|41', [0, 7, 9, 14, 19, np.inf]), \
        ('stress', '1|6|8|11|12|14|18|22|27|29|32|33|35|39', [0, 14, 18, 25, 33, np.inf]):
    # Shift every answer down by one before summing the subscale's items.
    # NOTE(review): presumably answers are coded 1-4 while DASS items are
    # scored 0-3 - confirm against the dataset codebook.
    df[f'{cat}_score'] = df.filter(regex=f"Q({cols})A").sub(1, fill_value=0).sum(axis=1)
    # Right-closed intervals, with the lowest edge included, give
    # [0, 9], (9, 13], (13, 20], ... so a score sitting exactly on a cut-off
    # (e.g. depression 9) stays in the lower level. The previous right=False
    # binning pushed such boundary scores one level too high.
    df[f'{cat}_level'] = pd.cut(df[f'{cat}_score'], bins=bins, right=True,
                                include_lowest=True, labels=range(5))

# Drop unnecessary columns

In [148]:
def drop(regex):
    """Remove from ``df``, in place, every column whose name matches ``regex``."""
    doomed = df.filter(regex=regex).columns
    df.drop(doomed, axis=1, inplace=True)


for pattern in (
    # vocabulary check list to check validity
    'VCL\\d+',
    # Q#A - the answer given to the question
    # Q#E - time spent on answering question (in milliseconds)
    # Q#I - the question's position in the survey
    'Q\\d+(A|E|I)',
    # introelapse - time spent on the introduction/landing page (in seconds)
    # testelapse - time spent on all the DASS questions
    # surveyelapse - time spent answering the rest of the demographic and survey questions
    '.+lapse',
    # TIPI - Ten Item Personality Inventory
    'TIPI\\d+',
):
    drop(pattern)

# Save to a file

In [149]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_path = "data/cleaned.csv"
df.to_csv(cleaned_path, index=False)