# CSCI 3360: Course Project
This file is used to clean up the data and create a new modified file to work with.

**Project Members:** JP Park, Yiren Hou, Martha Sikora, Pragya Bhayana
**Dataset:** [Predicting Depression, Anxiety, and Stress](https://www.kaggle.com/yamqwe/depression-anxiety-stress-scales) from [Kaggle](https://www.kaggle.com) by [Yam Peleg](https://www.kaggle.com/yamqwe)

In [41]:
import numpy as np
import pandas as pd

In [42]:
df = pd.read_table("data/data.csv")

# Validity Check

In [43]:
# VCL
df = df[df[['VCL6', 'VCL9', 'VCL12']].sum(axis=1) == 0]
df = df[df[['VCL1', 'VCL4', 'VCL10', 'VCL15', 'VCL16']].sum(axis=1) > 2]

In [44]:
# testelapse + surveyelapse
elapse = df['testelapse'] + df['surveyelapse']
min_qr = elapse.quantile(0.01)
max_qr = elapse.quantile(0.99)
df = df[elapse.between(min_qr, max_qr)]

In [45]:
# age
df = df[df['age'].between(13, 90)]

# family size
df = df[df['familysize'] < 15]

# eliminate minors that voted or are/have been married
minor = df['age'] < 18
voted = df['voted'] == 1
married = df['married'] > 1
df = df[~(minor & (voted | married))]

# Drop 0s

In [46]:
cols = ['country', 'education', 'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'race', 'voted', 'married',
        'familysize']
df = df[~(df[cols] == 0).any(axis=1)]

# Fix major column

In [47]:
df['major'] = df['major'].str.lower()
df['major'] = df['major'].str.strip()
df.loc[df['education'] < 3, 'major'] = np.NAN

# Calculating BIG-FIVE Personality

In [48]:
# Extraversion
df['EXT1'] = df['TIPI1']
df['EXT2'] = 8 - df['TIPI6']
df['EXT'] = df['EXT1'] + df['EXT2']

# Agreeableness
df['AGR1'] = 8 - df['TIPI2']
df['AGR2'] = df['TIPI7']
df['AGR'] = df['AGR1'] + df['AGR2']

# Conscientiousness
df['CON1'] = df['TIPI3']
df['CON2'] = 8 - df['TIPI8']
df['CON'] = df['CON1'] + df['CON2']

# Emotional Stability
df['EST1'] = 8 - df['TIPI4']
df['EST2'] = df['TIPI9']
df['EST'] = df['EST1'] + df['EST2']

# Openness
df['OPE1'] = df['TIPI5']
df['OPE2'] = 8 - df['TIPI10']
df['OPE'] = df['OPE1'] + df['OPE2']

# Calculating DASS score and level

In [49]:
# Calculate and create columns for depression, anxiety, and stress scores
# and levels using the official DASS scale
DASS = (
    ('depression', '3|5|10|13|16|17|21|24|26|31|34|37|38|42', [0, 9, 13, 20, 27, np.inf]),
    ('anxiety', '2|4|7|9|15|19|20|23|25|28|30|36|40|41', [0, 7, 9, 14, 19, np.inf]),
    ('stress', '1|6|8|11|12|14|18|22|27|29|32|33|35|39', [0, 14, 18, 25, 33, np.inf])
)

for cat, cols, bins in DASS:
    df[f'{cat}_score'] = df.filter(regex=f"Q({cols})A").sub(1, fill_value=0).sum(axis=1)
    df[f'{cat}_level'] = pd.cut(df[f'{cat}_score'], right=False, bins=bins, labels=range(5))

# Drop unnecessary columns

In [50]:
drop = lambda regex: df.drop(df.filter(regex=regex).columns, axis=1, inplace=True)

# vocabulary check list to check validity
drop('VCL\\d+')

# Q#A - the answer given to the question
# Q#E - time spent on answering question (in milliseconds)
# Q#I - the question's position in the survey
drop('Q\\d+(A|E|I)')

# introelapse - time spent on the introduction/landing page (in seconds)
# testelapse - time spent on all the DASS questions
# surveylapse - time spent answering the rest of the demographic and survey questions
drop('.+lapse')

# TIPI - Ten Item Personality Inventory
drop('TIPI\\d+')

# screensize
# uniquenetworklocation
df.drop(columns=['screensize', 'uniquenetworklocation', 'source'], inplace=True)

# Changing numerical categorical data to string

In [51]:
df.replace({
    'education': {
        0: np.NAN,
        1: 'less than high school',
        2: 'high school',
        3: 'university degree',
        4: 'graduate degree',
    },
    'urban': {
        0: np.NAN,
        1: 'rural',
        2: 'suburban',
        3: 'urban',
    },
    'gender': {
        0: np.NAN,
        1: 'male',
        2: 'female',
        3: 'other'
    },
    'engnat': {
        0: np.NAN,
        1: 'yes',
        2: 'no',
    },
    'hand': {
        0: np.NAN,
        1: 'right',
        2: 'left',
        3: 'both',
    },
    'religion': {
        0: np.NAN,
        1: 'agnostic',
        2: 'atheist',
        3: 'buddhist',
        4: 'christian (catholic)',
        5: 'christian (mormon)',
        6: 'christian (protestant)',
        7: 'christian (other)',
        8: 'hindu',
        9: 'jewish',
        10: 'muslim',
        11: 'sikh',
        12: 'other',
    },
    'orientation': {
        0: np.NAN,
        1: 'heterosexual',
        2: 'bisexual',
        3: 'homosexual',
        4: 'asexual',
        5: 'other',
    },
    'race': {
        0: np.NAN,
        10: 'asian',
        20: 'arab',
        30: 'black',
        40: 'indigenous australian',
        50: 'native american',
        60: 'white',
        70: 'other',
    },
    'voted': {
        0: np.NAN,
        1: 'yes',
        2: 'no',
    },
    'married': {
        0: np.NAN,
        1: 'never married',
        2: 'currently married',
        3: 'previously married',
    },
}, inplace=True)

# Save to a file

In [52]:
df.to_csv("data/cleaned.csv", index=False)

In [53]:
df

Unnamed: 0,country,education,urban,gender,engnat,age,hand,religion,orientation,race,...,EST,OPE1,OPE2,OPE,depression_score,depression_level,anxiety_score,anxiety_level,stress_score,stress_level
0,IN,high school,urban,female,no,16,right,other,heterosexual,asian,...,2,7,7,14,27,4,34,4,40,4
1,US,high school,urban,female,yes,16,left,christian (other),,other,...,2,5,3,8,24,3,17,3,27,3
3,US,less than high school,urban,female,yes,13,left,christian (catholic),other,other,...,10,6,7,13,16,2,17,3,16,1
4,MY,university degree,suburban,female,no,19,both,muslim,heterosexual,asian,...,5,5,5,10,32,4,40,4,29,3
5,US,high school,urban,female,no,20,right,christian (catholic),heterosexual,other,...,13,7,1,8,13,2,6,0,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,US,less than high school,suburban,female,yes,16,right,agnostic,heterosexual,other,...,8,6,1,7,15,2,21,4,30,3
39770,GB,high school,suburban,male,yes,16,right,atheist,asexual,white,...,6,6,2,8,36,4,12,2,15,1
39772,US,university degree,suburban,female,yes,48,right,christian (other),heterosexual,black,...,8,6,4,10,4,0,1,0,5,0
39773,US,university degree,suburban,female,yes,20,right,christian (protestant),heterosexual,white,...,4,3,4,7,16,2,13,2,33,4
