In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from scipy import stats
import acquire
import prepare
import explore
import wrangle
import tyler_explore
import tyler_wrangle
from datetime import datetime
import os

In [2]:
#df = pd.read_csv('survey.csv')

In [3]:
#get the data and store it into a dataframe
df = wrangle.get_survey_data('survey.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [4]:
# encode the data
strings_df = wrangle.prep_the_strings(df)
encoded_df = wrangle.prep_encode(strings_df)

In [5]:
# get info on the dataframe
strings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957 entries, 0 to 1258
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   timestamp                  957 non-null    datetime64[ns]
 1   age                        957 non-null    int64         
 2   gender                     957 non-null    object        
 3   country                    957 non-null    object        
 4   self_employed              957 non-null    int64         
 5   family_history             957 non-null    int64         
 6   treatment                  957 non-null    int64         
 7   work_interfere             957 non-null    object        
 8   no_employees               957 non-null    int64         
 9   remote_work                957 non-null    int64         
 10  tech_company               957 non-null    int64         
 11  benefits                   957 non-null    int64         
 12  care_op

In [6]:
# get info on the encoded dataframe
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957 entries, 0 to 1258
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   timestamp                  957 non-null    datetime64[ns]
 1   age                        957 non-null    int64         
 2   gender                     957 non-null    int64         
 3   country                    957 non-null    object        
 4   self_employed              957 non-null    int64         
 5   family_history             957 non-null    int64         
 6   treatment                  957 non-null    int64         
 7   work_interfere             957 non-null    int64         
 8   company_size               957 non-null    int64         
 9   remote_work                957 non-null    int64         
 10  tech_company               957 non-null    int64         
 11  benefits                   957 non-null    int64         
 12  care_op

In [7]:
# one hot encode the data
hot_df = tyler_explore.one_hot(encoded_df)

In [8]:
# split the data
train, validate, test = explore.three_split(encoded_df, 'work_interfere')

In [9]:
# display the shape of the data
train.shape, validate.shape, test.shape

((535, 25), (230, 25), (192, 25))

In [10]:
# determine the category and quantity variables
cat_vars, quant_vars = tyler_explore.cat_vs_quant(train)

In [11]:

# display the category variables
cat_vars

['gender',
 'self_employed',
 'family_history',
 'treatment',
 'work_interfere',
 'company_size',
 'remote_work',
 'tech_company',
 'benefits',
 'care_options',
 'wellness_program',
 'seek_help',
 'anonymity',
 'leave',
 'mental_health_consequence',
 'phys_health_consequence',
 'coworkers',
 'supervisor',
 'mental_health_interview',
 'phys_health_interview',
 'mental_vs_physical',
 'obs_consequence']

In [12]:
# display the quantitative variables

quant_vars

['age']

In [13]:
# Create dummy variables

pd.get_dummies(df.coworkers)

Unnamed: 0,No,Some of them,Yes
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
1254,0,1,0
1255,0,1,0
1256,1,0,0
1257,1,0,0


In [14]:
# get the dummies of supervisor
pd.get_dummies(df.supervisor)

Unnamed: 0,No,Some of them,Yes
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
1254,0,1,0
1255,0,0,1
1256,1,0,0
1257,1,0,0


In [15]:
# store the encoded df into another dataframe
carl_df = encoded_df[['coworkers', 'supervisor']]

In [16]:
# display the first few rows of the data frame
carl_df.head()

Unnamed: 0,coworkers,supervisor
0,2,1
1,0,0
2,1,1
4,2,1
5,1,1


In [17]:
# get the information of the dataframe
carl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957 entries, 0 to 1258
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   coworkers   957 non-null    int64
 1   supervisor  957 non-null    int64
dtypes: int64(2)
memory usage: 22.4 KB


In [18]:
# create a dummies table
carl_dummies = pd.get_dummies(data=carl_df, columns = ['coworkers', 'supervisor'])

In [19]:
# display the dummies table
carl_dummies

Unnamed: 0,coworkers_0,coworkers_1,coworkers_2,supervisor_0,supervisor_1,supervisor_2
0,0,0,1,0,1,0
1,1,0,0,1,0,0
2,0,1,0,0,1,0
4,0,0,1,0,1,0
5,0,1,0,0,1,0
...,...,...,...,...,...,...
1253,0,0,1,0,0,1
1255,0,0,1,0,1,0
1256,1,0,0,1,0,0
1257,1,0,0,1,0,0


In [20]:
# merge the two dfs
new_df= pd.concat([carl_dummies, encoded_df])

In [21]:
# Display the data in transform format
new_df.head().T

Unnamed: 0,0,1,2,4,5
coworkers_0,0,1,0,0,0
coworkers_1,0,0,1,0,1
coworkers_2,1,0,0,1,0
supervisor_0,0,1,0,0,0
supervisor_1,1,0,1,1,1
supervisor_2,0,0,0,0,0
timestamp,NaT,NaT,NaT,NaT,NaT
age,,,,,
gender,,,,,
country,,,,,


In [22]:
# tyler_explore.explore_bivariate_2nd(new_df, 'work_interfere')

In [23]:
# Split the data
hot_train, hot_validate, hot_test = explore.three_split(encoded_df, 'work_interfere')

In [24]:
# display the train data
hot_train.head().T

Unnamed: 0,973,1022,358,278,399
timestamp,2014-08-29 07:51:42,2014-08-29 10:12:10,2014-08-27 14:57:46,2014-08-27 14:03:16,2014-08-27 15:25:47
age,49,32,34,28,29
gender,0,0,0,0,0
country,United States,United States,United States,United States,Canada
self_employed,0,0,0,0,0
family_history,1,0,0,0,0
treatment,0,0,1,1,1
work_interfere,0,0,1,1,1
company_size,5,1,3,3,3
remote_work,0,0,1,0,0


Does having benefits affect whether or not you seek treatment affect work interference?
- Ho = There is no difference between having benefits and whether or not treatment is sought.
- Ha = There is a difference between having benefits and whether or not treatment is sought.

In [25]:
# Create a contingency table
contingency_table = pd.crosstab(encoded_df.benefits, encoded_df.treatment)

In [26]:
# display the train data
train.head().T

Unnamed: 0,973,1022,358,278,399
timestamp,2014-08-29 07:51:42,2014-08-29 10:12:10,2014-08-27 14:57:46,2014-08-27 14:03:16,2014-08-27 15:25:47
age,49,32,34,28,29
gender,0,0,0,0,0
country,United States,United States,United States,United States,Canada
self_employed,0,0,0,0,0
family_history,1,0,0,0,0
treatment,0,0,1,1,1
work_interfere,0,0,1,1,1
company_size,5,1,3,3,3
remote_work,0,0,1,0,0


In [27]:
# Get training data
benefits_zero = train[train['benefits'] == 0]

In [28]:
# Get training data
benefits_one = train[train['benefits'] == 1]

In [29]:
# Get training data
benefits_two = train[train['benefits'] == 2]

In [30]:
#Create a new contingency table based on benefits
contingency_table3 = pd.crosstab(benefits_zero.work_interfere, benefits_zero.treatment)

In [31]:
# display the contingency table
contingency_table3

treatment,0,1
work_interfere,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,1
1,23,46


In [32]:
# run chi2
test_results3 = stats.chi2_contingency(contingency_table3)

In [33]:
# display the results
test_results3

(37.378283597258495,
 9.72995312622513e-10,
 1,
 array([[20.59433962, 16.40566038],
        [38.40566038, 30.59433962]]))

In [34]:
# Get the results
_, p, _, expected = test_results3

In [35]:
# display p
p

9.72995312622513e-10

In [36]:
# Create contingency table
contingency_table4 = pd.crosstab(benefits_one.work_interfere, benefits_one.treatment)

In [37]:
# display the table
contingency_table4

treatment,0,1
work_interfere,Unnamed: 1_level_1,Unnamed: 2_level_1
0,68,8
1,20,156


In [38]:
# Run chi2 on the contingency table
test_results4 = stats.chi2_contingency(contingency_table4)

In [39]:
# Get the results
_, p, _, expected = test_results4

In [40]:
# Print P
p

4.2232378219264606e-32

In [41]:
# Create another contingency table
contingency_table5 = pd.crosstab(benefits_two.work_interfere, benefits_two.treatment)

In [42]:
# Display the table
contingency_table5

treatment,0,1
work_interfere,Unnamed: 1_level_1,Unnamed: 2_level_1
0,80,4
1,28,65


In [43]:
# Execute chi2
test_results5 = stats.chi2_contingency(contingency_table5)

In [44]:
# Get the results
_, p, _, expected = test_results5
p

2.8422489495275323e-18

In [62]:
# Explore the 3 chi2 results
explore.three_chi(train, 'benefits', 'work_interfere', 'treatment')

controlling for benefits = 0

work_interfere & treatment chi2 test results

chi^2 = 37.38
    p = 0.0000

-----------------------------------
controlling for benefits = 1

work_interfere & treatment chi2 test results

chi^2 = 139.08
    p = 0.0000

-----------------------------------
controlling for benefits = 2

work_interfere & treatment chi2 test results

chi^2 = 76.00
    p = 0.0000

-----------------------------------


Based on the p values, we reject the null hypotheses.

If you have observed negative consequences for coworkers with mental health conditions do you not talk to your supervisor and this interferes with your work performance?
- Ho = There is no difference between observed negative consequences for coworkers with mental health conditions and talking to my supervisor.
- Ha = There is a difference between observed negative consequences for coworkers with mental health conditions and talking to my supervisor.


In [45]:
# get the training data
supervisor_zero = train[train['supervisor'] == 0]

In [46]:
# get the training data
supervisor_one = train[train['supervisor'] == 1]

In [47]:
# get the training data
supervisor_two = train[train['supervisor'] == 2]

In [48]:
# Create another contingency table
contingency_table6 = pd.crosstab(supervisor_zero.work_interfere, supervisor_zero.obs_consequence)

In [49]:
# display the data
contingency_table6

obs_consequence,0,1
work_interfere,Unnamed: 1_level_1,Unnamed: 2_level_1
0,52,5
1,94,14


In [50]:
# run chi2
test_results6 = stats.chi2_contingency(contingency_table6)

In [51]:
# Get the results
_, p, _, expected = test_results6
p

0.5853908493041852

In [52]:
# Create another contingency table
contingency_table7 = pd.crosstab(supervisor_one.work_interfere, supervisor_one.obs_consequence)

In [53]:
# display the data
contingency_table7

obs_consequence,0,1
work_interfere,Unnamed: 1_level_1,Unnamed: 2_level_1
0,97,5
1,113,8


In [54]:
# Run chi2
test_results7 = stats.chi2_contingency(contingency_table7)

In [55]:
# Get the results
_, p, _, expected = test_results7
p

0.7979683819404795

In [56]:
# Create a contingency table
contingency_table8 = pd.crosstab(supervisor_two.work_interfere, supervisor_two.obs_consequence)

In [57]:
# get the results
test_results8 = stats.chi2_contingency(contingency_table8)

In [58]:
# Get the results and print p
_, p, _, expected = test_results8
p

0.16812163324112914

With the p values > alpha, we accept the null hypothesis of there being no difference between observed negative consequences for coworkers with mental health conditions and talking to my supervisor.

In [61]:
# Explore the 3 chi results
explore.three_chi(train, 'supervisor', 'work_interfere', 'obs_consequence')

controlling for supervisor = 0

work_interfere & obs_consequence chi2 test results

chi^2 = 0.30
    p = 0.5854

-----------------------------------
controlling for supervisor = 1

work_interfere & obs_consequence chi2 test results

chi^2 = 0.07
    p = 0.7980

-----------------------------------
controlling for supervisor = 2

work_interfere & obs_consequence chi2 test results

chi^2 = 1.90
    p = 0.1681

-----------------------------------
