In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Load the survey export into the working DataFrame used by the later cells.
df = pd.read_csv("employee_experience_survey_data.csv")

In [6]:
# Preview up to the first 16 rows of the survey data.
df.head(n=16)

Unnamed: 0,Name,Age Bracket,Gender,Ethnicity,Job Title,Department,Date Survey Completed,Job Satisfaction,Work-Life Balance,Management Support,...,Career Development Opportunities,Workplace Inclusivity,Company Communication,Compensation Satisfaction,Job Security,Overall Engagement,Job Satisfaction Num,Overall Engagement Num,Work-Life Balance Num,Compensation Satisfaction Num
0,John Doe,25-34,Female,Asian,Product Manager,Product Development,2024-10-05,Disagree,Strongly Agree,Neutral,...,Disagree,Agree,Strongly Agree,Strongly Agree,Agree,Strongly Agree,2,5,5,5
1,Jane Smith,18-24,Female,Middle Eastern,Operations Manager,Sales,2024-10-07,Agree,Strongly Disagree,Strongly Agree,...,Disagree,Neutral,Neutral,Neutral,Agree,Neutral,4,3,1,3
2,Carlos Reyes,45-54,Female,Indian,UX Designer,Consulting,2024-10-08,Neutral,Strongly Disagree,Agree,...,Agree,Agree,Strongly Disagree,Neutral,Strongly Agree,Strongly Agree,3,5,1,3
3,Emily Zhang,35-44,Male,Caucasian,UX Designer,HR,2024-10-07,Neutral,Agree,Agree,...,Strongly Agree,Agree,Strongly Disagree,Strongly Disagree,Agree,Neutral,3,3,4,1
4,Michael Johnson,18-24,Female,Caucasian,UX Designer,Product Development,2024-10-07,Agree,Strongly Agree,Disagree,...,Disagree,Disagree,Disagree,Strongly Disagree,Neutral,Disagree,4,2,5,1
5,Sara Ahmed,45-54,Male,Middle Eastern,Business Consultant,Operations,2024-10-09,Disagree,Strongly Disagree,Neutral,...,Agree,Disagree,Neutral,Agree,Strongly Disagree,Neutral,2,3,1,4
6,Tom Davis,25-34,Male,Caucasian,UX Designer,HR,2024-10-08,Strongly Agree,Strongly Agree,Strongly Disagree,...,Strongly Disagree,Strongly Agree,Strongly Disagree,Neutral,Neutral,Agree,5,4,5,3
7,Linda Lopez,18-24,Male,African American,Customer Support,Product Development,2024-10-10,Disagree,Agree,Neutral,...,Agree,Neutral,Agree,Disagree,Disagree,Agree,2,4,4,2
8,Raj Patel,35-44,Female,Indian,Product Manager,IT,2024-10-07,Strongly Disagree,Strongly Agree,Strongly Disagree,...,Agree,Agree,Neutral,Agree,Strongly Agree,Disagree,1,2,5,4
9,Amara Njeri,18-24,Male,African American,HR Specialist,Design,2024-10-10,Strongly Agree,Strongly Agree,Agree,...,Strongly Disagree,Neutral,Agree,Neutral,Neutral,Strongly Disagree,5,1,5,3


In [3]:
# The survey answers are stored as Likert labels. Give each label an ordinal
# score (1 = "Strongly Disagree" ... 5 = "Strongly Agree") so the answers can be
# summarised numerically.
response_mapping = {
    label: score
    for score, label in enumerate(
        ("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"),
        start=1,
    )
}

# Add a numeric companion column for each of the two headline questions.
# Labels that are not in the mapping become NaN.
for source_col, numeric_col in (
    ('Job Satisfaction', 'Job Satisfaction Num'),
    ('Overall Engagement', 'Overall Engagement Num'),
):
    df[numeric_col] = df[source_col].map(response_mapping)

# Descriptive statistics (count, mean, std, quartiles) for both numeric columns.
job_satisfaction_stats, overall_engagement_stats = (
    df[numeric_col_name].describe()
    for numeric_col_name in ('Job Satisfaction Num', 'Overall Engagement Num')
)

job_satisfaction_stats, overall_engagement_stats

(count    15.000000
 mean      3.000000
 std       1.309307
 min       1.000000
 25%       2.000000
 50%       3.000000
 75%       4.000000
 max       5.000000
 Name: Job Satisfaction Num, dtype: float64,
 count    15.000000
 mean      3.400000
 std       1.298351
 min       1.000000
 25%       2.500000
 50%       3.000000
 75%       4.500000
 max       5.000000
 Name: Overall Engagement Num, dtype: float64)

In [4]:
# Demographic breakdowns of the numeric Likert scores.

# Mean job-satisfaction score per age bracket and per department.
age_bracket_satisfaction, department_satisfaction = (
    df.groupby(group_key)['Job Satisfaction Num'].mean()
    for group_key in ('Age Bracket', 'Department')
)

# Numeric versions of the work-life balance and compensation answers,
# using the same label-to-score mapping as the other survey questions.
for source_col, numeric_col in (
    ('Work-Life Balance', 'Work-Life Balance Num'),
    ('Compensation Satisfaction', 'Compensation Satisfaction Num'),
):
    df[numeric_col] = df[source_col].map(response_mapping)

# Mean work-life balance and compensation scores per gender and per ethnicity.
gender_wlb_comp, ethnicity_wlb_comp = (
    df.groupby(group_key)[['Work-Life Balance Num', 'Compensation Satisfaction Num']].mean()
    for group_key in ('Gender', 'Ethnicity')
)

age_bracket_satisfaction, department_satisfaction, gender_wlb_comp, ethnicity_wlb_comp

(Age Bracket
 18-24    3.428571
 25-34    3.000000
 35-44    2.000000
 45-54    2.666667
 Name: Job Satisfaction Num, dtype: float64,
 Department
 Consulting             3.000000
 Design                 5.000000
 Finance                4.000000
 HR                     4.000000
 IT                     1.000000
 Operations             2.000000
 Product Development    2.666667
 Sales                  2.750000
 Name: Job Satisfaction Num, dtype: float64,
         Work-Life Balance Num  Compensation Satisfaction Num
 Gender                                                      
 Female               3.111111                       2.888889
 Male                 3.666667                       2.500000,
                   Work-Life Balance Num  Compensation Satisfaction Num
 Ethnicity                                                             
 African American               4.500000                       2.500000
 Asian                          4.000000                       3.500000
 Caucasi

In [5]:
# Two questions are answered in this cell:
#   1. Does mean job satisfaction differ between HR and Product Development?
#   2. Is work-life balance associated with overall engagement?
#
# HR and Product Development are compared because each has multiple entries.
# NOTE(review): the department breakdown shown earlier in this notebook does
# list IT, so IT was presumably skipped for having too few responses rather
# than being absent from the data - confirm.

# Map the categorical survey responses to ordinal scores (1-5) for the tests.
rating_mapping = {
    "Strongly Disagree": 1,
    "Disagree": 2,
    "Neutral": 3,
    "Agree": 4,
    "Strongly Agree": 5
}

# Numeric job-satisfaction scores for each of the two departments.
hr_data = df[df['Department'] == 'HR']['Job Satisfaction'].map(rating_mapping)
pd_data = df[df['Department'] == 'Product Development']['Job Satisfaction'].map(rating_mapping)

# The two groups are independent samples, so missing or unmapped answers can
# be dropped from each group separately.
hr_data_clean = hr_data.dropna()
pd_data_clean = pd_data.dropna()

# Welch's two-sample t-test (equal_var=False): does not assume that the two
# departments share the same variance.
# NOTE(review): the survey has only 15 respondents overall, so the groups are
# very small and the test has little power - interpret the p-value cautiously.
t_stat, p_value = stats.ttest_ind(hr_data_clean, pd_data_clean, equal_var=False)

# Correlation analysis between Work-Life Balance and Overall Engagement.
work_life_balance = df['Work-Life Balance'].map(rating_mapping)
overall_engagement = df['Overall Engagement'].map(rating_mapping)

# Drop missing values pairwise. Calling dropna() on each column independently
# would leave the two series with different lengths (or pair up answers from
# different respondents) whenever someone is missing only one of the two
# answers, and pearsonr compares its inputs position by position.
paired_scores = pd.DataFrame({
    'Work-Life Balance': work_life_balance,
    'Overall Engagement': overall_engagement,
}).dropna()
clean_wlb = paired_scores['Work-Life Balance']
clean_oe = paired_scores['Overall Engagement']

# Pearson correlation on the paired scores.
# NOTE(review): the scores are ordinal; a rank-based measure such as
# scipy.stats.spearmanr may be a better fit - confirm with the analyst.
correlation_coefficient, p_value_corr = stats.pearsonr(clean_wlb, clean_oe)

(t_stat, p_value, correlation_coefficient, p_value_corr)

(1.1094003924504583,
 0.3879768065234701,
 -0.361040820409166,
 0.18612659993280434)