In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#load the survey data
df = pd.read_csv("/content/mental_health_workplace_survey.csv")

#count the missing values in each column and display the counts
missing_values = df.isna().sum()
print("The number of values missing per column are:", missing_values)

#simple imputation method to handle missing values:
#numeric columns are filled with their median, every other column with its
#most frequent value (mode)
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        continue  #nothing to impute in this column
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            continue  #column is entirely missing, so there is no mode to use
        fill_value = modes.iloc[0]
    #assign the result back instead of calling fillna(inplace=True) on df[col]:
    #the chained in-place form acts on an intermediate object, triggers a
    #FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write
    df[col] = column.fillna(fill_value)

#handling outliers: cap each listed column at a fixed upper limit
upper_caps = {'WorkHoursPerWeek': 126, 'SleepHours': 16}
for capped_col, cap in upper_caps.items():
    df[capped_col] = df[capped_col].clip(upper=cap)

#visualising data
#scatter plot: stress level against sleep hours, coloured by burnout level
sns.set(style="whitegrid")
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='SleepHours',
    y='StressLevel',
    hue='BurnoutLevel',
    palette='coolwarm',
    alpha=0.7,
    ax=scatter_ax,
)
scatter_ax.set_title('Stress Level vs Sleep Hours')
scatter_ax.set_xlabel('Sleep Hours (per day)')
scatter_ax.set_ylabel('Stress Level (0–10)')
scatter_fig.tight_layout()
plt.show()
#box-plot: distribution of burnout level for each distinct age value
#NOTE(review): newer seaborn releases warn when `palette` is passed without
#`hue` - confirm against the installed seaborn version
box_fig, box_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=df,
    x='Age',
    y='BurnoutLevel',
    palette='Set2',
    ax=box_ax,
)
box_ax.set_title('Age Vs Burnout Level')
box_ax.set_xlabel('Age')
box_ax.set_ylabel('Burnout Level (0–10)')
box_fig.tight_layout()
plt.show()

#outlier removal using the IQR rule, reporting how many rows were dropped
def remove_outliers_iqr(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return ``df`` without the rows whose ``col`` value is outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``. Values exactly on
    a fence are kept. Rows where ``col`` is missing never satisfy the range
    check, so they are dropped and counted as removed as well.

    The number of removed rows is printed. The input frame is not modified.

    Args:
        df: Frame to filter.
        col: Name of the numeric column to test.
        factor: Multiple of the IQR used to place the fences. Defaults to 1.5.

    Returns:
        A new frame containing only the rows inside the fences.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    margin = factor * (q3 - q1)

    # between() is inclusive on both ends by default and is False for NaN.
    in_range = df[col].between(q1 - margin, q3 + margin)
    filtered = df[in_range]

    removed = len(df) - len(filtered)
    print(f"{removed} outliers removed from '{col}'")
    return filtered

#apply the IQR filter one column at a time; each pass only sees the rows
#that survived the previous one
iqr_columns = ('WorkHoursPerWeek', 'SleepHours')
for col in iqr_columns:
    df = remove_outliers_iqr(df, col)