In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the survey responses into the notebook-wide DataFrame `df`.
# The path follows Kaggle's /kaggle/input layout; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/mental-health-in-tech-survey/survey.csv')
print("DataFrame loaded")

: 

In [None]:
# Quick structural overview of the raw data.
print(f'Shape : {df.shape}')
# df.info() prints its report itself and returns None, so it must not be
# interpolated into an f-string (that would append a stray "Info : None").
print('Info :')
df.info()
# head has to be *called*; the bare attribute only prints the bound-method repr.
print(f'Head :\n{df.head()}')

In [None]:
# Summary statistics; with default arguments describe() reports numeric columns only.
print(df.describe())

In [None]:
# Per-column count of missing entries (isna is the canonical name for isnull).
print(f'Total missing values : {df.isna().sum()}')

This shows that the "comments" and "state" columns have a large number of missing values, so they will add very little value to our models.

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not counted).
print(f"Number of duplicate rows: {df.duplicated(keep='first').sum()}")

In [None]:
# Snapshot of the column labels; the feature lists in the next cell are derived from it.
features = df.columns
print(features)

In [None]:
# Columns that are inspected separately instead of being count-plotted.
extra_features = ["Gender", "Country", "Timestamp", "Age", "state", "comments"]
# Every remaining column, in its original order.
cat_features = [
    column
    for column in features
    if column not in extra_features
]
print(cat_features)

In [None]:
# One count plot per categorical column, bars ordered from most to least frequent.
for feature in cat_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        data=df,
        x=feature,
        order=df[feature].value_counts().index,
        ax=ax,
    )
    ax.set_title(f'Distribution of {feature}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    

In [None]:
# Inspect the raw values of the columns that were held out of the count plots
# (value_counts omits missing entries by default).
for feature in extra_features:
    print(df[feature].value_counts())

The value counts show that some columns, such as **Age** and **Gender**, need to be cleaned. The **Timestamp** and **comments** columns would only add noise to a model, so they will be removed.


In [None]:
# Remove the columns that will not be used for modelling.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
drop_columns = ["comments", "Timestamp"]
df = df.drop(columns=drop_columns, errors='ignore')
print(f"shape : {df.shape}")

In [None]:
# Clean Age outliers: keep respondents aged 18 to 65 (both bounds inclusive).
# .copy() makes the filtered frame independent of the original, so column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df['Age'].between(18, 65)].copy()
print(f'shape : {df.shape}')

In [None]:
# Fix Gender column
# instead of keeping many entries we will classify them in 'male', 'female' and 'other/non-binary'

def clean_gender(gender):
    """Collapse a free-text gender answer into one of three labels.

    Parameters
    ----------
    gender : object
        Raw survey answer. Non-string values (e.g. NaN) are accepted.

    Returns
    -------
    str
        'Female', 'Male' or 'Other/Non-Binary'.
    """
    if not isinstance(gender, str):
        return 'Other/Non-Binary'
    # Strip surrounding whitespace so single-letter answers such as "F " or
    # " m" still hit the exact-match checks below.
    g = gender.strip().lower()
    # Female markers are tested first because "female"/"woman" also contain
    # the male markers "mal"/"man".
    if 'fem' in g or 'wom' in g or g == 'f':
        return 'Female'
    if 'mal' in g or 'man' in g or g == 'm' or 'guy' in g:
        return 'Male'
    return 'Other/Non-Binary'
# Normalise every raw answer with clean_gender, then show the resulting class sizes.
df['Gender'] = df['Gender'].map(clean_gender)
print(df['Gender'].value_counts())

In [None]:
# 'comments' and 'Timestamp' were dropped from df, so take them out of the list
# before printing the value counts of the columns that remain.
extra_features = [
    name
    for name in extra_features
    if name not in ('comments', 'Timestamp')
]
for feature in extra_features:
    print(df[feature].value_counts())

In [None]:
# Gender distribution after cleaning, most frequent class first.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df,
    x='Gender',
    order=df['Gender'].value_counts().index,
    ax=ax,
)
ax.set_title('Distribution of Gender')
ax.tick_params(axis='x', labelrotation=45)
plt.show()
    

In [None]:
# Age distribution after outlier removal, with the x-axis in ascending age order.
fig, ax = plt.subplots(figsize=(10, 5))
age_order = sorted(df['Age'].unique())
sns.countplot(data=df, x='Age', order=age_order, ax=ax)
ax.set_title('Distribution of Age')
ax.tick_params(axis='x', labelrotation=45)
plt.show()
    

So far, we have dropped the 'Timestamp' and 'comments' columns and cleaned the 'Age' and 'Gender' columns.
From the univariate analysis it can be seen that most of the respondents in our dataset are between 20 and 46 years old. The number of males is significantly higher than the other two categories. This might suggest that mental wellness is a problem common amongst males, but it may equally reflect who answered the survey, so treatment rates per gender should be compared before drawing that conclusion.

In [None]:
# Final columns
# Column labels that remain in df after the drop and cleaning steps.
print(df.columns)

In [None]:
# Treatment counts split by whether the respondent reports a family history.
ax = sns.countplot(data=df, x='family_history', hue='treatment')
ax.set_title('Treatment Status by Family History')
plt.show()

In [None]:
# Age distribution for each value of the treatment column, drawn as violins.
fig, ax = plt.subplots(figsize=(10, 7))
sns.violinplot(data=df, x='treatment', y='Age', palette='viridis', ax=ax)
ax.set_title('Age Distribution for Respondents With and Without Treatment', fontsize=16)
ax.set_xlabel('Sought Treatment?', fontsize=12)
ax.set_ylabel('Age', fontsize=12)
plt.show()

In [None]:
# Cross-tabulate work interference against the perceived mental health consequence.
contingency_table = pd.crosstab(df['work_interfere'], df['mental_health_consequence'])

# Reorder the columns and rows for a more logical flow.
ordered_rows = ['Never', 'Rarely', 'Sometimes', 'Often']
ordered_cols = ['No', 'Maybe', 'Yes']
# fill_value=0 keeps the counts as integers even when a category is absent;
# a plain reindex would insert NaN, upcast the table to float and make the
# integer format fmt='d' below fail.
contingency_table = contingency_table.reindex(
    index=ordered_rows,
    columns=ordered_cols,
    fill_value=0,
)

plt.figure(figsize=(12, 8))
sns.heatmap(
    contingency_table,
    annot=True,       # write the count inside every cell
    fmt='d',          # counts are integers
    cmap='YlGnBu',
    linewidths=.5
)
plt.title('Heatmap of Work Interference vs. Perceived Mental Health Consequences', fontsize=16)
plt.xlabel('Perceived Mental Health Consequence', fontsize=12)
plt.ylabel('Work Interference', fontsize=12)
plt.show()

In [None]:
# Treatment counts coloured by work interference, one panel per tech_company value.
# Relies on `ordered_rows` from the contingency-table cell for the hue order.
g = sns.catplot(
    data=df,
    kind='count',
    x='treatment',
    hue='work_interfere',
    hue_order=ordered_rows,
    col='tech_company',
    palette='magma',
    height=6,
    aspect=0.8,
)
g.fig.suptitle('Treatment Count by Work Interference, Split by Company Type', y=1.03, fontsize=16)
# Both setters return the grid, so the calls can be chained.
g.set_axis_labels('Sought Treatment?', 'Number of Respondents').set_titles("Tech Company: {col_name}")
plt.show()

In [None]:
# Company-policy questions to compare against the treatment column.
policy_columns = [
    'benefits',
    'wellness_program',
    'seek_help',
    'leave'
]
# Fixed colours per treatment answer so every figure uses the same legend.
treatment_palette = {'Yes': '#4CAF50', 'No': '#F44336'}

for col in policy_columns:
    # Human-readable column name, reused in the title and the x-axis label.
    pretty_name = col.replace("_", " ").title()
    # Drop missing answers before sorting: NaN is a float and cannot be ordered
    # against the string categories (sorted() would raise TypeError).
    category_order = sorted(df[col].dropna().unique())

    plt.figure(figsize=(12, 7))
    sns.countplot(
        x=col,
        hue='treatment',  # split every bar by the treatment answer
        data=df,
        palette=treatment_palette,
        order=category_order  # sorted x-axis categories for consistency
    )
    plt.title(f'Treatment Status based on Company Policy: "{pretty_name}"', fontsize=16)
    plt.ylabel('Number of Respondents', fontsize=12)
    plt.xlabel(f'Response to "{pretty_name}"', fontsize=12)
    plt.xticks(rotation=15, ha='right')  # rotate labels slightly for readability
    plt.legend(title='Sought Treatment?')
    plt.tight_layout()
    plt.show()

In [None]:
# Let us now encode our features to better suit them for modelling and seeing the correlation between them
import warnings

from sklearn.preprocessing import LabelEncoder


def _encode_with_map(series, mapping):
    """Map the labels of ``series`` to numbers using ``mapping``.

    A column that is already numeric is returned unchanged, so re-running this
    cell does not turn previously encoded values into NaN. Labels missing from
    ``mapping`` still become NaN (as with a plain ``Series.map``), but a
    warning lists them instead of losing them silently.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    encoded = series.map(mapping)
    # Values that were present before mapping but are NaN afterwards had no code.
    unmapped = series[encoded.isna() & series.notna()].unique()
    if len(unmapped) > 0:
        warnings.warn(f"{series.name}: labels without a numeric code became NaN: {list(unmapped)}")
    return encoded


# Ordinal answers: the codes follow the order of the labels.
leave_map = {'Very easy': 4, 'Somewhat easy': 3, "Don't know": 2, 'Somewhat difficult': 1, 'Very difficult': 0}
df['leave'] = _encode_with_map(df['leave'], leave_map)

work_interfere_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3}
df['work_interfere'] = _encode_with_map(df['work_interfere'], work_interfere_map)

no_employees_map = {'1-5': 0, '6-25': 1, '26-100': 2, '100-500': 3, '500-1000': 4, 'More than 1000': 5}
df['no_employees'] = _encode_with_map(df['no_employees'], no_employees_map)

# Three-level answers: every "uncertain" label shares the middle code 1.
trinary_map = {'No': 0, 'Maybe': 1, "Don't know": 1, 'Not sure': 1, 'Some of them': 1, 'Yes': 2}
trinary_cols = [
    'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor',
    'mental_health_interview', 'phys_health_interview', 'mental_vs_physical'
]
for col in trinary_cols:
    df[col] = _encode_with_map(df[col], trinary_map)

# Yes/No answers.
binary_map = {'No': 0, 'Yes': 1}
binary_cols = [
    'self_employed', 'family_history', 'treatment', 'remote_work',
    'tech_company', 'obs_consequence'
]
for col in binary_cols:
    df[col] = _encode_with_map(df[col], binary_map)

# Any column that is still text gets integer labels from LabelEncoder.
# NOTE(review): how LabelEncoder treats missing values (e.g. in 'state') depends
# on the installed scikit-learn version - TODO confirm and handle explicitly.
for column in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])



In [None]:
# Pairwise correlations between the columns of df, drawn as an annotated heatmap.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(22, 18))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()