In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/kaggle/input/mental-health-in-tech-survey/survey.csv')
print("DataFrame loaded")

DataFrame loaded


In [2]:
print(f'Shape : {df.shape}')
print(f'Info : {df.info()}')
print(f'Head : {df.head}')
print(f'Describe : {df.describe()}')

Shape : (1259, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   o

In [3]:
print(f'Total missing values : {df.isnull().sum()}')

Total missing values : Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64


In [4]:
drop_columns = ["comments", "Timestamp"]
df.drop(columns = drop_columns, inplace = True)
print(f"shape : {df.shape}")

shape : (1259, 25)


In [5]:
# Clean Age outliers
df = df[(df['Age'] >= 18) & (df['Age'] <= 65)]
print(f'shape : {df.shape}')

shape : (1250, 25)


In [6]:
# Fix Gender column
# instead of keeping many entries we will classify them in 'male', 'female' and 'other/non-binary'

def clean_gender(gender):
    if not isinstance(gender, str): 
        return 'Other/Non-Binary'
    g = gender.lower()
    if 'fem' in g or 'wom' in g or g == 'f':
        return 'Female'
    elif 'mal' in g or 'man' in g or g == 'm' or 'guy' in g:
        return 'Male'
    else :
        return 'Other/Non-Binary'
df['Gender'] = df['Gender'].apply(clean_gender)
print(df['Gender'].value_counts())

Gender
Male                983
Female              250
Other/Non-Binary     17
Name: count, dtype: int64


In [7]:
# Let us now encode our features to better suit them for modelling and seeing the corrleation between them
from sklearn.preprocessing import LabelEncoder
leave_map = {'Very easy': 4, 'Somewhat easy': 3, "Don't know": 2, 'Somewhat difficult': 1, 'Very difficult': 0}
df['leave'] = df['leave'].map(leave_map)

work_interfere_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3}
df['work_interfere'] = df['work_interfere'].map(work_interfere_map)

no_employees_map = {'1-5': 0, '6-25': 1, '26-100': 2, '100-500': 3, '500-1000': 4, 'More than 1000': 5}
df['no_employees'] = df['no_employees'].map(no_employees_map)

trinary_map = {'No': 0, 'Maybe': 1, "Don't know": 1, 'Not sure': 1, 'Some of them': 1, 'Yes': 2}
trinary_cols = [
    'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor',
    'mental_health_interview', 'phys_health_interview', 'mental_vs_physical'
]
for col in trinary_cols:
    df[col] = df[col].map(trinary_map)

binary_map = {'No': 0, 'Yes': 1}
binary_cols = [
    'self_employed', 'family_history', 'treatment', 'remote_work', 
    'tech_company', 'obs_consequence'
]
for col in binary_cols:
    df[col] = df[col].map(binary_map)




In [8]:
print(df.head)

<bound method NDFrame.head of       Age  Gender         Country state  self_employed  family_history  \
0      37  Female   United States    IL            NaN               0   
1      44    Male   United States    IN            NaN               0   
2      32    Male          Canada   NaN            NaN               0   
3      31    Male  United Kingdom   NaN            NaN               1   
4      31    Male   United States    TX            NaN               0   
...   ...     ...             ...   ...            ...             ...   
1254   26    Male  United Kingdom   NaN            0.0               0   
1255   32    Male   United States    IL            0.0               1   
1256   34    Male   United States    CA            0.0               1   
1257   46  Female   United States    NC            0.0               0   
1258   25    Male   United States    IL            0.0               1   

      treatment  work_interfere  no_employees  remote_work  ...  anonymity  \
0  

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [9]:
# Let us make a new column which can combine all the company policies in a single feature "support_score"

def calculate_support_score(row):
    """
    This function calculates a support score for a single row
    based on a predefined scoring system.
    """
    score = 0
    
    # Scoring for 'benefits'
    if row['benefits'] == 2:
        score += 2
    elif row['benefits'] == 1:
        score += 1
        
    # Scoring for 'wellness_program'
    if row['wellness_program'] == 2:
        score += 2
    elif row['wellness_program'] == 1:
        score += 1

    # Scoring for 'seek_help'
    if row['seek_help'] == 2:
        score += 2
    elif row['seek_help'] == 1:
        score += 1

    # Scoring for 'anonymity'
    if row['anonymity'] == 2:
        score += 2
    elif row['anonymity'] == 1:
        score += 1
        
    # Scoring for 'care_options'
    if row['care_options'] == 2:
        score += 1

    # Scoring for 'leave'
    if row['leave'] == 4:
        score += 3
    elif row['leave'] == 3:
        score += 2
    elif row['leave'] == 1:
        score -= 1
    elif row['leave'] == 0:
        score -= 2
        
    return score

# Apply the function to each row (axis=1) to create the new column
df['support_score'] = df.apply(calculate_support_score, axis=1)

# Display the new column alongside the original features to verify
print(df[['benefits', 'wellness_program','seek_help','anonymity','care_options', 'leave', 'support_score']].head())

   benefits  wellness_program  seek_help  anonymity  care_options  leave  \
0         2                 0          2          2             1      3   
1         1                 1          1          1             0      2   
2         0                 0          0          1             0      1   
3         0                 0          0          0             2      1   
4         2                 1          1          1             0      2   

   support_score  
0              8  
1              4  
2              0  
3              0  
4              5  


In [10]:
print(df.shape)

(1250, 26)


In [11]:
df.to_csv('preprocessed_df_capstone.csv', index = False)
print("Dataframe saved")

Dataframe saved


In [12]:
print(df.info)

<bound method DataFrame.info of       Age  Gender         Country state  self_employed  family_history  \
0      37  Female   United States    IL            NaN               0   
1      44    Male   United States    IN            NaN               0   
2      32    Male          Canada   NaN            NaN               0   
3      31    Male  United Kingdom   NaN            NaN               1   
4      31    Male   United States    TX            NaN               0   
...   ...     ...             ...   ...            ...             ...   
1254   26    Male  United Kingdom   NaN            0.0               0   
1255   32    Male   United States    IL            0.0               1   
1256   34    Male   United States    CA            0.0               1   
1257   46  Female   United States    NC            0.0               0   
1258   25    Male   United States    IL            0.0               1   

      treatment  work_interfere  no_employees  remote_work  ...  leave  \
0    

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
