In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Location of the raw CSV; pandas downloads it over HTTP when given a URL.
file_path = 'https://raw.githubusercontent.com/Dilum-Alahakoon/AIML-Project/refs/heads/main/data/raw/IT24104181.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the dimensions: the (rows, columns) tuple first, then each count separately.
print(f"Shape of the dataset: {df.shape}")
print(f"Number of rows : {len(df)}")
print(f"Number of columns : {len(df.columns)}")

Shape of the dataset: (2358, 14)
Number of rows : 2358
Number of columns : 14


In [6]:
# Quick look at the loaded data. The guard makes the cell a silent no-op
# when `df` has not been created yet.
if 'df' in locals():
    # Banner and preview go out in one call; sep="\n" puts the frame on the
    # line after the banner. head() returns the first five rows by default.
    print("\n--- First 5 rows of the dataset: ---", df.head(), sep="\n")

    # df.info() writes its own report to stdout, so it is called bare rather
    # than wrapped in print().
    print("\n--- Dataset Info (Columns, Data Types, Non-null counts): ---")
    df.info()


--- First 5 rows of the dataset: ---
  Survey_Date  Age  Gender   Region               Industry          Job_Role  \
0  2025-06-01   27  Female     Asia  Professional Services      Data Analyst   
1  2025-06-01   37  Female     Asia  Professional Services      Data Analyst   
2  2025-06-01   32  Female   Africa              Education  Business Analyst   
3  2025-06-01   40  Female   Europe              Education      Data Analyst   
4  2025-06-01   52    Male  Oceania       Customer Service  Business Analyst   

  Work_Arrangement  Hours_Per_Week Mental_Health_Status Burnout_Level  \
0           Onsite              64      Stress Disorder          High   
1           Onsite              37      Stress Disorder          High   
2           Onsite              36                 ADHD          High   
3           Onsite              63                 ADHD        Medium   
4           Onsite              61              Burnout        Medium   

   Work_Life_Balance_Score  Social_Isolati

In [7]:
# Frequency of each salary band, most common first (these are the defaults,
# spelled out). The distinct labels shown here are what the cleaning cells
# below have to parse.
df['Salary_Range'].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
Salary_Range,Unnamed: 1_level_1
$60K-80K,768
$80K-100K,719
$40K-60K,380
$100K-120K,336
$120K+,155


In [8]:
# Strip every '$', 'K' and '+' character from the salary label in one pass,
# e.g. '$60K-80K' -> '60-80' and '$120K+' -> '120'. Inside a character class
# '$' and '+' are literal, so no escaping is needed.
df['Salary_Range_Clean'] = df['Salary_Range'].str.replace(r'[$K+]', '', regex=True)

In [9]:
# Re-inspect the frame: 'Salary_Range_Clean' should now appear as an extra column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Survey_Date              2358 non-null   object
 1   Age                      2358 non-null   int64 
 2   Gender                   2358 non-null   object
 3   Region                   2358 non-null   object
 4   Industry                 2358 non-null   object
 5   Job_Role                 2358 non-null   object
 6   Work_Arrangement         2358 non-null   object
 7   Hours_Per_Week           2358 non-null   int64 
 8   Mental_Health_Status     2358 non-null   object
 9   Burnout_Level            2358 non-null   object
 10  Work_Life_Balance_Score  2358 non-null   int64 
 11  Social_Isolation_Score   2358 non-null   int64 
 12  Salary_Range             2358 non-null   object
 13  Physical_Health_Issues   2358 non-null   object
 14  Salary_Range_Clean       2358 non-null  

In [10]:
# Break the cleaned label at '-' into separate columns: column 0 holds the
# lower bound and column 1 the upper bound. Labels without a '-' (the
# open-ended top band) get a missing value in column 1.
salary_split = df['Salary_Range_Clean'].str.split(pat='-', expand=True)

In [11]:
# Lower bound of each band as a number. to_numeric raises on any text it
# cannot parse, so an unexpected label format fails loudly here.
min_salary = salary_split[0].pipe(pd.to_numeric)

In [12]:
# Upper bound of each band as a number. Rows with no upper bound (missing
# after the split) fall back to that row's lower bound, aligned by index.
max_salary = salary_split[1].pipe(pd.to_numeric).fillna(value=min_salary)

In [13]:
# Midpoint of the band. The 'K' suffix was stripped earlier, so the value is
# on the same scale as the original labels (e.g. '60-80' -> 70.0).
df['Average_Salary'] = min_salary.add(max_salary).div(2)

In [14]:
# Confirm the new 'Average_Salary' column was added alongside the existing ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survey_Date              2358 non-null   object 
 1   Age                      2358 non-null   int64  
 2   Gender                   2358 non-null   object 
 3   Region                   2358 non-null   object 
 4   Industry                 2358 non-null   object 
 5   Job_Role                 2358 non-null   object 
 6   Work_Arrangement         2358 non-null   object 
 7   Hours_Per_Week           2358 non-null   int64  
 8   Mental_Health_Status     2358 non-null   object 
 9   Burnout_Level            2358 non-null   object 
 10  Work_Life_Balance_Score  2358 non-null   int64  
 11  Social_Isolation_Score   2358 non-null   int64  
 12  Salary_Range             2358 non-null   object 
 13  Physical_Health_Issues   2358 non-null   object 
 14  Salary_Range_Clean      

In [15]:
# Remove the raw salary label and its intermediate cleaned copy now that
# 'Average_Salary' carries the numeric value.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone, and a plain drop would raise KeyError.
df = df.drop(columns=['Salary_Range', 'Salary_Range_Clean'], errors='ignore')

# Status line followed by a preview of the resulting frame.
print("Successfully created 'Average_Salary' feature (with error fixed).")
print(df.head())

Successfully created 'Average_Salary' feature (with error fixed).
  Survey_Date  Age  Gender   Region               Industry          Job_Role  \
0  2025-06-01   27  Female     Asia  Professional Services      Data Analyst   
1  2025-06-01   37  Female     Asia  Professional Services      Data Analyst   
2  2025-06-01   32  Female   Africa              Education  Business Analyst   
3  2025-06-01   40  Female   Europe              Education      Data Analyst   
4  2025-06-01   52    Male  Oceania       Customer Service  Business Analyst   

  Work_Arrangement  Hours_Per_Week Mental_Health_Status Burnout_Level  \
0           Onsite              64      Stress Disorder          High   
1           Onsite              37      Stress Disorder          High   
2           Onsite              36                 ADHD          High   
3           Onsite              63                 ADHD        Medium   
4           Onsite              61              Burnout        Medium   

   Work_Life_B

In [16]:
def _count_physical_issues(value):
    """Return how many issues are listed in one 'Physical_Health_Issues' cell.

    The cell is expected to hold issue names separated by ';'
    (e.g. 'Back Pain; Shoulder Pain' -> 2).

    Returns 0 when the cell is missing (NaN/None), blank, or the literal
    text 'None'. Missing values are handled explicitly because
    pandas.read_csv can convert the text 'None' to NaN through its default
    NA markers; calling .split on that float would raise AttributeError.
    Empty fragments (from a trailing or doubled ';') are not counted.
    """
    if pd.isna(value):
        return 0
    text = str(value).strip()
    if not text or text == 'None':
        return 0
    return sum(1 for fragment in text.split(';') if fragment.strip())


# Number of distinct issues reported per respondent.
df['Physical_Issue_Count'] = df['Physical_Health_Issues'].apply(_count_physical_issues)

In [17]:
# Status message only: this prints unconditionally and does not itself verify
# that the 'Physical_Issue_Count' column exists.
print("\nSuccessfully created 'Physical_Issue_Count' feature.")


Successfully created 'Physical_Issue_Count' feature.


In [18]:
# Spot-check the new count against the source text for the first five rows.
print(df.loc[:, ['Physical_Health_Issues', 'Physical_Issue_Count']].head(n=5))

      Physical_Health_Issues  Physical_Issue_Count
0   Shoulder Pain; Neck Pain                     2
1                  Back Pain                     1
2  Shoulder Pain; Eye Strain                     2
3  Shoulder Pain; Eye Strain                     2
4   Back Pain; Shoulder Pain                     2


In [19]:
# Plain-text preview of the first five rows (head() returns five by default).
print(df.head())

  Survey_Date  Age  Gender   Region               Industry          Job_Role  \
0  2025-06-01   27  Female     Asia  Professional Services      Data Analyst   
1  2025-06-01   37  Female     Asia  Professional Services      Data Analyst   
2  2025-06-01   32  Female   Africa              Education  Business Analyst   
3  2025-06-01   40  Female   Europe              Education      Data Analyst   
4  2025-06-01   52    Male  Oceania       Customer Service  Business Analyst   

  Work_Arrangement  Hours_Per_Week Mental_Health_Status Burnout_Level  \
0           Onsite              64      Stress Disorder          High   
1           Onsite              37      Stress Disorder          High   
2           Onsite              36                 ADHD          High   
3           Onsite              63                 ADHD        Medium   
4           Onsite              61              Burnout        Medium   

   Work_Life_Balance_Score  Social_Isolation_Score     Physical_Health_Issues  \

In [20]:
# List the column labels now present: the salary text columns have been dropped
# and 'Average_Salary' and 'Physical_Issue_Count' have been added.
df.columns

Index(['Survey_Date', 'Age', 'Gender', 'Region', 'Industry', 'Job_Role',
       'Work_Arrangement', 'Hours_Per_Week', 'Mental_Health_Status',
       'Burnout_Level', 'Work_Life_Balance_Score', 'Social_Isolation_Score',
       'Physical_Health_Issues', 'Average_Salary', 'Physical_Issue_Count'],
      dtype='object')

In [21]:
# Final structural check before export: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survey_Date              2358 non-null   object 
 1   Age                      2358 non-null   int64  
 2   Gender                   2358 non-null   object 
 3   Region                   2358 non-null   object 
 4   Industry                 2358 non-null   object 
 5   Job_Role                 2358 non-null   object 
 6   Work_Arrangement         2358 non-null   object 
 7   Hours_Per_Week           2358 non-null   int64  
 8   Mental_Health_Status     2358 non-null   object 
 9   Burnout_Level            2358 non-null   object 
 10  Work_Life_Balance_Score  2358 non-null   int64  
 11  Social_Isolation_Score   2358 non-null   int64  
 12  Physical_Health_Issues   2358 non-null   object 
 13  Average_Salary           2358 non-null   float64
 14  Physical_Issue_Count    

In [22]:
# Write the engineered frame to a CSV in the current working directory,
# leaving out the row index.
# NOTE(review): the output ID (IT24102770) differs from the ID in the input
# file name (IT24104181) — confirm this is intended.
df.to_csv(path_or_buf='IT24102770.csv', index=False)