In [2]:
import pandas as pd 
import numpy as np
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
PATH = '/Users/kwonkyoungmi/workspaces/workspace_Python/Git_AtAiffel/datathon/data/MentalHealth'
data = pd.read_csv(os.path.join(PATH,'train.csv'))
data.columns

Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

### Data Cleaning

In [4]:
# 컬럼명 변경 딕셔너리 생성
new_column_names = {
    'Name': 'name',
    'Gender': 'gender',
    'Age': 'age',
    'City': 'city',
    'Working Professional or Student': 'occupation',
    'Profession': 'profession',
    'Academic Pressure': 'academic_pressure',
    'Work Pressure': 'work_pressure',
    'CGPA': 'cgpa',
    'Study Satisfaction': 'study_satisfaction',
    'Job Satisfaction': 'job_satisfaction',
    'Sleep Duration': 'sleep_duration',
    'Dietary Habits': 'dietary_habits',
    'Degree': 'degree',
    'Have you ever had suicidal thoughts ?': 'suicidal_thoughts', 
    'Work/Study Hours': 'work_study_hours',
    'Financial Stress': 'financial_stress',
    'Family History of Mental Illness': 'family_mh_history', # 'family_mh_history' (mh: mental health)
    'Depression': 'depression'
}

df = data.rename(columns=new_column_names)

In [5]:
# 중복값 확인
df.duplicated().sum() # 중복된 행이 있는지 확인

0

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  140700 non-null  int64  
 1   name                140700 non-null  object 
 2   gender              140700 non-null  object 
 3   age                 140700 non-null  float64
 4   city                140700 non-null  object 
 5   occupation          140700 non-null  object 
 6   profession          104070 non-null  object 
 7   academic_pressure   27897 non-null   float64
 8   work_pressure       112782 non-null  float64
 9   cgpa                27898 non-null   float64
 10  study_satisfaction  27897 non-null   float64
 11  job_satisfaction    112790 non-null  float64
 12  sleep_duration      140700 non-null  object 
 13  dietary_habits      140696 non-null  object 
 14  degree              140698 non-null  object 
 15  suicidal_thoughts   140700 non-nul

In [7]:
# 결측치 확인
df.isnull().sum()

id                         0
name                       0
gender                     0
age                        0
city                       0
occupation                 0
profession             36630
academic_pressure     112803
work_pressure          27918
cgpa                  112802
study_satisfaction    112803
job_satisfaction       27910
sleep_duration             0
dietary_habits             4
degree                     2
suicidal_thoughts          0
work_study_hours           0
financial_stress           4
family_mh_history          0
depression                 0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,id,age,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,work_study_hours,financial_stress,depression
count,140700.0,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,70349.5,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,40616.735775,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,0.0,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,35174.75,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,70349.5,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,105524.25,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,140699.0,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


#### gender

In [9]:
df['gender'].value_counts() # 원핫 인코딩 또는 male=0 female=1로 변환 필요

gender
Male      77464
Female    63236
Name: count, dtype: int64

In [10]:
df['gender'] = df['gender'].replace({'Male' : 0, 'Female' : 1})
df['gender'].value_counts()

  df['gender'] = df['gender'].replace({'Male' : 0, 'Female' : 1})


gender
0    77464
1    63236
Name: count, dtype: int64

#### age 

In [11]:
df['age'].value_counts() # 연령대별로 그룹화 필요

age
56.0    5246
49.0    5099
38.0    4564
53.0    4526
57.0    4395
47.0    4199
46.0    4080
54.0    3928
51.0    3927
18.0    3921
43.0    3856
59.0    3781
48.0    3695
45.0    3535
20.0    3515
50.0    3439
42.0    3393
24.0    3355
28.0    3338
39.0    3262
44.0    3217
33.0    3140
41.0    3140
29.0    3097
40.0    3094
58.0    2932
25.0    2931
23.0    2900
55.0    2846
35.0    2834
21.0    2742
37.0    2724
36.0    2703
19.0    2634
27.0    2613
52.0    2589
34.0    2578
31.0    2560
60.0    2501
26.0    2108
22.0    2066
30.0    1912
32.0    1785
Name: count, dtype: int64

In [12]:
df['age'].max(),df['age'].min()

(60.0, 18.0)

In [13]:
# 연령대 그룹화
df['age_group'] = pd.cut(df['age'],
                        bins=[0, 18, 26, 36, 46, 56, 66, np.inf],
                        labels=['0-17', '18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
                        right=False) # 왼쪽 경계값만 포함하고, 오른쪽 경계값은 포함 안함
df['age_group'].value_counts()

age_group
46-55    38328
36-45    33488
26-35    25965
18-25    24064
56-65    18855
0-17         0
65+          0
Name: count, dtype: int64

#### occupation-profession

In [14]:
df[df.profession.isnull()].head()

Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group
2,2,Yuvraj,0,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1,26-35
8,8,Aishwarya,1,24.0,Bangalore,Student,,2.0,,5.9,5.0,,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0,18-25
26,26,Aditya,0,31.0,Srinagar,Student,,3.0,,7.03,5.0,,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0,26-35
29,29,Kashish,1,19.0,Agra,Working Professional,,,1.0,,,5.0,More than 8 hours,Healthy,Class 12,No,5.0,2.0,No,0,18-25
30,30,Prisha,1,28.0,Varanasi,Student,,3.0,,5.59,2.0,,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1,26-35


In [15]:
#pd.set_option('display.max_colwidth', None)
df[df['occupation'] == 'Student']['profession'].unique().tolist()

[nan,
 'Civil Engineer',
 'Architect',
 'UX/UI Designer',
 'Digital Marketer',
 'Content Writer',
 'Educational Consultant',
 'Teacher',
 'Student',
 'Manager',
 'Chef',
 'Doctor',
 'Lawyer',
 'Entrepreneur',
 'Pharmacist']

In [16]:
black_list = df[df['occupation'] == 'Student']['profession'].dropna().unique().tolist()

df.loc[(df['occupation'] == 'Student') & 
         (df['profession'].isin(black_list))]

Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group
609,609,Anand,0,21.0,Ahmedabad,Student,Civil Engineer,5.0,,7.83,1.0,,5-6 hours,Healthy,MSc,Yes,11.0,3.0,Yes,1,18-25
2105,2105,Shreya,1,31.0,Vadodara,Student,Architect,5.0,,6.95,3.0,,Less than 5 hours,Moderate,BSc,No,8.0,1.0,Yes,1,26-35
9483,9483,Vidya,1,32.0,Kalyan,Student,UX/UI Designer,3.0,,9.6,2.0,,7-8 hours,Moderate,PhD,Yes,5.0,3.0,No,1,26-35
11470,11470,Anjali,1,28.0,Ahmedabad,Student,Digital Marketer,5.0,,9.72,3.0,,More than 8 hours,Healthy,MA,Yes,10.0,5.0,Yes,1,26-35
12425,12425,Rupak,0,25.0,Kalyan,Student,Content Writer,5.0,,8.5,2.0,,5-6 hours,Moderate,B.Ed,Yes,0.0,2.0,Yes,1,18-25
16196,16196,Ivaan,0,33.0,Jaipur,Student,Architect,2.0,,7.13,4.0,,More than 8 hours,Moderate,MSc,No,2.0,5.0,Yes,1,26-35
16959,16959,Vidhi,1,24.0,Mumbai,Student,Architect,3.0,,8.89,5.0,,More than 8 hours,Unhealthy,MSc,Yes,3.0,5.0,Yes,1,18-25
18291,18291,Ritvik,0,32.0,Agra,Student,Educational Consultant,3.0,,5.74,4.0,,7-8 hours,Healthy,B.Ed,Yes,3.0,5.0,Yes,1,26-35
19226,19226,Rupak,0,31.0,Rajkot,Student,Teacher,3.0,,7.48,5.0,,5-6 hours,Unhealthy,MD,Yes,12.0,2.0,Yes,1,26-35
20049,20049,Himani,1,24.0,Kolkata,Student,Student,1.0,,7.32,4.0,,More than 8 hours,Moderate,MA,Yes,8.0,3.0,Yes,0,18-25


In [17]:
df_1 = df.copy()

df_1['occupation'] = df_1['occupation'].replace({'Working Professional' : 'professional', 'Student' : 'student'})
df_1['occupation'].value_counts()

occupation
professional    112799
student          27901
Name: count, dtype: int64

In [18]:
# occupation이 'student'인 경우 profession에 값을 모두 'student'로 변경
black_list = df_1[df_1['occupation'] == 'student']['profession'].unique().tolist()

df_1.loc[(df_1['occupation'] == 'student') & 
         (df_1['profession'].isin(black_list)),
         'profession'] = 'student'

df_1[df_1['occupation'] == 'student'].groupby('occupation')['profession'].unique().to_list()

[array(['student'], dtype=object)]

In [19]:
df_1[df_1['occupation'] == 'professional']['profession'].unique().tolist()
#df_1['profession'].unique()

['Chef',
 'Teacher',
 'Business Analyst',
 'Finanancial Analyst',
 'Chemist',
 'Electrician',
 'Software Engineer',
 'Data Scientist',
 'Plumber',
 'Marketing Manager',
 'Accountant',
 'Entrepreneur',
 'HR Manager',
 'UX/UI Designer',
 'Content Writer',
 nan,
 'Educational Consultant',
 'Civil Engineer',
 'Manager',
 'Pharmacist',
 'Financial Analyst',
 'Architect',
 'Mechanical Engineer',
 'Customer Support',
 'Consultant',
 'Judge',
 'Researcher',
 'Pilot',
 'Graphic Designer',
 'Travel Consultant',
 'Digital Marketer',
 'Lawyer',
 'Research Analyst',
 'Sales Executive',
 'Doctor',
 'Unemployed',
 'Investment Banker',
 'Family Consultant',
 'B.Com',
 'BE',
 'Student',
 'Yogesh',
 'Dev',
 'MBA',
 'LLM',
 'BCA',
 'Academic',
 'Profession',
 'FamilyVirar',
 'City Manager',
 'BBA',
 'Medical Doctor',
 'Working Professional',
 'MBBS',
 'Patna',
 'Unveil',
 'B.Ed',
 'Nagpur',
 'Moderate',
 'M.Ed',
 'Analyst',
 'Pranav',
 'Visakhapatnam',
 'PhD',
 'Yuvraj']

In [20]:
df_1['profession'] = df_1['profession'].replace('Finanancial Analyst', 'Financial Analyst')

valid_professions = [
    'CHEF', 'TEACHER', 'BUSINESS ANALYST', 'FINANCIAL ANALYST', 'CHEMIST',
    'ELECTRICIAN', 'SOFTWARE ENGINEER', 'DATA SCIENTIST', 'PLUMBER',
    'MARKETING MANAGER', 'ACCOUNTANT', 'ENTREPRENEUR', 'HR MANAGER',
    'UX/UI DESIGNER', 'CONTENT WRITER', 'EDUCATIONAL CONSULTANT',
    'CIVIL ENGINEER', 'MANAGER', 'PHARMACIST', 'ARCHITECT',
    'MECHANICAL ENGINEER', 'CUSTOMER SUPPORT', 'CONSULTANT', 'JUDGE',
    'RESEARCHER', 'PILOT', 'GRAPHIC DESIGNER', 'TRAVEL CONSULTANT',
    'DIGITAL MARKETER', 'LAWYER', 'RESEARCH ANALYST', 'SALES EXECUTIVE',
    'DOCTOR', 'UNEMPLOYED', 'INVESTMENT BANKER', 'FAMILY CONSULTANT',
    'ACADEMIC', 'CITY MANAGER', 'MEDICAL DOCTOR', 'ANALYST', 'STUDENT', 'WORKING PROFESSIONAL'
    # 'STUDENT'와 'WORKING PROFESSIONAL'은 'occupation' 컬럼에 속해야 하므로 우선 살려 두고 확인하여 NaN 처리 또는 occupation 컬럼으로 이동
    # 학위 (B.Com, BE, MBA 등), 이름 (Yogesh, Dev 등), 도시 (Patna, Nagpur 등),
    # 그 외 의미 없는 값 (Profession, FamilyVirar, Unveil, Moderate) 등은 모두 제외
]

incorrect_profession_values = [
    'B.Com','BE','MBA','LLM','BCA','BBA','MBBS','B.Ed','M.Ed','PhD',# 학위/학력 (Degree 컬럼에 속해야 할 값)
    'Yogesh','Dev','Pranav','Yuvraj', # 사람 이름 (Name 컬럼에 속해야 할 값)
    'Patna','Nagpur','Visakhapatnam', # 도시 이름 (City 컬럼에 속해야 할 값)
    'Profession',        # 컬럼 이름 자체
    'FamilyVirar',       # 불분명한 값, 직업으로 보이지 않음
    'Unveil',            # 불분명한 값
    'Moderate',          # 압력 수준 등을 나타내는 값일 가능성 높음
    #'Working Professional', # 'occupation' 컬럼에 속해야 할 값 => 우선 살려 두고 확인
    #'Student',           # 'occupation' 컬럼에 속해야 할 값 => 우선 살려 두고 확인
    np.nan               # 결측값 (오류는 아니지만 처리 필요)
]

df_2 = df_1.copy()
df_2['profession'] = df_2['profession'].fillna('').astype(str).str.upper()
df_2.loc[~df_2['profession'].isin(valid_professions), 'profession'] = np.nan


df_2['profession'] = df_2['profession'].astype(str).str.lower().replace('nan',np.nan)
df_2[df_2['occupation'] == 'professional']['profession'].unique().tolist()

['chef',
 'teacher',
 'business analyst',
 'financial analyst',
 'chemist',
 'electrician',
 'software engineer',
 'data scientist',
 'plumber',
 'marketing manager',
 'accountant',
 'entrepreneur',
 'hr manager',
 'ux/ui designer',
 'content writer',
 nan,
 'educational consultant',
 'civil engineer',
 'manager',
 'pharmacist',
 'architect',
 'mechanical engineer',
 'customer support',
 'consultant',
 'judge',
 'researcher',
 'pilot',
 'graphic designer',
 'travel consultant',
 'digital marketer',
 'lawyer',
 'research analyst',
 'sales executive',
 'doctor',
 'unemployed',
 'investment banker',
 'family consultant',
 'student',
 'academic',
 'city manager',
 'medical doctor',
 'working professional',
 'analyst']

In [21]:
df_2[df_2['profession'] == 'working professional'] # 삭제 할 것
df_3 = df_2[~(df_2['profession'] == 'working professional')]

In [22]:
# profession을 'student'으로 작성 후 아카데미 pressure를 work_pressure로 작성한 경우 우선 살려 둠 => 추후 profession과 occupation을 통합할 예정 
df_3.loc[(df_3['occupation'] == 'professional') & (df_3['profession'] == 'student')] 

Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group
11509,11509,Prisha,1,22.0,Thane,professional,student,,3.0,,,2.0,7-8 hours,Moderate,B.Com,Yes,5.0,2.0,No,0,18-25
25447,25447,Gaurav,0,60.0,Pune,professional,student,,3.0,,,2.0,5-6 hours,Unhealthy,M.Pharm,No,8.0,1.0,Yes,0,56-65
73447,73447,Harsha,0,35.0,Ahmedabad,professional,student,,4.0,,,2.0,Less than 5 hours,Moderate,BSc,No,1.0,3.0,Yes,0,26-35
88617,88617,Ayush,0,29.0,Srinagar,professional,student,,4.0,,,2.0,5-6 hours,Moderate,BE,No,10.0,1.0,Yes,0,26-35


In [23]:
df_3[df_3['occupation'] == 'student']['profession'].unique().tolist()

['student']

In [24]:
df_3[df_3['occupation'] != 'student']['profession'].unique().tolist()

['chef',
 'teacher',
 'business analyst',
 'financial analyst',
 'chemist',
 'electrician',
 'software engineer',
 'data scientist',
 'plumber',
 'marketing manager',
 'accountant',
 'entrepreneur',
 'hr manager',
 'ux/ui designer',
 'content writer',
 nan,
 'educational consultant',
 'civil engineer',
 'manager',
 'pharmacist',
 'architect',
 'mechanical engineer',
 'customer support',
 'consultant',
 'judge',
 'researcher',
 'pilot',
 'graphic designer',
 'travel consultant',
 'digital marketer',
 'lawyer',
 'research analyst',
 'sales executive',
 'doctor',
 'unemployed',
 'investment banker',
 'family consultant',
 'student',
 'academic',
 'city manager',
 'medical doctor',
 'analyst']

In [25]:
df_3['occupation'].isnull().sum()

0

In [26]:
# df_3['profession'].isnull().groupby(df_3['city']).sum().sort_values(ascending=False)
# 직업이 없는 경우 'unemployed'로 변경
df_3.loc[df_3['profession'].isnull(), 'profession'] = 'unemployed'
# 직업이 없는 경우 'unemployed'로 변경 후 다시 확인
df_3['profession'].isnull().sum()

0

#### pressure & satisfaction
`academic_pressure`와 `work_pressure` 통합하기 => `study_work_pressure`   
`study_satisfaction`와 `job satisfaction` 통합하기 => `study_work_satistaction`      

In [27]:
# academic_pressure와 work_pressure 동시에 들어간 컬럼 있는지 확인
df_3[df_3['academic_pressure'].notna() & df_3['work_pressure'].notna()] # 없음 확인

Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group


In [28]:
# study_satisfaction와 job_satisfaction 동시에 들어간 컬럼 있는지 확인
df_3[df_3['study_satisfaction'].notna() & df_3['job_satisfaction'].notna()] # 2개의 레코드 => 삭제

df_4 = df_3[~(df_3['study_satisfaction'].notna() & df_3['job_satisfaction'].notna())]
df_4[df_4['study_satisfaction'].notna() & df_4['job_satisfaction'].notna()] # 삭제 완료 확인


Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group


In [29]:
# 'academic_pressure','work_pressure','study_satisfaction','job_satisfaction'
cols_to_fill_zero = [
    'academic_pressure',
    'work_pressure',
    'study_satisfaction',
    'job_satisfaction'
]

df_4[cols_to_fill_zero] = df_4[cols_to_fill_zero].fillna(0)
df_4[cols_to_fill_zero].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4[cols_to_fill_zero] = df_4[cols_to_fill_zero].fillna(0)


academic_pressure     0
work_pressure         0
study_satisfaction    0
job_satisfaction      0
dtype: int64

In [67]:
df_4[df_4['job_satisfaction'].isna()]

Unnamed: 0,id,name,gender,age,city,occupation,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,work_study_hours,financial_stress,family_mh_history,depression,age_group,study_work_pressure,study_work_satisfaction


In [68]:
df_4['work_study_pressure'] = df_4['academic_pressure'] + df_4['work_pressure']
df_4['work_study_satisfaction'] = df_4['study_satisfaction'] + df_4['job_satisfaction']
df_4['work_study_pressure'].isna().sum(), df_4['study_work_satisfaction'].isna().sum() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4['work_study_pressure'] = df_4['academic_pressure'] + df_4['work_pressure']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4['work_study_satisfaction'] = df_4['study_satisfaction'] + df_4['job_satisfaction']


(0, 0)

In [69]:
df_5 = df_4[['id', 'name', 'gender', 'age','age_group','city', 'profession', 'cgpa','degree',
       'work_study_pressure','work_study_satisfaction', 'financial_stress',
       'sleep_duration','dietary_habits', 'work_study_hours',
       'family_mh_history','suicidal_thoughts','depression'
       ]]

In [70]:
df_5.head()

Unnamed: 0,id,name,gender,age,age_group,city,profession,cgpa,degree,work_study_pressure,work_study_satisfaction,financial_stress,sleep_duration,dietary_habits,work_study_hours,family_mh_history,suicidal_thoughts,depression
0,0,Aaradhya,1,49.0,46-55,Ludhiana,chef,,BHM,5.0,2.0,2.0,More than 8 hours,Healthy,1.0,No,No,0
1,1,Vivan,0,26.0,26-35,Varanasi,teacher,,LLB,4.0,3.0,3.0,Less than 5 hours,Unhealthy,7.0,No,Yes,1
2,2,Yuvraj,0,33.0,26-35,Visakhapatnam,student,8.97,B.Pharm,5.0,2.0,1.0,5-6 hours,Healthy,3.0,No,Yes,1
3,3,Yuvraj,0,22.0,18-25,Mumbai,teacher,,BBA,5.0,1.0,1.0,Less than 5 hours,Moderate,10.0,Yes,Yes,1
4,4,Rhea,1,30.0,26-35,Kanpur,business analyst,,BBA,1.0,1.0,4.0,5-6 hours,Unhealthy,9.0,Yes,Yes,0


#### CGPA

In [71]:
df_5.isna().sum()

id                              0
name                            0
gender                          0
age                             0
age_group                       0
city                            0
profession                      0
cgpa                       112801
degree                          2
work_study_pressure             0
work_study_satisfaction         0
financial_stress                4
sleep_duration                  0
dietary_habits                  4
work_study_hours                0
family_mh_history               0
suicidal_thoughts               0
depression                      0
dtype: int64

In [72]:
len(df_5[~(df_5['profession'] == 'student')]['cgpa']) # 112794
df_5[~(df_5['profession'] == 'student')]['cgpa'].isna().sum() #112784


112788

In [73]:
df_5[~(df_5['profession'] == 'student') & df_5['cgpa'].notna()]

# student가 아닌 경우, CGPA가 없어야 함 ; 있으면 이상치 간주 => NaN 처리
df_5.loc[~(df_5['profession'] == 'student') & df_5['cgpa'].notna(), 'cgpa'] = np.nan
df_5[~(df_5['profession'] == 'student') & df_5['cgpa'].notna()] # 이상치 확인 완료

Unnamed: 0,id,name,gender,age,age_group,city,profession,cgpa,degree,work_study_pressure,work_study_satisfaction,financial_stress,sleep_duration,dietary_habits,work_study_hours,family_mh_history,suicidal_thoughts,depression


In [74]:
df_5[(df_5['profession'] == 'student') & df_5['cgpa'].isna()]['city'].value_counts()
"""
# student인 경우, CGPA가 있어야 함 ; 없으면 이상치 간주 => 삭제 (지역별 결측치가 적어서 삭제하기로 결정 )
Ahmedabad    3
Vadodara     2
Pune         2
Thane        1
Chennai      1
Lucknow      1
Srinagar     1
Rajkot       1
Meerut       1
"""

# student인 경우, CGPA가 없는 경우 삭제
df_6 = df_5[~((df_5['profession'] == 'student') & df_5['cgpa'].isna())]

In [75]:
df_6[df_6['profession'] == 'student']['cgpa'].isna().sum() # 0 확인
df_6[df_6['profession'] != 'student']['cgpa'].isna().sum() == len(df_6[df_6['profession'] != 'student'])

True

In [76]:
len(df_6['profession'])

140684

In [77]:
# NaN 처리 => 0으로 변경
df_6['cgpa'] = df_6['cgpa'].fillna(0)
df_6['cgpa'].isna().sum() # 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_6['cgpa'] = df_6['cgpa'].fillna(0)


0

#### sleep duration

In [78]:
df_6['sleep_duration'].value_counts()

sleep_duration
Less than 5 hours    38777
7-8 hours            36964
More than 8 hours    32725
5-6 hours            32139
3-4 hours               12
6-7 hours                8
4-5 hours                7
4-6 hours                5
2-3 hours                5
6-8 hours                4
No                       4
1-6 hours                4
10-11 hours              2
9-11 hours               2
8-9 hours                2
Sleep_Duration           2
Unhealthy                2
45                       2
40-45 hours              1
1-2 hours                1
1-3 hours                1
9-6 hours                1
55-66 hours              1
Moderate                 1
35-36 hours              1
8 hours                  1
10-6 hours               1
Indore                   1
than 5 hours             1
49 hours                 1
Work_Study_Hours         1
3-6 hours                1
45-48 hours              1
9-5                      1
Pune                     1
9-5 hours                1
Name: count, 

In [79]:
df_7 = df_6.copy()
df_7['sleep_duration'] = df_7['sleep_duration'].replace({'Less than 5 hours': 'insufficient', 
                                                         '5-6 hours': 'insufficient', 
                                                         '7-8 hours': 'optimal', 
                                                         'More than 8 hours': 'excessive'})
df_7['sleep_duration'].value_counts()

sleep_duration
insufficient        70916
optimal             36964
excessive           32725
3-4 hours              12
6-7 hours               8
4-5 hours               7
4-6 hours               5
2-3 hours               5
6-8 hours               4
No                      4
1-6 hours               4
10-11 hours             2
9-11 hours              2
8-9 hours               2
Sleep_Duration          2
Unhealthy               2
45                      2
40-45 hours             1
1-2 hours               1
1-3 hours               1
9-6 hours               1
55-66 hours             1
Moderate                1
35-36 hours             1
8 hours                 1
10-6 hours              1
Indore                  1
than 5 hours            1
49 hours                1
Work_Study_Hours        1
3-6 hours               1
45-48 hours             1
9-5                     1
Pune                    1
9-5 hours               1
Name: count, dtype: int64

In [80]:
# sleep_duration value_counts()가 5개 이하 인 데이터 삭제

# 5개 이하인 sleep_duration
a = df_7['sleep_duration'].value_counts()[df_7['sleep_duration'].value_counts() <= 12].index

df_7 = df_7[~df_7['sleep_duration'].isin(a)]
df_7['sleep_duration'].value_counts()

sleep_duration
insufficient    70916
optimal         36964
excessive       32725
Name: count, dtype: int64

#### Dietary_habits

In [81]:
df_7['dietary_habits'].value_counts()

# dietary_habits value_counts()가 5개 이하 인 데이터 삭제
# 5개 이하인 dietary_habits

b = df_7['dietary_habits'].value_counts()[df_7['dietary_habits'].value_counts() <= 5].index
df_7 = df_7[~(df_7['dietary_habits'].isin(b))]
df_7['dietary_habits'].value_counts()

dietary_habits
Moderate     49661
Unhealthy    46196
Healthy      44721
Name: count, dtype: int64

In [82]:
df_7[df_7['dietary_habits'].isna()]

Unnamed: 0,id,name,gender,age,age_group,city,profession,cgpa,degree,work_study_pressure,work_study_satisfaction,financial_stress,sleep_duration,dietary_habits,work_study_hours,family_mh_history,suicidal_thoughts,depression
59350,59350,Tanmay,0,34.0,26-35,Agra,student,9.04,B.Tech,5.0,1.0,5.0,excessive,,4.0,Yes,Yes,1
64565,64565,Anvi,1,23.0,18-25,Bangalore,student,6.91,M.Pharm,2.0,5.0,3.0,insufficient,,6.0,No,No,1
69581,69581,Vikram,0,48.0,46-55,Hyderabad,consultant,0.0,MBA,3.0,2.0,4.0,insufficient,,5.0,No,No,0
76436,76436,Raghavendra,0,37.0,36-45,Nashik,civil engineer,0.0,ME,3.0,3.0,1.0,optimal,,11.0,No,No,0


In [83]:
df_7 = df_7.dropna(subset=['dietary_habits'])
df_7['dietary_habits'].isna().sum() # 0 확인

0

#### degree

In [84]:
df_7['degree'].unique().tolist()

['BHM',
 'LLB',
 'B.Pharm',
 'BBA',
 'MCA',
 'MD',
 'BSc',
 'ME',
 'B.Arch',
 'BCA',
 'BE',
 'MA',
 'B.Ed',
 'B.Com',
 'MBA',
 'M.Com',
 'MHM',
 'BA',
 'Class 12',
 'M.Tech',
 'PhD',
 'M.Ed',
 'MSc',
 'B.Tech',
 'LLM',
 'MBBS',
 'M.Pharm',
 'UX/UI Designer',
 'MPA',
 'BH',
 'Nalini',
 'BEd',
 'B.Sc',
 'Veda',
 'Bhopal',
 'S.Tech',
 'Degree',
 '20',
 'Class 11',
 'H_Pharm',
 'M',
 'P.Com',
 'BPharm',
 'Business Analyst',
 'M.Arch',
 'LL.Com',
 'Data Scientist',
 'MPharm',
 'L.Ed',
 'P.Pharm',
 'Kalyan',
 'Unite',
 'BArch',
 'HR Manager',
 'Badhya',
 'S.Pharm',
 'LLBA',
 'Vrinda',
 'M. Business Analyst',
 'Bhavesh',
 '0',
 'LLCom',
 '29',
 'MTech',
 'Vivaan',
 'BPA',
 'Plumber',
 '5.61',
 'Brit',
 'B.03',
 'Ritik',
 '5.56',
 'MEd',
 'B',
 'B BA',
 '7.06',
 'B.B.Arch',
 'ACA',
 'Brithika',
 'CGPA',
 '24',
 'M_Tech',
 'Pihu',
 'BB',
 'Jhanvi',
 'LLTech',
 'Aarav',
 'Entrepreneur',
 '8.56',
 'LHM',
 'Lata',
 'S.Arch',
 'Marsh',
 'HCA',
 '5.88',
 'B.Student',
 'LL B.Ed',
 'M.S',
 'Navya',
 '

In [85]:
degree_mapping = {
    'B.Sc': 'BSc',
    'B.Pharm': 'BPharm',
    'M.Pharm': 'MPharm', # M.Pharm과 MPharm 통일
    'B.Arch': 'BArch',
    'M.Arch': 'MArch', # M.Arch와 MArch 통일
    'B.Ed': 'BEd',
    'M.Ed': 'MEd',
    'B.Tech': 'BTech', # B.Tech와 MTech 통일
    'M.Tech': 'MTech', # M.Tech와 MTech 통일
    'B.Com': 'BCom',
    'M.Com': 'MCom',
    'M.Business Analyst': 'MBA', # 직업명이지만 학위로 분류될 가능성이 있어 MBA로 일단 통일
    'MBBS': 'MBBS', # 이미 통일되어 있지만 명시
    'MD': 'MD', # 이미 통일되어 있지만 명시
    'PhD': 'PhD', # 이미 통일되어 있지만 명시
    'Class 12': 'HighSchool', # Class 12와 12th 통일
    'Class 11': 'HighSchool', # Class 10과 10th 통일
    'M_Tech' : 'MTech', # M_Tech와 MTech 통일
}

df_7['degree'] = df_7['degree'].replace(degree_mapping)

outliers = [
    'Bhopal', 'Kalyan', # 도시/지역명
    '20', '29', '0', '5.61', '5.56', '7.06', '8.56', '24', '5.88', 'B.03', 'B.3.79', # 숫자 (나이, CGPA 등)
    'Nalini', 'Veda', 'Nalini', 'Vrinda', 'Bhavesh', 'Ritik', 'Brithika', 'Pihu', # 이름/사람
    'Jhanvi', 'Aarav', 'Lata', 'Navya', 'Mahika', 'Mthanya', 'Esha', 'Mihir', 'Advait', # 이름/사람    
    'Degree', 'CGPA', 'M', 'B', 'BB', 'ACA', 'HCA', 'LLS', 'RCA', # 컬럼명/기타 명칭
    'UX/UI Designer', 'Business Analyst', 'HR Manager', 'M. Business Analyst', 'Data Scientist', # 직업명 (Profession 컬럼에 있어야 할 값)
    'Plumber', 'Entrepreneur', 'Doctor', 'Working Professional', # 직업명 (Profession 컬럼에 있어야 할 값)
    'S.Tech', 'H_Pharm', 'P.Com', 'LL.Com', 'L.Ed', 'P.Pharm', 'S.Pharm', 'LLBA', # 알 수 없는 약어 또는 오타/불완전한 학위명
    'LLCom', 'BPA', 'LLTech', 'LHM', 'S.Arch', 'B.Student', 'LL B.Ed', 'M.S', # 알 수 없는 약어 또는 오타/불완전한 학위명
    'K.Ed', 'LLEd', 'E.Tech', 'N.Pharm', 'LCA', 'B BA', 'B.B.Arch', 'Unite', # 알 수 없는 약어 또는 오타/불완전한 학위명
    'Badhya', 'Vivaan', 'Brit', 'B B.Com','Marsh' # 알 수 없는 약어 또는 오타/불완전한 학위명
    ]

# degree 컬럼에서 NaN 처리
df_7['degree'] = df_7['degree'].replace(outliers, np.nan)


In [86]:
df_7['degree'] = df_7['degree'].fillna('Unknown') # NaN을 'Unknown'으로 대체

In [87]:
df_7['degree'].value_counts()

degree
HighSchool    14719
BEd           11689
BArch          8742
BCom           8108
BPharm         5854
BCA            5735
MEd            5663
MCA            5225
BBA            5028
BSc            5020
MSc            4876
LLM            4642
MPharm         4535
MTech          4473
BTech          4424
LLB            4344
BHM            4305
MBA            3812
BA             3741
ME             3626
MD             3381
MHM            3283
BE             3099
PhD            3099
MCom           3091
MBBS           3077
MA             2886
Unknown          94
MArch             5
MPA               1
BH                1
Name: count, dtype: int64

#### suicidal_thoughts 

In [88]:
df_7['suicidal_thoughts'].value_counts()
df_7['suicidal_thoughts'] = df_7['suicidal_thoughts'].replace({'Yes': 1, 'No': 0})
df_7['suicidal_thoughts'].value_counts()

  df_7['suicidal_thoughts'] = df_7['suicidal_thoughts'].replace({'Yes': 1, 'No': 0})


suicidal_thoughts
0    71085
1    69493
Name: count, dtype: int64

#### family history

In [89]:
df_7['family_mh_history'].value_counts()
df_7['family_mh_history'] = df_7['family_mh_history'].replace({'Yes': 1, 'No': 0})
df_7['family_mh_history'].value_counts()

  df_7['family_mh_history'] = df_7['family_mh_history'].replace({'Yes': 1, 'No': 0})


family_mh_history
0    70688
1    69890
Name: count, dtype: int64

#### work_study_hours

In [90]:
df_7['work_study_hours'].value_counts()

work_study_hours
10.0    14188
11.0    12821
9.0     12701
0.0     12059
12.0    11398
2.0     10588
6.0     10420
7.0      9867
1.0      9793
3.0      9463
5.0      9329
4.0      9053
8.0      8898
Name: count, dtype: int64

#### financial_stress

In [91]:
df_7['financial_stress'].isna().sum() # 4
df_7[df_7['financial_stress'].isna()]

Unnamed: 0,id,name,gender,age,age_group,city,profession,cgpa,degree,work_study_pressure,work_study_satisfaction,financial_stress,sleep_duration,dietary_habits,work_study_hours,family_mh_history,suicidal_thoughts,depression
22377,22377,Manvi,1,32.0,26-35,Varanasi,student,5.64,BCA,3.0,1.0,,insufficient,Healthy,12.0,0,0,1
51485,51485,Ishwar,0,37.0,36-45,Hyderabad,unemployed,0.0,HighSchool,4.0,2.0,,excessive,Moderate,9.0,0,1,0
68910,68910,Arav,0,29.0,26-35,Hyderabad,student,8.94,BEd,2.0,3.0,,insufficient,Unhealthy,12.0,1,0,0
97610,97610,Pari,1,20.0,18-25,Kolkata,student,6.83,MBBS,1.0,1.0,,insufficient,Healthy,9.0,1,0,0


In [92]:
df_7.groupby('profession')['financial_stress'].mean().loc[['student', 'unemployed']] # 0.0

profession
student       3.140001
unemployed    3.127616
Name: financial_stress, dtype: float64

In [93]:
df_7.loc[
    (df_7['profession'] == 'student') & (df_7['financial_stress'].isna()),
    'financial_stress'
] = df_7.groupby('profession')['financial_stress'].mean().loc['student']
df_7[(df_7['profession'] == 'student') & (df_7['financial_stress'].isna())] # NaN 처리 완료

df_7.loc[
    (df_7['profession'] == 'unemployed') & (df_7['financial_stress'].isna()),
    'financial_stress'
] = df_7.groupby('profession')['financial_stress'].mean().loc['unemployed']
df_7[(df_7['profession'] == 'unemployed') & (df_7['financial_stress'].isna())] # NaN 처리 완료
# financial_stress NaN 처리 완료
df_7['financial_stress'].isna().sum() # 0 확인

0

#### City

In [94]:
df_7['city'].unique()

array(['Ludhiana', 'Varanasi', 'Visakhapatnam', 'Mumbai', 'Kanpur',
       'Ahmedabad', 'Thane', 'Nashik', 'Bangalore', 'Patna', 'Rajkot',
       'Jaipur', 'Pune', 'Lucknow', 'Meerut', 'Agra', 'Surat',
       'Faridabad', 'Hyderabad', 'Srinagar', 'Ghaziabad', 'Kolkata',
       'Chennai', 'Kalyan', 'Nagpur', 'Vadodara', 'Vasai-Virar', 'Delhi',
       'Bhopal', 'Indore', 'Ishanabad', 'Vidhi', 'Ayush', 'Gurgaon',
       'Krishna', 'Aishwarya', 'Keshav', 'Harsha', 'Nalini', 'Aditya',
       'Malyansh', 'Raghavendra', 'Saanvi', 'M.Tech', 'Bhavna',
       'Less Delhi', 'Nandini', 'M.Com', 'Plata', 'Atharv', 'Pratyush',
       'City', '3.0', 'Less than 5 Kalyan', 'MCA', 'Mira', 'Moreadhyay',
       'Morena', 'Ishkarsh', 'Kashk', 'Mihir', 'Vidya', 'Tolkata', 'Anvi',
       'Krinda', 'Ayansh', 'Shrey', 'Ivaan', 'Vaanya', 'Gaurav', 'Harsh',
       'Reyansh', 'Kashish', 'Kibara', 'Vaishnavi', 'Chhavi', 'Parth',
       'Mahi', 'Tushar', 'MSc', 'No', 'Rashi', 'ME', 'Molkata',
       'Researcher', '

In [95]:
a = df_7['city'].value_counts().sort_values(ascending=True)
a[a<7].index.tolist()

['Unirar',
 'Parth',
 'Kashish',
 'Kibara',
 'Chhavi',
 'Vaishnavi',
 'Tushar',
 'MSc',
 'Ivaan',
 'Vaanya',
 'Gaurav',
 'Harsh',
 'Reyansh',
 'Ishanabad',
 'Vidhi',
 'Shrey',
 'Moreadhyay',
 'Morena',
 'Kashk',
 'Ishkarsh',
 'Anvi',
 'Krinda',
 'Ayansh',
 'Tolkata',
 'Mira',
 'Less than 5 Kalyan',
 '3.0',
 'Gurgaon',
 'Aishwarya',
 'Krishna',
 'Aditya',
 'Galesabad',
 'Itheg',
 'Khaziabad',
 'Malyansh',
 'Raghavendra',
 'M.Tech',
 'Aaradhya',
 'Dhruv',
 'Nalyan',
 'Khushi',
 'Kagan',
 'Researcher',
 'ME',
 'Rashi',
 'Less Delhi',
 'Plata',
 'Pooja',
 'Ithal',
 'No',
 'Jhanvi',
 'Armaan',
 'Nalini',
 'Keshav',
 'Molkata',
 'Atharv',
 'Ayush',
 'MCA',
 'M.Com',
 'City',
 'Harsha',
 'Vidya',
 'Pratyush',
 'Saanvi',
 'Mahi',
 'Bhavna',
 'Nandini']

In [96]:
urban = ['Mumbai','Delhi','Bangalore','Hyderabad','Chennai','Ahmedabad'] # ,'Pune'
rural = ['Srinagar','Varanasi','Rajkot','Agra','Meerut']  #,'Ludhiana','Visakhapatnam'
total = urban + rural


a = df_7[df['city'].isin(urban)].shape[0]; b = df_7[df_7['city'].isin(rural)].shape[0]
print(a, b)
print(a/(b + a) * 100)
print(b/(b + a) * 100)
print(b-a)


26808 25072
51.673091750192754
48.326908249807246
-1736


  a = df_7[df['city'].isin(urban)].shape[0]; b = df_7[df_7['city'].isin(rural)].shape[0]


In [97]:
df_8 = df_7.copy()
df_8['urban_rural'] = df_8['city'].apply(lambda x: 'urban' if x in urban else ('rural' if x in rural else 'other'))

In [98]:
df_final = df_8[df_8['urban_rural'] != 'other']
df_final.info() # 최종 데이터셋 확인

<class 'pandas.core.frame.DataFrame'>
Index: 51880 entries, 1 to 140698
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   id                       51880 non-null  int64   
 1   name                     51880 non-null  object  
 2   gender                   51880 non-null  int64   
 3   age                      51880 non-null  float64 
 4   age_group                51880 non-null  category
 5   city                     51880 non-null  object  
 6   profession               51880 non-null  object  
 7   cgpa                     51880 non-null  float64 
 8   degree                   51880 non-null  object  
 9   work_study_pressure      51880 non-null  float64 
 10  work_study_satisfaction  51880 non-null  float64 
 11  financial_stress         51880 non-null  float64 
 12  sleep_duration           51880 non-null  object  
 13  dietary_habits           51880 non-null  object  
 14  work_study

In [99]:
df_final.to_csv(os.path.join(PATH, 'df_final.csv'), index=False)

In [100]:
df_8.to_csv(os.path.join(PATH, 'df_cleaned.csv'), index=False)