# 복지에 따른 자살율 분석

In [30]:
import pandas as pd

## 데이터 전처리

In [31]:
sr = pd.read_csv('datasets/Age-standardized suicide rates.csv')
fc = pd.read_csv('datasets/Facilities.csv')
hr = pd.read_csv('datasets/Human Resources.csv')

sr.shape, fc.shape, hr.shape

((549, 6), (112, 7), (107, 6))

In [32]:
sr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  549 non-null    object 
 1   Sex      549 non-null    object 
 2   2016     549 non-null    float64
 3   2015     549 non-null    float64
 4   2010     549 non-null    float64
 5   2000     549 non-null    float64
dtypes: float64(4), object(2)
memory usage: 25.9+ KB


### 세 개의 데이터에 모두 포함되는 나라만 남기는 작업

In [33]:
col1 = hr['Country'].values
col1 = list(col1)
col2 = fc['Country'].values
col2 = list(col2)
col3 = sr['Country'].values
col3 = list(col3)
col = set(col1) & set(col2) & set(col3)
col=list(col)

In [34]:
sel = [sr['Country'][x] in col for x in range(0,sr.shape[0])]
sr_ = sr.loc[sel,:]
sel = [fc['Country'][x] in col for x in range(0,fc.shape[0])]
fc_ = fc.loc[sel,:]
sel = [hr['Country'][x] in col for x in range(0,hr.shape[0])]
hr_ = hr.loc[sel,:]
sr_.shape , fc_.shape, hr_.shape

((303, 6), (101, 7), (101, 6))

### 2016년 자살율만 추출

In [56]:
sr_df=sr_.loc[sr_['Sex']==' Both sexes', '2016']
sr_df=pd.DataFrame(sr_df.values,columns=['Suicide Rate'])
sr_df

Unnamed: 0,Suicide Rate
0,6.4
1,5.6
2,8.9
3,0.5
4,9.1
...,...
96,7.4
97,5.4
98,9.8
99,11.3


### 세 개의 데이터 하나로 병합

In [66]:
hr_df = hr_.loc[:,['Psychiatrists','Nurses','Social_workers','Psychologists']]
hr_df = pd.DataFrame(hr_df.values, columns=['Psychiatrists','Nurses','Social_workers','Psychologists'])
hr_df

Unnamed: 0,Psychiatrists,Nurses,Social_workers,Psychologists
0,0.231,0.098,,0.296
1,1.471,6.876,1.060,1.231
2,0.057,0.660,0.022,0.179
3,1.001,7.005,4.003,
4,21.705,,,222.572
...,...,...,...,...
96,2.612,9.214,,0.090
97,0.378,4.535,0.378,0.756
98,0.201,0.323,0.056,0.409
99,0.056,1.429,0.019,0.031


In [69]:
fc_df = fc_.loc[:,['Mental _hospitals','health_units','outpatient _facilities','day _treatment','residential_facilities']]
fc_df = pd.DataFrame(fc_df.values, columns=['Mental _hospitals','health_units','outpatient _facilities',
                                            'day _treatment','residential_facilities'])
fc_df

Unnamed: 0,Mental _hospitals,health_units,outpatient _facilities,day _treatment,residential_facilities
0,0.003,0.012,0.006,,
1,0.068,0.068,0.410,,0.445
2,0.011,,,,0.014
3,1.001,,,,
4,0.937,1.071,1.720,,0.152
...,...,...,...,...,...
96,0.081,0.013,0.084,0.052,
97,,0.756,2.645,,
98,0.033,0.026,0.063,,
99,0.062,0.062,0.205,,0.019


In [73]:
df = pd.concat([sr_df,hr_df,fc_df],axis=1)
df.fillna(0, inplace=True)
df

Unnamed: 0,Suicide Rate,Psychiatrists,Nurses,Social_workers,Psychologists,Mental _hospitals,health_units,outpatient _facilities,day _treatment,residential_facilities
0,6.4,0.231,0.098,0.000,0.296,0.003,0.012,0.006,0.000,0.000
1,5.6,1.471,6.876,1.060,1.231,0.068,0.068,0.410,0.000,0.445
2,8.9,0.057,0.660,0.022,0.179,0.011,0.000,0.000,0.000,0.014
3,0.5,1.001,7.005,4.003,0.000,1.001,0.000,0.000,0.000,0.000
4,9.1,21.705,0.000,0.000,222.572,0.937,1.071,1.720,0.000,0.152
...,...,...,...,...,...,...,...,...,...,...
96,7.4,2.612,9.214,0.000,0.090,0.081,0.013,0.084,0.052,0.000
97,5.4,0.378,4.535,0.378,0.756,0.000,0.756,2.645,0.000,0.000
98,9.8,0.201,0.323,0.056,0.409,0.033,0.026,0.063,0.000,0.000
99,11.3,0.056,1.429,0.019,0.031,0.062,0.062,0.205,0.000,0.019


### 데이터 생성

In [80]:
from sklearn.model_selection import train_test_split
sel = ['Suicide Rate', 'Psychiatrists', 'Nurses', 'Social_workers',
       'Psychologists', 'Mental _hospitals', 'health_units',
       'outpatient _facilities', 'day _treatment', 'residential_facilities']
X = train[sel]
y = train['Suicide Rate']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=77)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60, 10), (20, 10), (60,), (20,))

### 모델 평가 (그라디언트 부스팅이 최적)

In [95]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=5)
model.fit(X_train,y_train)
print(model.score(X_test,y_test))

0.9778673444013178


In [96]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train,y_train)
print(model.score(X_test,y_test))

0.9880815707663044


In [106]:
from sklearn.inspection import permutation_importance
result = permutation_importance(model,X,y)
pd.DataFrame(result.importances_mean)

Unnamed: 0,0
0,2.07296
1,3.764219e-06
2,0.0006556888
3,0.0001184359
4,2.382653e-06
5,0.000963929
6,-8.825904e-06
7,4.475656e-07
8,-3.70082e-05
9,1.85252e-07
