In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

df=pd.read_csv('부상 데이터셋.csv', encoding='UTF-8')

In [7]:
df
print(df.columns)

print(df['Injury Location'].unique())
#df.count()

Index(['Date', 'ID', 'Gender', 'Age', 'Shift', 'Unnamed: 5', 'Injury Location',
       'Incident Type', 'Days Lost', 'Incident Cost'],
      dtype='object')
['Trunk' 'Abdomen' 'Back' 'Legs' 'Neck' 'Head' 'Multiple' 'Eye' nan 'Arms'
 'Hands' 'Feet']


In [8]:
df.dropna(inplace = True)
df['Incident Cost'] = df['Incident Cost'].str.replace(pat =r'[^\w]', repl=r'', regex=True ) #특수문자 제거


bins = [0,10,20,30,40,50]

df['Age_grouping'] = np.digitize(df['Age'], bins)

df

Unnamed: 0,Date,ID,Gender,Age,Shift,Unnamed: 5,Injury Location,Incident Type,Days Lost,Incident Cost,Age_grouping
0,2020-01-01,438312,Male,25,Night,True,Trunk,Burn,0.0,5000,3
1,2020-01-03,438333,Female,18,Day,True,Abdomen,Cut,0.0,4994,2
2,2020-01-03,438334,Male,35,Day,True,Back,Lifting,5.0,4969,4
3,2020-01-04,438345,Female,50,Day,True,Legs,Lifting,2.5,4947,6
4,2020-01-07,438376,Male,25,Day,True,Abdomen,Lifting,3.0,4940,3
...,...,...,...,...,...,...,...,...,...,...,...
509,2022-06-26,44738511,Female,31,Night,True,Back,Lifting,0.0,0,4
510,2022-06-27,44739512,Male,31,Night,True,Back,Slip/trip,0.0,0,4
511,2022-06-28,44740513,Male,22,Night,True,Abdomen,Crush & Pinch,0.0,0,3
512,2022-06-28,44740514,Male,31,Night,True,Hands,Lifting,0.0,0,4


In [10]:
# 라벨링 처리 : 범주형 데이터를 수치형 데이터로 변환
df.loc[df['Gender']== 'Male','Gender'] = 0 #Gender열의 Male 데이터를 0으로 표시
df.loc[df['Gender']== 'Female','Gender'] = 1 #Gender열의 Female 데이터를 1으로 표시
df.loc[df['Shift']== 'Day','Shift'] = 0 #Shift열의 Day 데이터를 0으로 표시
df.loc[df['Shift']== 'Night','Shift'] = 1 #Shift열의 Night 데이터를 1으로 표시
df

Unnamed: 0,Date,ID,Gender,Age,Shift,Unnamed: 5,Injury Location,Incident Type,Days Lost,Incident Cost,Age_grouping,scaled_DaysLost,scaled_IncidentCost
0,2020-01-01,438312,0,25,1,True,Trunk,Burn,0.0,5000,3,0.0,1.0000
1,2020-01-03,438333,1,18,0,True,Abdomen,Cut,0.0,4994,2,0.0,0.9988
2,2020-01-03,438334,0,35,0,True,Back,Lifting,5.0,4969,4,1.0,0.9938
3,2020-01-04,438345,1,50,0,True,Legs,Lifting,2.5,4947,6,0.5,0.9894
4,2020-01-07,438376,0,25,0,True,Abdomen,Lifting,3.0,4940,3,0.6,0.9880
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,2022-06-26,44738511,1,31,1,True,Back,Lifting,0.0,0,4,0.0,0.0000
510,2022-06-27,44739512,0,31,1,True,Back,Slip/trip,0.0,0,4,0.0,0.0000
511,2022-06-28,44740513,0,22,1,True,Abdomen,Crush & Pinch,0.0,0,3,0.0,0.0000
512,2022-06-28,44740514,0,31,1,True,Hands,Lifting,0.0,0,4,0.0,0.0000


In [9]:
scaler = MinMaxScaler()
df[['scaled_DaysLost','scaled_IncidentCost']] = scaler.fit_transform(df[['Days Lost','Incident Cost']])

df

Unnamed: 0,Date,ID,Gender,Age,Shift,Unnamed: 5,Injury Location,Incident Type,Days Lost,Incident Cost,Age_grouping,scaled_DaysLost,scaled_IncidentCost
0,2020-01-01,438312,Male,25,Night,True,Trunk,Burn,0.0,5000,3,0.0,1.0000
1,2020-01-03,438333,Female,18,Day,True,Abdomen,Cut,0.0,4994,2,0.0,0.9988
2,2020-01-03,438334,Male,35,Day,True,Back,Lifting,5.0,4969,4,1.0,0.9938
3,2020-01-04,438345,Female,50,Day,True,Legs,Lifting,2.5,4947,6,0.5,0.9894
4,2020-01-07,438376,Male,25,Day,True,Abdomen,Lifting,3.0,4940,3,0.6,0.9880
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,2022-06-26,44738511,Female,31,Night,True,Back,Lifting,0.0,0,4,0.0,0.0000
510,2022-06-27,44739512,Male,31,Night,True,Back,Slip/trip,0.0,0,4,0.0,0.0000
511,2022-06-28,44740513,Male,22,Night,True,Abdomen,Crush & Pinch,0.0,0,3,0.0,0.0000
512,2022-06-28,44740514,Male,31,Night,True,Hands,Lifting,0.0,0,4,0.0,0.0000


In [10]:
df.drop(['Date','ID','Unnamed: 5'], axis = 1, inplace = True)
# df.drop(['Age','Days Lost','Incident Cost','Date','ID','Unnamed: 5'], axis = 1, inplace = True)
# 각각을 drop한 이유// 
## Age = age grouping을 했기 때문에 필요가 없음
## Days Lost & Incident Cost = fit_transform을 하게되어서 데이터가 중복된 것을 방지
## 이후 // 이건 고려를 안한듯

#시계열 데이터 :  시간에 따른 연속성
df.sort_values('Age_grouping', ascending=True)

df

KeyError: "['Date', 'ID', 'Unnamed: 5'] not found in axis"

In [None]:
#map_Injury = {'Trunk' : 0, 'Abdomen' : 1, 'Back' : 2, 'Legs' : 3, 'Neck':4, 'Head' : 5, 'Multiple':6,
      # 'Eye':7, 'Arms': 8, 'Hands' : 9, 'Feet' : 10} : 이것도 데이터 라벨링
#df['target'] = df['Injury Location'].map(map_Injury) : Injury Location을 Target으로 바꿈
#df

In [11]:

%matplotlib inline
df.shape

(472, 10)

#### 일단 에러 뜬 이유는 윗부분에서 drop을 시킨 것들을 형이 다시 당겨온 부분이 있음

In [12]:
# col_names = ['Gender','Age', 'Shift', 'Injury Location','Incident Type', 'Days Lost', 'Incident Cost']

# df.columns = col_names
# # 연속형 변수와 범주형 변수를 구분. 
# df.info()

In [None]:
# 연속형 변수 6개. 
df.describe().T  ## 이거 왜한것일까요

In [None]:
df.columns

In [None]:
categorical = [var for var in df.columns if df[var].dtype=='O']
print(categorical)

In [None]:
categorical = [var for var in df.columns if df[var].dtype=='O'] # if df[var] is object type
df[categorical].head()

In [None]:
# 각 인자 요소 비율
for var in categorical:
     print(df[var].value_counts()/np.float(len(df)))

In [None]:
df[categorical].isnull().sum() # 각각 missing value가 어느정도인지 확인한다.

In [None]:
# check for cardinality in categorical variables
# more N of label, more cardinality
for var in categorical:
    print(var, ' contains ', len(df[var].unique()), ' labels')

In [None]:
numerical = [var for var in df.columns if df[var].dtype !='O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :\n\n', numerical)

In [None]:
df[numerical].head()

In [None]:
df[numerical].isnull().sum()

In [None]:
#Train, Test data set을 나눈다.


In [None]:
X = df.drop(['Incident Type'], axis=1)
y = df['Incident Type']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
# check the shape of X_train and X_test
X_train.shape, X_test.shape

In [None]:
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']
numerical = [col for col in X_train.columns if X_train[col].dtypes != 'O']

In [None]:
# print percentage of missing values in the categorical variables in training set
X_train[categorical].isnull().mean()

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
cols = X_train.columns

In [None]:
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# instantiate the classifier 
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))