# 범주형 변수 인코딩


In [15]:
from hossam import load_data
from pandas import DataFrame, get_dummies,merge
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Fetch the sample dataset through the course helper and display it
# as the cell result.
nursing_dataset = 'nursing_grades'
origin = load_data(nursing_dataset)
origin

[94m[data][0m https://data.hossam.kr/data/lab05/nursing_grades.xlsx
[94m[desc][0m 어느 간호학과 대학원에 지원한 학생들에 대한 합격/불합격 여부를 조사한 가상의 데이터(메타데이터 없음)
[91m[!] Cannot read metadata[0m


Unnamed: 0_level_0,이름,성별,필기점수,학부성적,병원경력,합격여부
접수코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NRS0001,장은우,남,380,3.61,3,불합격
NRS0002,최지호,남,660,3.67,3,합격
NRS0003,김하준,남,800,4.00,1,합격
NRS0004,임아윤,여,640,3.19,4,합격
NRS0005,강하준,남,520,2.93,4,불합격
...,...,...,...,...,...,...
NRS0396,박지유,여,620,4.00,2,불합격
NRS0397,조하은,여,560,3.04,3,불합격
NRS0398,박하윤,여,460,2.63,2,불합격
NRS0399,이지우,여,700,3.65,2,불합격


In [17]:
# Convert the nominal columns to the pandas `category` dtype, then print
# the frame summary to confirm the new dtypes.
categorical_columns = ['성별', '병원경력', '합격여부']
df = origin.astype({column: 'category' for column in categorical_columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, NRS0001 to NRS0400
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   이름      400 non-null    object  
 1   성별      400 non-null    category
 2   필기점수    400 non-null    int64   
 3   학부성적    400 non-null    float64 
 4   병원경력    400 non-null    category
 5   합격여부    400 non-null    category
dtypes: category(3), float64(1), int64(1), object(1)
memory usage: 30.3+ KB


### Pandas를 사용하는 방법(일반 통계 모형에 추천)

In [18]:
# Turn every value of a single column into a dummy variable.
# A column with N distinct values produces N indicator columns.

gender_dummy_options = {'columns': ['성별'], 'dtype': 'int'}
df1 = get_dummies(df, **gender_dummy_options)
df1.head()

Unnamed: 0_level_0,이름,필기점수,학부성적,병원경력,합격여부,성별_남,성별_여
접수코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NRS0001,장은우,380,3.61,3,불합격,1,0
NRS0002,최지호,660,3.67,3,합격,1,0
NRS0003,김하준,800,4.0,1,합격,1,0
NRS0004,임아윤,640,3.19,4,합격,0,1
NRS0005,강하준,520,2.93,4,불합격,1,0


In [19]:
# Create N-1 dummy variables instead of N.
# Setting drop_first=True (the default is False) omits the indicator of
# the first category.

df2 = get_dummies(
    df,
    columns=['성별'],
    drop_first=True,
    dtype='int',
)
df2.head()

Unnamed: 0_level_0,이름,필기점수,학부성적,병원경력,합격여부,성별_여
접수코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NRS0001,장은우,380,3.61,3,불합격,0
NRS0002,최지호,660,3.67,3,합격,0
NRS0003,김하준,800,4.0,1,합격,0
NRS0004,임아윤,640,3.19,4,합격,1
NRS0005,강하준,520,2.93,4,불합격,0


In [20]:
# Encode two or more nominal variables in a single call.

nominal_columns = ['성별', '병원경력']
df3 = get_dummies(df, columns=nominal_columns, dtype='int', drop_first=True)
df3.head()

Unnamed: 0_level_0,이름,필기점수,학부성적,합격여부,성별_여,병원경력_2,병원경력_3,병원경력_4
접수코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NRS0001,장은우,380,3.61,불합격,0,0,1,0
NRS0002,최지호,660,3.67,합격,0,0,1,0
NRS0003,김하준,800,4.0,합격,0,0,0,0
NRS0004,임아윤,640,3.19,합격,1,0,0,1
NRS0005,강하준,520,2.93,불합격,0,0,0,1


### scikit-learn을 사용하는 방법

In [21]:
# Step 1 - change the dimensionality of the nominal data.

# reshape(-1, 1) produces a two-dimensional layout with exactly one
# column, regardless of how many rows there are.
gender_values = df['성별'].values
X = gender_values.reshape(-1, 1)
X

[['남'], ['남'], ['남'], ['여'], ['남'], ..., ['여'], ['여'], ['여'], ['여'], ['여']]
Length: 400
Categories (2, object): ['남', '여']

In [22]:
# Step 1 (alternative) - pick the variable with label-based indexing.
# NOTE: selecting one label returns a Series, not a DataFrame; use
# df[['성별']] when a one-column DataFrame is needed for the encoder.
X = df.loc[:, '성별']
X.head()

접수코드
NRS0001    남
NRS0002    남
NRS0003    남
NRS0004    여
NRS0005    남
Name: 성별, dtype: category
Categories (2, object): ['남', '여']

In [23]:
# Step 2 - apply one-hot encoding.

X = df['성별'].values.reshape(-1, 1)
# sparse_output=False: return a dense array rather than a sparse matrix.
# drop=None: keep every category (pass 'first' to drop the first one).
encoder_options = {'sparse_output': False, 'drop': None}
encoder = OneHotEncoder(**encoder_options)
result = encoder.fit_transform(X)
result[:10]


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [24]:
# Step 3 - wrap the encoded array in a DataFrame.

# Ask the fitted encoder for the generated column names.
new_cols = encoder.get_feature_names_out(['성별'])
print(new_cols)

# Reuse the index of `df` so the encoded rows carry the same labels.
one_hot_df = DataFrame(data=result, index=df.index, columns=new_cols)
print(one_hot_df.head())

['성별_남' '성별_여']
         성별_남  성별_여
접수코드               
NRS0001   1.0   0.0
NRS0002   1.0   0.0
NRS0003   1.0   0.0
NRS0004   0.0   1.0
NRS0005   1.0   0.0


In [25]:
# Step 4 - merge the encoded columns with the original DataFrame.

# Work on a copy and join the two frames on their indexes.
df_copy = df.copy()
index_join = {'left_index': True, 'right_index': True}
df4 = merge(df_copy, one_hot_df, **index_join)
df4.head()

Unnamed: 0_level_0,이름,성별,필기점수,학부성적,병원경력,합격여부,성별_남,성별_여
접수코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NRS0001,장은우,남,380,3.61,3,불합격,1.0,0.0
NRS0002,최지호,남,660,3.67,3,합격,1.0,0.0
NRS0003,김하준,남,800,4.0,1,합격,1.0,0.0
NRS0004,임아윤,여,640,3.19,4,합격,0.0,1.0
NRS0005,강하준,남,520,2.93,4,불합격,1.0,0.0


### 연습문제

In [26]:
# Exercise data: load the customer table and the purchase table,
# printing each one right after it is loaded.
customers_name = 'online_store_customers'
purchases_name = 'online_store_purchases'

origin1 = load_data(customers_name)
print(origin1)

origin2 = load_data(purchases_name)
print(origin2)

[94m[data][0m https://data.hossam.kr/data/lab05/online_store_customers.xlsx
[94m[desc][0m 한 온라인 스토어의 마케팅 팀이 연말 특별 프로모션을 기획에 필요한 고객의 기본 인구 통계 정보(메타데이터 없음)
[91m[!] Cannot read metadata[0m
            name gender  age
user_id                     
1          Alice      F   25
2            Bob      M   30
3        Charlie      M   35
4          David      M   42
5            Eve      F   28
6          Frank      M   21
7          Grace      F   33
8          Henry      M   45
9            Ivy      F   29
10          Jack      M   38
[94m[data][0m https://data.hossam.kr/data/lab05/online_store_purchases.xlsx
[94m[desc][0m 한 온라인 스토어의 마케팅 팀이 연말 특별 프로모션을 기획에 필요한 고객의 구매 관련 정보(메타데이터 없음)
[91m[!] Cannot read metadata[0m
             user_id  product size  color  price
purchase_id                                     
101                1  T-shirt    M  White  15000
102                2    Pants    L  Black  35000
103                1    Skirt    S    Red  25000
104                3  T-sh

In [27]:
# Combine customers and purchases on the shared `user_id` key.
# NOTE: this rebinds `df1`, which was used earlier in the notebook.
df1 = merge(left=origin1, right=origin2, on='user_id')
df1

Unnamed: 0,user_id,name,gender,age,product,size,color,price
0,1,Alice,F,25,T-shirt,M,White,15000
1,1,Alice,F,25,Skirt,S,Red,25000
2,1,Alice,F,25,Pants,S,Blue,31000
3,2,Bob,M,30,Pants,L,Black,35000
4,2,Bob,M,30,Jacket,L,Navy,62000
5,3,Charlie,M,35,T-shirt,L,Blue,17000
6,4,David,M,42,Pants,M,Khaki,33000
7,5,Eve,F,28,Jacket,M,Black,55000
8,5,Eve,F,28,Pants,S,Khaki,32000
9,6,Frank,M,21,T-shirt,S,White,16000


In [28]:
# One-hot encode `gender`, keeping one indicator column per value.
df2 = get_dummies(data=df1, columns=['gender'], dtype='int')
df2

Unnamed: 0,user_id,name,age,product,size,color,price,gender_F,gender_M
0,1,Alice,25,T-shirt,M,White,15000,1,0
1,1,Alice,25,Skirt,S,Red,25000,1,0
2,1,Alice,25,Pants,S,Blue,31000,1,0
3,2,Bob,30,Pants,L,Black,35000,0,1
4,2,Bob,30,Jacket,L,Navy,62000,0,1
5,3,Charlie,35,T-shirt,L,Blue,17000,0,1
6,4,David,42,Pants,M,Khaki,33000,0,1
7,5,Eve,28,Jacket,M,Black,55000,1,0
8,5,Eve,28,Pants,S,Khaki,32000,1,0
9,6,Frank,21,T-shirt,S,White,16000,0,1


In [29]:
# Reshape the `color` values into a single-column two-dimensional array.
color_values = df1['color'].values
x = color_values.reshape(-1, 1)
x

array([['White'],
       ['Red'],
       ['Blue'],
       ['Black'],
       ['Navy'],
       ['Blue'],
       ['Khaki'],
       ['Black'],
       ['Khaki'],
       ['White'],
       ['Red'],
       ['Black'],
       ['Black'],
       ['Blue'],
       ['White']], dtype=object)

In [37]:
# Select `color` with a list of labels so the result stays a
# one-column DataFrame.
x = df1.loc[:, ['color']]
x.head()

Unnamed: 0,color
0,White
1,Red
2,Blue
3,Black
4,Navy


### 연습문제


In [38]:
# Fit a dense one-hot encoder that keeps every category, then encode `x`
# and preview the first ten rows.
encoder = OneHotEncoder(drop=None, sparse_output=False)
result = encoder.fit(x).transform(x)
result[:10]

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [39]:
# Column names generated by the fitted encoder for the `color` feature.
new_cols = encoder.get_feature_names_out(input_features=['color'])
new_cols

array(['color_Black', 'color_Blue', 'color_Khaki', 'color_Navy',
       'color_Red', 'color_White'], dtype=object)

In [40]:
# Wrap the encoded array in a DataFrame that reuses the index of `df1`.
one_hot_df = DataFrame(data=result, index=df1.index, columns=new_cols)
one_hot_df.head()

Unnamed: 0,color_Black,color_Blue,color_Khaki,color_Navy,color_Red,color_White
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
from hossam import load_data
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler

In [31]:
# Load the employee performance dataset and display it.
# NOTE: this rebinds `origin`, which held the nursing data earlier.
performance_dataset = 'performance'
origin = load_data(performance_dataset)
origin

[94m[data][0m https://data.hossam.kr/data/lab05/performance.xlsx
[94m[desc][0m 직원 성과 지표 데이터

field                 description
--------------------  -----------------
EmployeeID            직원 ID
MonthlySales          월별 매출
CustomerSatisfaction  고객 만족도
ProjectHours          프로젝트 작업시간
InnovationScore       혁신 점수



Unnamed: 0_level_0,MonthlySales,CustomerSatisfaction,ProjectHours,InnovationScore
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E001,120,75,160,8
E002,250,90,180,9
E003,80,60,150,6
E004,180,82,170,8
E005,300,95,190,10
E006,90,65,155,7
E007,200,88,175,9
E008,110,70,165,7
E009,220,85,185,9
E010,70,55,145,6


In [32]:
# Work on a copy so the loaded data stays untouched.
df = origin.copy()

# Min-max scaling for monthly sales.
sales_scaler = MinMaxScaler()
df['MonthlySales_scaled'] = sales_scaler.fit_transform(df[['MonthlySales']])

# Max-abs scaling for customer satisfaction.
satisfaction_scaler = MaxAbsScaler()
df['CustomerSatisfaction_scaled'] = satisfaction_scaler.fit_transform(
    df[['CustomerSatisfaction']]
)
df.head()

Unnamed: 0_level_0,MonthlySales,CustomerSatisfaction,ProjectHours,InnovationScore,MonthlySales_scaled,CustomerSatisfaction_scaled
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E001,120,75,160,8,0.259259,0.765306
E002,250,90,180,9,0.740741,0.918367
E003,80,60,150,6,0.111111,0.612245
E004,180,82,170,8,0.481481,0.836735
E005,300,95,190,10,0.925926,0.969388


In [33]:
# Standardize both columns with one scaler call, then store each output
# column under "<source column>_scaled".
standard_columns = ['ProjectHours', 'InnovationScore']
scaled_standard_features = StandardScaler().fit_transform(df[standard_columns])
for position, column in enumerate(standard_columns):
    df[f'{column}_scaled'] = scaled_standard_features[:, position]
df.head()

Unnamed: 0_level_0,MonthlySales,CustomerSatisfaction,ProjectHours,InnovationScore,MonthlySales_scaled,CustomerSatisfaction_scaled,ProjectHours_scaled,InnovationScore_scaled
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E001,120,75,160,8,0.259259,0.765306,-0.462556,0.066082
E002,250,90,180,9,0.740741,0.918367,0.738889,0.7269
E003,80,60,150,6,0.111111,0.612245,-1.063279,-1.255555
E004,180,82,170,8,0.481481,0.836735,0.138166,0.066082
E005,300,95,190,10,0.925926,0.969388,1.339611,1.387719


In [34]:
# Add the four scaled metrics into one composite score. The columns are
# added left to right, in the order listed.
scaled_columns = [
    'MonthlySales_scaled',
    'CustomerSatisfaction_scaled',
    'ProjectHours_scaled',
    'InnovationScore_scaled',
]
total_score = df[scaled_columns[0]]
for column in scaled_columns[1:]:
    total_score = total_score + df[column]
df['TotalScaledScore'] = total_score
df.head()

Unnamed: 0_level_0,MonthlySales,CustomerSatisfaction,ProjectHours,InnovationScore,MonthlySales_scaled,CustomerSatisfaction_scaled,ProjectHours_scaled,InnovationScore_scaled,TotalScaledScore
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E001,120,75,160,8,0.259259,0.765306,-0.462556,0.066082,0.628091
E002,250,90,180,9,0.740741,0.918367,0.738889,0.7269,3.124897
E003,80,60,150,6,0.111111,0.612245,-1.063279,-1.255555,-1.595478
E004,180,82,170,8,0.481481,0.836735,0.138166,0.066082,1.522464
E005,300,95,190,10,0.925926,0.969388,1.339611,1.387719,4.622644


In [35]:
# Rank employees by composite score, highest first, and keep the top row.
ranked = df.sort_values(by='TotalScaledScore', ascending=False)
max_data = ranked.iloc[:1]
max_data

Unnamed: 0_level_0,MonthlySales,CustomerSatisfaction,ProjectHours,InnovationScore,MonthlySales_scaled,CustomerSatisfaction_scaled,ProjectHours_scaled,InnovationScore_scaled,TotalScaledScore
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E018,320,98,195,10,1.0,1.0,1.639972,1.387719,5.027691
