## label encoding

In [1]:
from sklearn.preprocessing import LabelEncoder
import numpy as np 
import pandas as pd

In [2]:
# Three string labels. At this point the Series is plain text data:
# it is not numeric, and it is not a categorical dtype yet either.
data = pd.Series(data=['apple', 'summer', 'spring'])
data

0     apple
1    summer
2    spring
dtype: object

In [3]:
# Learn the label set from the Series: its distinct values become the
# categories that the encoder knows about.
encoder = LabelEncoder().fit(data)  # fit() returns the fitted encoder itself
encoder

LabelEncoder()

In [4]:
# Data to encode: same labels as `data`, but with repeats.
target = pd.Series(['apple'] + ['summer'] * 2 + ['spring'] * 2)
target

0     apple
1    summer
2    summer
3    spring
4    spring
dtype: object

In [5]:
# Encode `target` using the encoder that was fitted on `data` earlier.
encoder.transform(target)

# fit_transform vs. fit followed by transform:
# when the data used to learn the labels differs from the data being
# transformed, call fit first and then transform.

array([0, 2, 2, 1, 1])

In [6]:
# Keep the encoded labels and check the container type returned by transform.
target2 = encoder.transform(target)
type(target2)

numpy.ndarray

In [7]:
# decode: map the integer codes back to the original string labels
target3 = encoder.inverse_transform(target2)
target3

array(['apple', 'summer', 'summer', 'spring', 'spring'], dtype=object)

In [9]:
# Label encoding for two or more columns

words, hobby = (
    pd.Series(values)
    for values in (
        ['apple', 'summer', 'summer', 'spring', 'spring'],
        ['book', 'run', 'talk', 'coffee', 'song'],
    )
)

In [10]:
# Combine the two Series into one frame; each keyword becomes a column name.
df = pd.DataFrame(dict(words=words, hobby=hobby))
df

Unnamed: 0,words,hobby
0,apple,book
1,summer,run
2,summer,talk
3,spring,coffee
4,spring,song


In [13]:
# Column labels of df; the encoding loop further down iterates over these.
columns = df.columns
columns

Index(['words', 'hobby'], dtype='object')

In [14]:
# A second, separate LabelEncoder instance, so `encoder` fitted above is left untouched.
encoder2 = LabelEncoder()

In [15]:
# Label-encode every column of df, keeping one fitted encoder per column.
# Re-fitting a single shared encoder overwrites its classes_ on every
# iteration, so only the last column could later be decoded with
# inverse_transform. `label_encoders[col]` keeps each column's mapping.
# `encoder2` still ends up bound to the encoder fitted on the last column.
label_encoders = {}
for col in columns:
    encoder2 = LabelEncoder()
    df[col] = encoder2.fit_transform(df[col])
    label_encoders[col] = encoder2
df

Unnamed: 0,words,hobby
0,0,0
1,2,2
2,2,4
3,1,1
4,1,3


## one hot encoding

In [11]:
# A Series object is the combination of an index and its values.
data2 = pd.Series(['apple'] + ['summer', 'spring'] * 2)
data2

0     apple
1    summer
2    spring
3    summer
4    spring
dtype: object

In [12]:
# One-hot encode the Series: one indicator column per distinct label.
# dtype is pinned to uint8 so the 0/1 output is the same on every pandas
# version (from pandas 2.0 the default dummy dtype is bool, i.e. True/False).
data3 = pd.get_dummies(data2, dtype="uint8")
data3

# get_dummies(x): x is column-shaped data; here the object passed is a Series.
# An array is an object as well; list and dict are among the built-in
# special objects.
# Question to ask of any object: is it plain data, or a building block?

Unnamed: 0,apple,spring,summer
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0


In [22]:
# One-hot encoding for two or more columns

df2 = pd.DataFrame(
    dict(
        words=['apple', 'summer', 'summer', 'spring', 'spring'],
        hobby=['book', 'run', 'talk', 'coffee', 'song'],
    )
)
df2

Unnamed: 0,words,hobby
0,apple,book
1,summer,run
2,summer,talk
3,spring,coffee
4,spring,song


In [23]:
# Inspect column dtypes and reported memory usage before any dtype conversion.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   words   5 non-null      object
 1   hobby   5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [18]:
# One-hot encode a single column. dtype is pinned to uint8 so the result is
# 0/1 on every pandas version (the default became bool in pandas 2.0).
pd.get_dummies(df2['words'], dtype="uint8")

Unnamed: 0,apple,spring,summer
0,1,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0


In [25]:
# Convert both text columns to the pandas categorical dtype, one at a time.
for col in ('words', 'hobby'):
    df2[col] = df2[col].astype('category')

In [26]:
df2.info()

# The reported memory usage went up after converting to category,
# because each categorical column also needs memory for its categories.
# NOTE(review): the earlier object-dtype figure was printed as "208.0+ bytes",
# i.e. a shallow lower bound; compare with memory_usage='deep' to confirm.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   words   5 non-null      category
 1   hobby   5 non-null      category
dtypes: category(2)
memory usage: 482.0 bytes


In [27]:
# One-hot encode every categorical column of df2 at once. dtype is pinned to
# uint8 so the result is 0/1 on every pandas version (default is bool from 2.0).
pd.get_dummies(df2, dtype="uint8")

# Converting to dummy variables is something to do at training time!

Unnamed: 0,words_apple,words_spring,words_summer,hobby_book,hobby_coffee,hobby_run,hobby_song,hobby_talk
0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,1,0,0
2,0,0,1,0,0,0,0,1
3,0,1,0,0,1,0,0,0
4,0,1,0,0,0,0,1,0


In [29]:
# df2 still holds the original words/hobby columns: get_dummies returned a
# new frame instead of modifying df2.
df2

Unnamed: 0,words,hobby
0,apple,book
1,summer,run
2,summer,talk
3,spring,coffee
4,spring,song


In [28]:
# One-hot encode the whole DataFrame and keep the result.
# dtype is pinned to uint8 so the dummy columns are uint8 on every pandas
# version (from pandas 2.0 the default dummy dtype is bool).
df3 = pd.get_dummies(df2, dtype="uint8")
df3

Unnamed: 0,words_apple,words_spring,words_summer,hobby_book,hobby_coffee,hobby_run,hobby_song,hobby_talk
0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,1,0,0
2,0,0,1,0,0,0,0,1
3,0,1,0,0,1,0,0,0
4,0,1,0,0,0,0,1,0


In [30]:
# Check the dtype and reported memory usage of the dummy columns.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   words_apple   5 non-null      uint8
 1   words_spring  5 non-null      uint8
 2   words_summer  5 non-null      uint8
 3   hobby_book    5 non-null      uint8
 4   hobby_coffee  5 non-null      uint8
 5   hobby_run     5 non-null      uint8
 6   hobby_song    5 non-null      uint8
 7   hobby_talk    5 non-null      uint8
dtypes: uint8(8)
memory usage: 168.0 bytes


## banklist 데이터로 인코딩 활용

In [31]:
# Load banklist.csv from the current working directory (relative path).
# NOTE(review): judging by the columns this looks like a failed-bank list — confirm the source.
df_bank = pd.read_csv('banklist.csv')
df_bank

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Washington Federal Bank for Savings,Chicago,IL,30570,Royal Savings Bank,15-Dec-17,20-Dec-17
1,The Farmers and Merchants State Bank of Argonia,Argonia,KS,17719,Conway Bank,13-Oct-17,20-Oct-17
2,Fayette County Bank,Saint Elmo,IL,1802,"United Fidelity Bank, fsb",26-May-17,26-Jul-17
3,"Guaranty Bank, (d/b/a BestBank in Georgia & Mi...",Milwaukee,WI,30003,First-Citizens Bank & Trust Company,5-May-17,26-Jul-17
4,First NBC Bank,New Orleans,LA,58302,Whitney Bank,28-Apr-17,5-Dec-17
...,...,...,...,...,...,...,...
550,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB",27-Jul-01,19-Aug-14
551,Malta National Bank,Malta,OH,6629,North Valley Bank,3-May-01,18-Nov-02
552,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,2-Feb-01,18-Feb-03
553,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,14-Dec-00,17-Mar-05


In [32]:
# 라벨인코딩 : Bank Name, City

from sklearn.preprocessing import LabelEncoder 

encoder = LabelEncoder()

for col in ['Bank Name', 'City'] :
    df_bank[col] = encoder.fit_transform(df_bank[col])

df_bank

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,522,69,IL,30570,Royal Savings Bank,15-Dec-17,20-Dec-17
1,480,12,KS,17719,Conway Bank,13-Oct-17,20-Oct-17
2,168,328,IL,1802,"United Fidelity Bank, fsb",26-May-17,26-Jul-17
3,253,239,WI,30003,First-Citizens Bank & Trust Company,5-May-17,26-Jul-17
4,205,250,LA,58302,Whitney Bank,28-Apr-17,5-Dec-17
...,...,...,...,...,...,...,...
550,463,162,IL,32646,"Superior Federal, FSB",27-Jul-01,19-Aug-14
551,314,220,OH,6629,North Valley Bank,3-May-01,18-Nov-02
552,171,221,NH,34264,Southern New Hampshire Bank & Trust,2-Feb-01,18-Feb-03
553,338,235,IL,3815,Banterra Bank of Marion,14-Dec-00,17-Mar-05


In [33]:
# One-hot encoding: ST
# dtype is pinned to uint8 so the indicator columns are 0/1 on every pandas
# version (from pandas 2.0 the default dummy dtype is bool).

pd.get_dummies(df_bank['ST'], dtype="uint8")

Unnamed: 0,AL,AR,AZ,CA,CO,CT,FL,GA,HI,IA,...,SC,SD,TN,TX,UT,VA,WA,WI,WV,WY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
552,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
