In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression statement in a cell, not only the last
# one; the `a; b; c` lines at the end of the cells below rely on this.
InteractiveShell.ast_node_interactivity = "all"

class disp:
    """Display several objects side by side in a notebook output cell.

    Each positional argument is a string containing a Python expression
    (usually just a variable name such as ``'df'``). When the HTML
    representation is requested, every expression is evaluated and its
    rendering is placed in a left-floated ``<div>`` labelled with the
    expression text.

    Objects that provide ``_repr_html_`` are shown with that rich rendering;
    anything else (for example a plain list or a scalar) falls back to its
    escaped plain-text ``repr`` inside a ``<pre>`` block instead of raising
    ``AttributeError``.
    """

    template = '<div style="float: left;padding:10px;"> <b>[{0}]</b> {1}</div>'

    def __init__(self, *args):
        """Store the expression strings; they are evaluated at render time."""
        self.args = args

    @staticmethod
    def _to_html(obj):
        """Return an HTML fragment for ``obj``, preferring its rich repr."""
        import html

        render = getattr(obj, '_repr_html_', None)
        fragment = render() if callable(render) else None
        if fragment is None:
            # No rich repr available (or it declined by returning None):
            # show the escaped text repr so the output is still valid HTML.
            fragment = '<pre>{0}</pre>'.format(html.escape(repr(obj)))
        return fragment

    def _repr_html_(self):
        """Build the combined HTML, one labelled ``<div>`` per expression."""
        # NOTE: eval() executes arbitrary code and resolves names in the
        # namespace of the module that defines this class. Only pass
        # expression strings written by the notebook author.
        return '\n'.join(self.template.format(a, self._to_html(eval(a)))
                         for a in self.args)

import pandas as pd
import numpy as np

### [예제1] Category화

In [2]:
# [1] Convert a string Series to the category dtype and inspect it.

sr = pd.Series(['kor', 'eng', 'kor', 'math']).astype('category')

# The 3 distinct categories, returned in sorted order: eng, kor, math.
r1 = sr.cat.categories

# Integer code assigned to each category: eng -> 0, kor -> 1, math -> 2.
r2 = sr.cat.codes

# Show the Series, its categories and its codes.
sr
r1
r2

0     kor
1     eng
2     kor
3    math
dtype: category
Categories (3, object): ['eng', 'kor', 'math']

Index(['eng', 'kor', 'math'], dtype='object')

0    1
1    0
2    1
3    2
dtype: int8

In [3]:
# [2] Rename the categories; the new labels replace eng, kor, math in order.

sr = (
    pd.Series(['kor', 'eng', 'kor', 'math'])
    .astype('category')
    .cat.rename_categories(['EN', 'KR', 'MT'])
)
sr

0    KR
1    EN
2    KR
3    MT
dtype: category
Categories (3, object): ['EN', 'KR', 'MT']

### [예제2] Category 수정, 선별적 Category화

In [3]:
# [1-1] Reorder the categories and sort by that category order.

sr = pd.Series(['kor', 'eng', 'kor', 'math']).astype('category')

# Same values, but the category list is now math, eng, kor (still unordered).
sr1 = sr.cat.set_categories(['math', 'eng', 'kor'])

# Sorting follows the category order rather than alphabetical order.
r1 = sr1.sort_values()

sr
sr1
r1

0     kor
1     eng
2     kor
3    math
dtype: category
Categories (3, object): ['eng', 'kor', 'math']

0     kor
1     eng
2     kor
3    math
dtype: category
Categories (3, object): ['math', 'eng', 'kor']

3    math
1     eng
0     kor
2     kor
dtype: category
Categories (3, object): ['math', 'eng', 'kor']

In [5]:
# [1-2] Ordered categories support comparisons and min/max.

sr = pd.Series(['kor', 'eng', 'kor', 'math']).astype('category')

# ordered=True declares math < eng < kor.
sr2 = sr.cat.set_categories(['math', 'eng', 'kor'], ordered=True)

# Element-wise comparison against a category uses the declared order.
r2 = sr2 > 'eng'

# The largest category present in the data.
r3 = sr2.max()

# By contrast, `r4 = sr1.max()` on the unordered `sr1` from cell [1-1]
# produces an error, so it is intentionally left out.

sr
sr2
r2
r3

0     kor
1     eng
2     kor
3    math
dtype: category
Categories (3, object): ['eng', 'kor', 'math']

0     kor
1     eng
2     kor
3    math
dtype: category
Categories (3, object): ['math' < 'eng' < 'kor']

0     True
1    False
2     True
3    False
dtype: bool

'kor'

### [예제3] Categorical 객체 생성

In [6]:
# Default: categories are inferred from the values (a, b, c), unordered.
cat1 = pd.Categorical(['a', 'b', 'c', 'a'])

# Explicit category list, kept in the given order b, c, a.
cat2 = pd.Categorical(
    ['a', 'b', 'c', 'a'],
    categories=['b', 'c', 'a'],
)

# Inferred categories, but declared as ordered: a < b < c.
cat3 = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)

# A Categorical can be wrapped in a Series, giving it the category dtype.
sr = pd.Series(cat1)

cat1
cat2
cat3
sr

['a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']

['a', 'b', 'c', 'a']
Categories (3, object): ['b', 'c', 'a']

['a', 'b', 'c', 'a']
Categories (3, object): ['a' < 'b' < 'c']

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

### [예제4] cut() 메서드

In [7]:
# [1-1] Equal-width binning with pd.cut.

sr = pd.Series([95, 87, 55, 77, 100])

# Split the value range into 3 equal-width bins and return, as a Series,
# the interval each value falls into.
r1 = pd.cut(sr, bins=3)

sr
r1

0     95
1     87
2     55
3     77
4    100
dtype: int64

0     (85.0, 100.0]
1     (85.0, 100.0]
2    (54.955, 70.0]
3      (70.0, 85.0]
4     (85.0, 100.0]
dtype: category
Categories (3, interval[float64, right]): [(54.955, 70.0] < (70.0, 85.0] < (85.0, 100.0]]

In [None]:
# [1-2] Binning with named labels instead of interval categories.

sr = pd.Series([95, 87, 55, 77, 100])

# Three equal-width bins, labelled A, B, C from lowest to highest.
r2 = pd.cut(sr, bins=3, labels=['A', 'B', 'C'])

# Explicit bin edges: (54, 70] -> A, (70, 85] -> B, (85, 100] -> C.
r3 = pd.cut(
    sr,
    bins=[54, 70, 85, 100],
    labels=['A', 'B', 'C'],
)

sr
r2
r3

### [예제5] qcut() 메서드

* qcut : 같은 개수로 구간 나누기

In [8]:
sr = pd.Series([95, 87, 55, 77])

# Two quantile-based bins, so each bin receives the same number of values.
r1 = pd.qcut(sr, q=2)

# Same split, with the lower half labelled A and the upper half labelled B.
r2 = pd.qcut(sr, q=2, labels=['A', 'B'])

sr
r1
r2

0    95
1    87
2    55
3    77
dtype: int64

0      (82.0, 95.0]
1      (82.0, 95.0]
2    (54.999, 82.0]
3    (54.999, 82.0]
dtype: category
Categories (2, interval[float64, right]): [(54.999, 82.0] < (82.0, 95.0]]

0    B
1    B
2    A
3    A
dtype: category
Categories (2, object): ['A' < 'B']

### [예제6] cut() 메서드 활용 예

In [5]:
# [1] Bucket exam scores into letter grades and count students per grade.

df = pd.DataFrame({'name':['kim','lee','park','song','lew'],'exam':[95,87,55,77,100]})

# Right-closed bins: (-1, 59] -> F, (59, 69] -> D, (69, 79] -> C,
# (79, 89] -> B, (89, 100] -> A.
df['grade'] = pd.cut(df['exam'],bins=[-1,59,69,79,89,100],labels=list('FDCBA'))

# 'grade' is categorical. observed=False is passed explicitly so that grades
# with no students (here D) stay in the result with a count of 0, regardless
# of the pandas version's default for `observed`.
r1 = df.groupby('grade', observed=False).size().to_frame('count')
disp('df', 'r1')

Unnamed: 0,name,exam,grade
0,kim,95,A
1,lee,87,B
2,park,55,F
3,song,77,C
4,lew,100,A

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
F,1
D,0
C,1
B,1
A,2


In [6]:
# [2] Add a quantile-based class and aggregate count / mean score per class.

df = pd.DataFrame({'name':['kim','lee','park','song','lew'],'exam':[95,87,55,77,100]})
df['grade'] = pd.cut(df['exam'],bins=[-1,59,69,79,89,100],labels=list('FDCBA'))

# Three quantile bins, labelled from lowest to highest scores.
df['class'] = pd.qcut(df['exam'], 3, labels=['low','mid','high'])

# 'class' is categorical. observed=False is passed explicitly so every class
# appears in the result even if it were empty, regardless of the pandas
# version's default for `observed`.
r2 = df.groupby('class', observed=False).agg(count=('name','count'),avg=('exam','mean'))
disp('df', 'r2')

Unnamed: 0,name,exam,grade,class
0,kim,95,A,high
1,lee,87,B,mid
2,park,55,F,low
3,song,77,C,low
4,lew,100,A,high

Unnamed: 0_level_0,count,avg
class,Unnamed: 1_level_1,Unnamed: 2_level_1
low,2,66.0
mid,1,87.0
high,2,97.5
