# Pandas 데이터 변환

## applymap 변환

- 단일 원소 변환

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the random table below is reproducible.
np.random.seed(0)

# 4 x 3 table of standard-normal draws, rows labelled by state name.
df = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
df

Unnamed: 0,b,d,e
Utah,1.764052,0.400157,0.978738
Ohio,2.240893,1.867558,-0.977278
Texas,0.950088,-0.151357,-0.103219
Oregon,0.410599,0.144044,1.454274


In [2]:
def format_two_decimals(x):
    """Return *x* rendered as a string with two digits after the decimal point."""
    return '%.2f' % x


# The helper is deliberately not called ``format`` so the builtin of that name
# stays usable in later cells.
# applymap calls the helper on every single element; the results are strings,
# so df2 stores Python objects rather than float64 values.
# NOTE: pandas 2.1+ offers the same operation as DataFrame.map and deprecates
# the applymap spelling.
df2 = df.applymap(format_two_decimals)
df2

Unnamed: 0,b,d,e
Utah,1.76,0.4,0.98
Ohio,2.24,1.87,-0.98
Texas,0.95,-0.15,-0.1
Oregon,0.41,0.14,1.45


In [3]:
# Element dtype of the numeric frame versus the string-formatted frame.
tuple(frame.values.dtype for frame in (df, df2))

(dtype('float64'), dtype('O'))

## apply 변환

- row/column 변환

In [4]:
# Small integer table; each inner list is one row (default index 0-4).
df = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=['Qu1', 'Qu2', 'Qu3'],
)
df

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [5]:
def f(x):
    """Return *x* doubled."""
    return 2 * x


# With the default axis, apply passes each column to f as a Series.
df.apply(f)

Unnamed: 0,Qu1,Qu2,Qu3
0,2,4,2
1,6,6,10
2,8,2,4
3,6,4,8
4,8,6,8


In [6]:
def f(x):
    """Return the spread of *x*: its largest value minus its smallest."""
    return x.max() - x.min()


# f reduces a Series to one number, so the result has one entry per column.
df.apply(f)

Qu1    3
Qu2    2
Qu3    4
dtype: int64

In [7]:
# axis='columns' (same as axis=1) passes each row to f: one result per row label.
df.apply(f, axis='columns')

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [8]:
# Count how often each value occurs in every column; a value that never occurs
# in a column shows up as NaN. Series.value_counts is used because the
# top-level pd.value_counts helper is deprecated since pandas 2.2.
df.apply(pd.Series.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [9]:
# Same per-column counts, with the NaN of absent values replaced by 0.
# Series.value_counts is used because the top-level pd.value_counts helper is
# deprecated since pandas 2.2.
df.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## 데이터프레임과 시리즈의 연산

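데이터프레임과 시리즈를 산술 연산하면, 시리즈의 인덱스를 데이터프레임의 열 이름에 맞춘 뒤 각 행에 반복 적용(브로드캐스팅)한다. 연산자(`/` 등)만으로는 열 방향 브로드캐스팅이 되지 않으므로, 각 열을 시리즈와 연산하려면 전치(`.T`)한 뒤 연산하거나 `df.div(series, axis=0)`처럼 `axis` 인수를 지정해야 한다.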

In [10]:
# Divide every row by the first row: the Series is aligned on the column
# labels and reused for each row.
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
df / df.iloc[0]

Unnamed: 0,Qu1,Qu2,Qu3
0,1.0,1.0,1.0
1,3.0,1.5,5.0
2,4.0,0.5,2.0
3,3.0,1.0,4.0
4,4.0,1.5,4.0


In [11]:
# Divide every column by the first column. Transposing turns the columns into
# rows so the row-wise broadcast applies; the final .T restores the layout.
# Equivalent to df.div(df.iloc[:, 0], axis=0).
# .iloc replaces the .ix indexer, which was removed in pandas 1.0.
(df.T / df.iloc[:, 0]).T

Unnamed: 0,Qu1,Qu2,Qu3
0,1.0,2.0,1.0
1,1.0,1.0,1.666667
2,1.0,0.25,0.5
3,1.0,0.666667,1.333333
4,1.0,0.75,1.0


## cut / qcut

- 연속적인 수치 자료를 구간(bin)별 카테고리 자료로 변환
- cut: 구간 경계(bins)를 사용자가 직접 지정
- qcut: 분위수(quantile)를 기준으로 구간 경계를 결정

In [12]:
# Bin edges; by default each bin is open on the left and closed on the right,
# e.g. (18, 25].
bins = [18, 25, 35, 60, 100]
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Assign every age to the bin it falls into.
cats = pd.cut(ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [13]:
# The bin labels, ordered from the lowest bin to the highest.
cats.categories

Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')

In [14]:
# For each age, the integer position of its bin within cats.categories.
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [15]:
# right=False closes each bin on the left instead of the right, e.g. [18, 26).
pd.cut(
    ages,
    bins=[18, 26, 36, 61, 100],
    right=False,
)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [16]:
# One label per bin, listed from the lowest bin to the highest.
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAged',
    'Senior',
]
pd.cut(ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [17]:
# Single-column frame holding the raw ages.
df = pd.DataFrame({"ages": ages})

# Show only the last five rows.
df.tail(5)

Unnamed: 0,ages
7,31
8,61
9,45
10,41
11,32


In [18]:
# Add a categorical column that names the age group of every row.
df["age_cat"] = pd.cut(df["ages"], bins=bins, labels=group_names)
df

Unnamed: 0,ages,age_cat
0,20,Youth
1,22,Youth
2,25,Youth
3,27,YoungAdult
4,21,Youth
5,23,Youth
6,37,MiddleAged
7,31,YoungAdult
8,61,Senior
9,45,MiddleAged
10,41,MiddleAged
11,32,YoungAdult


In [19]:
# 1000 standard-normal samples drawn from NumPy's global generator.
data = np.random.randn(1000)

# q=4 cuts at the sample quartiles, so every bin receives the same number of
# observations.
cats = pd.qcut(data, q=4)
cats

[(0.584, 2.759], (-0.058, 0.584], (-0.058, 0.584], (-0.058, 0.584], (0.584, 2.759], ..., [-3.0461, -0.705], (-0.058, 0.584], (-0.058, 0.584], [-3.0461, -0.705], (0.584, 2.759]]
Length: 1000
Categories (4, object): [[-3.0461, -0.705] < (-0.705, -0.058] < (-0.058, 0.584] < (0.584, 2.759]]

In [20]:
# Number of observations in each quantile bin. Wrapping the categorical in a
# Series replaces the top-level pd.value_counts helper, which is deprecated
# since pandas 2.2.
pd.Series(cats).value_counts()

(0.584, 2.759]       250
(-0.058, 0.584]      250
(-0.705, -0.058]     250
[-3.0461, -0.705]    250
dtype: int64

In [21]:
# Explicit quantile edges (0%, 10%, 50%, 90%, 100%) instead of a bin count.
pd.qcut(
    data,
    q=[0, 0.1, 0.5, 0.9, 1.],
)

[(-0.058, 1.212], (-0.058, 1.212], (-0.058, 1.212], (-0.058, 1.212], (1.212, 2.759], ..., [-3.0461, -1.304], (-0.058, 1.212], (-0.058, 1.212], (-1.304, -0.058], (1.212, 2.759]]
Length: 1000
Categories (4, object): [[-3.0461, -1.304] < (-1.304, -0.058] < (-0.058, 1.212] < (1.212, 2.759]]