
# Ch05. pandas 시작하기

In [88]:
from pandas import Series, DataFrame

In [89]:
import pandas as pd

## 5.1 pandas 자료 구조 소개

### 5.1.1 Series

In [90]:
# A Series is a one-dimensional, array-like structure holding a sequence of
# values together with an index. The usual constructor form is
# Series(values, index=labels); no index is given here, and the cells below
# show the resulting default integer labels 0..3.
obj = Series(data=[4, 7, -5, 3])

In [91]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [92]:
type(obj)

pandas.core.series.Series

In [93]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [94]:
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [95]:
# Both the values and the index labels can be supplied explicitly.
obj2 = Series(data=[4, 7, -5, 3], index=list('dbac'))

In [96]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [97]:
obj2['a']

-5

In [98]:
obj2.index

Index([u'd', u'b', u'a', u'c'], dtype='object')

In [99]:
obj2['a']

-5

In [100]:
obj2['d'] = 6

In [101]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [102]:
obj2[['c','a','d']]

c    3
a   -5
d    6
dtype: int64

In [103]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [104]:
obj2[obj2>0]

d    6
b    7
c    3
dtype: int64

In [105]:
obj2*2

d    12
b    14
a   -10
c     6
dtype: int64

In [106]:
import numpy as np

In [107]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [108]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [109]:
# `in` checks whether 'b' is one of obj2's index labels.
'b' in obj2

True

In [110]:
'e' in obj2

False

In [111]:
12 in obj2

False

In [112]:
# `in` tests the index labels, not the values: 6 is stored under label 'd'
# but is not itself a label, so this evaluates to False.
6 in obj2

False

In [113]:
# Plain dict mapping state name -> number; used below to build a Series.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

In [114]:
sdata

{'Ohio': 35000, 'Oregon': 16000, 'Texas': 71000, 'Utah': 5000}

In [115]:
type(sdata)

dict

In [116]:
obj3=Series(sdata)

In [117]:
# Passing a dict to Series uses the dict keys as the index labels and the
# dict values as the data.
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [118]:
# Look up the same element by label and by position. The positional lookup is
# spelled .iloc[0]: a bare integer key on a string-labelled Series relies on a
# positional fallback that newer pandas versions deprecate.
obj3['Ohio'], obj3.iloc[0]

(35000, 35000)

In [119]:
states= ['California','Ohio','Oregon','Texas']

In [120]:
# 'California' is not a key of sdata, so its entry is NaN; 'Utah' is a key of
# sdata but is not listed in states, so it is left out of obj4.
obj4=Series(sdata,index=states)

In [121]:
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [122]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [123]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [124]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [125]:
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [126]:
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [127]:
obj3+obj4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [133]:
obj4.name = 'population'

In [134]:
obj4.index.name = 'state'

In [135]:
obj4

state
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [136]:
# Relabel obj in place by assigning a new list of labels to its index.
obj.index = ['Bob','Steve','Jeff','Ryan']

In [137]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### 5.1.2 DataFrame

In [138]:
# Dict of equal-length lists; each key becomes a DataFrame column below.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

In [139]:
data

{'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002]}

In [140]:
# A DataFrame is a tabular, spreadsheet-like structure made up of several
# columns; each column can hold a different kind of value
# (numbers, strings, booleans).
frame= DataFrame(data)

In [141]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [142]:
type(frame)

pandas.core.frame.DataFrame

In [143]:
# columns= fixes the order in which the columns are laid out.
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [144]:
# index= supplies the row labels; as shown here they can be arbitrary and of
# mixed types (str and int).
DataFrame(data,index=['A','B',3,4,'Five'])

Unnamed: 0,pop,state,year
A,1.5,Ohio,2000
B,1.7,Ohio,2001
3,3.6,Ohio,2002
4,2.4,Nevada,2001
Five,2.9,Nevada,2002


In [145]:
# A requested column that is not a key of data ('debt') is created and filled
# with NaN (Not a Number). The rows are labelled with the given index values.
frame2 = DataFrame(data,columns=['year','state','pop','debt'],
                   index=['one','two','three','four','five'])

In [263]:
# NOTE(review): this cell's execution count (263) is out of sequence and its
# saved output already shows debt values for 'two', 'four' and 'five', so it
# was re-run after the later debt assignment. On a fresh top-to-bottom run the
# debt column would be all NaN at this point.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [147]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [148]:
# Item access and attribute access select the same column.
# print(...) with a single argument is valid on both Python 2 and Python 3;
# print('') emits the blank separator line on both.
print(frame2['state'])
print('')
print(frame2.state)

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object


In [149]:
# Item access and attribute access select the same column.
# print(...) with a single argument is valid on both Python 2 and Python 3;
# print('') emits the blank separator line on both.
print(frame2['year'])
print('')
print(frame2.year)

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64


In [150]:
# Select the row labelled 'three' as a Series. Label-based row selection is
# spelled .loc; the older .ix accessor was deprecated and then removed from
# pandas.
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [151]:
frame2['debt'] = 16.5

In [152]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [189]:
# np.arange(5.) produces the floats 0.0 .. 4.0, one per row
# (general form: np.arange(start, stop, step)).
frame2['debt'] = np.arange(5.)

In [190]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [191]:
# Partial debt values, labelled with a subset of frame2's row labels.
val = Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)

In [192]:
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [193]:
# Assigning a Series aligns on index labels: rows whose label appears in val
# receive its value; the remaining rows ('one', 'three') become NaN.
frame2['debt']=val

In [194]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [195]:
# Evaluate frame2.state == 'Ohio' element-wise and store the resulting
# True/False values in a new column 'eastern'.
frame2['eastern'] = frame2.state == 'Ohio'

In [196]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [197]:
del frame2['eastern']

In [198]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [199]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [200]:
# Nested dict: outer keys become columns, inner keys become row labels when
# this is passed to DataFrame below.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [214]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [215]:
type(pop)

dict

In [216]:
type(pop['Nevada'][2001])

float

In [217]:
pop['Nevada'][2001]

2.4

In [218]:
frame3 = DataFrame(pop)

In [219]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [220]:
# .T transposes the frame: rows become columns and columns become rows.
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [221]:
DataFrame(pop,index=[2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [222]:
# Dict of Series slices: the Ohio column without its last entry, and the
# first two entries of the Nevada column.
pdata = {'Ohio' : frame3['Ohio'][:-1],
        'Nevada' : frame3['Nevada'][:2]}

In [223]:
pdata

{'Nevada': 2000    NaN
 2001    2.4
 Name: Nevada, dtype: float64, 'Ohio': 2000    1.5
 2001    1.7
 Name: Ohio, dtype: float64}

In [224]:
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [225]:
# Name the row index 'year' and the column index 'state'; both names show up
# in the frame's display.
frame3.index.name = 'year'
frame3.columns.name = 'state'

In [226]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [227]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [228]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [229]:
frame2.values

array([[2000L, 'Ohio', 1.5, nan],
       [2001L, 'Ohio', 1.7, -1.2],
       [2002L, 'Ohio', 3.6, nan],
       [2001L, 'Nevada', 2.4, -1.5],
       [2002L, 'Nevada', 2.9, -1.7]], dtype=object)

### 5.1.3 색인 객체

In [232]:
# Three consecutive integers labelled 'a', 'b', 'c'.
obj = Series(data=range(3), index=list('abc'))

In [233]:
obj

a    0
b    1
c    2
dtype: int64

In [234]:
index = obj.index

In [235]:
index

Index([u'a', u'b', u'c'], dtype='object')

In [236]:
index[1:]

Index([u'b', u'c'], dtype='object')

In [90]:
index[1]

'b'

In [237]:
# Index objects are immutable: item assignment raises TypeError. The error is
# caught and printed so that this demonstration does not abort a
# top-to-bottom run of the notebook.
try:
    index[1] = 'd'
except TypeError as err:
    print(err)

TypeError: Indexes does not support mutable operations

In [238]:
index = pd.Index(np.arange(3))

In [239]:
index

Int64Index([0, 1, 2], dtype='int64')

In [240]:
obj2 = Series([-1.5, -2.5, 0], index = index)

In [241]:
obj2

0   -1.5
1   -2.5
2    0.0
dtype: float64

In [242]:
obj2.index is index

True

In [243]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [244]:
# `in` on a DataFrame tests membership among the column labels.
'Ohio' in frame3

True

In [245]:
# 2000 is a row label, not a column label, so this is False.
2000 in frame3

False

In [246]:
# To test the row labels instead, check membership in frame3.index.
2000 in frame3.index

True

In [247]:
2003 in frame3.index

False

## 5.2 핵심 기능

### 5.2.1 재색인

In [248]:
# Float Series whose labels are deliberately not in alphabetical order.
obj = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))

In [249]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [250]:
# reindex returns a new Series arranged according to the given labels; a
# label that is missing from obj ('e') gets NaN.
obj2 = obj.reindex(['a','b','c','d','e'])

In [251]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [252]:
# fill_value=0 fills the entries for labels missing from obj with 0 instead
# of NaN.
obj.reindex(['a','b','c','d','e'],fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [253]:
# Colour names on a gappy integer index (labels 1 and 3 are absent).
obj3 = Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)

In [254]:
# The index currently holds only 0, 2 and 4; labels 1 and 3 are absent.
obj3

0      blue
2    purple
4    yellow
dtype: object

In [255]:
# Reindex to labels 0..5 and fill the newly introduced labels by forward
# fill: each gap takes the value of the preceding existing label.
# ---------- reindex `method` (interpolation) options ----------
# 1. ffill or pad      : carry the previous value forward.
# 2. bfill or backfill : carry the next value backward.
obj3.reindex(range(6),method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [256]:
# Same three colours on the gappy index 0, 2, 4, rebuilt before the
# backward-fill example.
obj3 = Series(index=[0, 2, 4], data=['blue', 'purple', 'yellow'])

In [257]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [258]:
# Backward fill: each gap takes the value of the next existing label; label 5
# has no later value and therefore stays NaN.
obj3.reindex(range(6),method="bfill")

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [113]:
# 3x3 frame of 0..8 with row labels a/c/d ('b' is intentionally missing).
frame = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [114]:
# Same frame again, this time passing the shape to reshape as two separate
# arguments rather than as a tuple.
row_labels = ['a', 'c', 'd']
state_columns = ['Ohio', 'Texas', 'California']
frame = DataFrame(np.arange(9).reshape(3, 3), index=row_labels, columns=state_columns)
del row_labels, state_columns

In [115]:
frame.shape

(3, 3)

In [116]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [117]:
frame2 = frame.reindex(['a','b','c','d'])

In [118]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [119]:
states = ['Texas', 'Utah', 'California']

In [120]:
# columns= reindexes the columns; the label list is passed via a variable.
# 'Utah' is not a column of frame, so it comes out all NaN, and 'Ohio' is not
# in states, so it is omitted.
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [121]:
# Forward-fill the rows first, then pick the columns in a second reindex.
# Passing method='ffill' together with columns= in a single call makes newer
# pandas apply the fill along the columns too, whose labels are not
# monotonic, and it raises
# "ValueError: index must be monotonic increasing or decreasing".
# Splitting the call gives the intended result: new row 'b' copies row 'a',
# and the missing 'Utah' column stays NaN.
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill').reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [122]:
# Reindex rows and columns in one step; labels absent from frame ('b',
# 'Utah') are introduced as NaN. The original .ix accessor has been removed
# from pandas, and .loc raises KeyError for missing labels, so reindex is the
# supported spelling.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### 5.2.2 하나의 로우 또는 칼럼 삭제하기

In [123]:
# The drop method returns a new object with the selected entries removed.
# Start from the floats 0.0..4.0 labelled 'a'..'e'.
obj = Series(data=np.arange(5.), index=list('abcde'))

In [124]:
obj

a    0
b    1
c    2
d    3
e    4
dtype: float64

In [125]:
# Drop the entry whose index label is 'c' (a Series has index labels, not
# columns); the result is bound to a new name.
new_obj = obj.drop('c')

In [126]:
new_obj

a    0
b    1
d    3
e    4
dtype: float64

In [127]:
obj.drop(['d','c'])

a    0
b    1
e    4
dtype: float64

In [269]:
# 4x4 frame of 0..15 with state names as row labels.
data = DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [270]:
data.drop(['Colorado','Utah'])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,12,13,14,15


In [271]:
# axis=1 drops along the columns: remove the column labelled 'two'.
data.drop('two',axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [272]:
data.drop(['two','four'],axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [273]:
# axis=0 (the default) drops along the rows: remove 'Ohio' and 'Utah'.
data.drop(['Ohio','Utah'],axis=0)

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


### 5.2.3 색인하기, 선택하기, 거르기

In [274]:
# Floats 0.0..3.0 labelled 'a'..'d', used for the indexing examples below.
obj = Series(data=np.arange(4.), index=list('abcd'))

In [275]:
obj

a    0
b    1
c    2
d    3
dtype: float64

In [276]:
# The same element fetched by label and by position.
# print(...) with one argument works on Python 2 and 3 alike; print('') emits
# the blank separator line. Position is spelled .iloc[1] because a bare
# integer key on a string-labelled Series is a deprecated positional fallback.
print(obj['b'])
print('')
print(obj.iloc[1])

1.0

1.0


In [277]:
# A positional slice (positions 2 and 3) followed by a selection by a list of
# labels. print(...) with one argument works on Python 2 and 3 alike;
# .iloc makes the positional intent of the slice explicit.
print(obj.iloc[2:4])
print('')
print(obj[['b', 'a', 'd']])

c    2
d    3
dtype: float64

b    1
a    0
d    3
dtype: float64


In [278]:
# To select several labels at once, pass them as a list inside the brackets.
obj[['b','a']]

b    1
a    0
dtype: float64

In [279]:
# Select the elements at positions 1 and 3. Integer keys on a string-labelled
# Series rely on a positional fallback that newer pandas deprecates, so the
# positional lookup is written with .iloc.
obj.iloc[[1, 3]]

b    1
d    3
dtype: float64

In [280]:
obj[obj<2]

a    0
b    1
dtype: float64

In [281]:
# Unlike ordinary Python slicing, slicing by label includes the end point:
# both 'b' and 'c' are returned.
obj['b':'c']

b    1
c    2
dtype: float64

In [282]:
obj['b':'c']=5

In [283]:
obj

a    0
b    5
c    5
d    3
dtype: float64

In [284]:
# Fresh 4x4 frame of 0..15 for the selection examples.
state_rows = ['Ohio', 'Colorado', 'Utah', 'New York']
number_columns = ['one', 'two', 'three', 'four']
data = DataFrame(np.arange(16).reshape((4, 4)), index=state_rows, columns=number_columns)
del state_rows, number_columns

In [285]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [286]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [287]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [288]:
# A slice inside [] selects rows by position: the first two rows
# (positions 0 and 1).
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [289]:
data['three']>5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [290]:
# Boolean-mask row selection: keeps the rows where column 'three' is > 5,
# i.e. 'Colorado', 'Utah' and 'New York'.
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [296]:
# Element-wise comparison: yields a boolean DataFrame of the same shape, with
# True wherever the value is below 5.
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [151]:
data[data<5] = 0

In [152]:
# Every element that was True in the preceding `data < 5` mask has been set
# to 0.
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [153]:
# Row 'Colorado', columns 'two' and 'three', all selected by label. .loc is
# the label-based accessor; .ix has been removed from pandas.
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [154]:
# Rows by label, columns by position (3, 0, 1 -> 'four', 'one', 'two').
# .ix accepted this label/position mix but has been removed from pandas; the
# column positions are translated to labels via data.columns so that a single
# .loc call can be used.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [155]:
# The row at position 2, i.e. 'Utah'. Positional selection is spelled .iloc;
# .ix has been removed from pandas.
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [157]:
# Column 'two' for all rows up to and including 'Utah' (label slices include
# the end point). .loc replaces the removed .ix accessor.
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [159]:
# Rows where column 'three' is > 5, restricted to the first three columns.
# .ix mixed a boolean row mask with a positional column slice; with .ix
# removed from pandas, the column slice is converted to labels via
# data.columns so that .loc can take both.
data.loc[data.three > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14
