In [1]:
import pandas as pd
import numpy as np

In [2]:
index=[('California',2000),('California',2010),('New York',2000),('New York',2010),('Texas',2000),('Texas',2010)]
populations=[33871648,37253956,18976457,19378102,20851820,25145561]
pop=pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [3]:
pop[('California',2010):('Texas',2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [4]:
pop[[i for i in pop.index if i[1]==2010]] #2010인 데이터를 가져올려면 꽤나 힘든 수고를 해야한다

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [5]:
#튜플로부터 다중 인덱스를 만들 수 있다
index=pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [6]:
#데이터를 예쁘게 모아준다/사이에 빈 공간은 값이 위와 같다는 뜻이다.
pop=pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [7]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [11]:
#간단한 DataFrame으로도 동일한 데이터 표현가능

In [10]:
#다중 인덱스 배열 -> DataFrame
pop_df=pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [9]:
#DataFrame -> 다중 인덱스 배열
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [9]:
#계층적 인덱싱을 알아야 하는 이유는
#2차원 데이터를 1차원 Series에 표현하기 위해 다중 인덱싱을 사용할 수 있는 것처럼
#3차원이나 4차원 데이터를 Series나 DataFrame에 표현할 때도 사용할 수 있다

In [17]:
pop_df=pd.DataFrame({'total':pop,
                     'under18':[9267089,9284094,4687374,4318033,5906301,6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [18]:
#고차원의 데이터 가공
f_u18=pop_df['under18']/pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [19]:
#자동으로 MultiIndex로 바꿔준다
df=pd.DataFrame(np.random.rand(4,2),
                index=[['a','a','b','b'],
                       [1,2,1,2]],columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.741145,0.723623
a,2,0.228849,0.403603
b,1,0.895095,0.728385
b,2,0.504603,0.182489


In [20]:
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [21]:
pd.MultiIndex.from_tuples([('a', 1),('a', 2),('b', 1),('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [23]:
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [10]:
#인덱스 값에 이름을 붙여줄 수 있다.
pop.index.names=['state','year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [26]:
#계층적 인덱스를 잘 활용하여, 행과 열에 모두 인덱스 이름을 붙일 수 있다. 4차원 데이터를 만들 수 있다.

In [27]:
pop #다중 인덱스를 가진 Series

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [28]:
pop['California',2000] #단일 요소에 접근 할 수 있다

33871648

In [29]:
pop['California'] #부분 인덱싱을 통해 Series 형태를 유지할 수 있다

year
2000    33871648
2010    37253956
dtype: int64

In [30]:
#정렬되어 있다면 부분 슬라이싱도 할 수 있다
pop.loc['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [31]:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [32]:
pop[pop>22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [33]:
pop[['California','Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [35]:
index=pd.MultiIndex.from_product([[2013,2014],[1,2]],names=['year','visit'])
columns=pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],names=['subject','type'])

data=np.round(np.random.randn(4,6),1)
data[:,::2] *= 10
data+=37

health_data=pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,58.0,38.4,37.0,38.6,41.0,37.0
2013,2,42.0,37.7,23.0,37.4,37.0,36.4
2014,1,41.0,37.1,43.0,38.7,47.0,35.0
2014,2,13.0,37.7,11.0,36.8,36.0,38.5


In [36]:
health_data['Guido','HR']

year  visit
2013  1        37.0
      2        23.0
2014  1        43.0
      2        11.0
Name: (Guido, HR), dtype: float64

In [37]:
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,58.0,38.4
2013,2,42.0,37.7


In [41]:
health_data.loc[:,('Bob','HR')]

year  visit
2013  1        58.0
      2        42.0
2014  1        41.0
      2        13.0
Name: (Bob, HR), dtype: float64

In [40]:
#튜플내에선 슬라이싱이 안된다
health_data.loc[(:,1),(:,'HR')]

SyntaxError: invalid syntax (3480173087.py, line 1)

In [42]:
#판다스 내부에서 슬라이싱을 가능하게 해주는 함수
idx=pd.IndexSlice 
health_data.loc[idx[:,1],idx[:,'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,58.0,37.0,41.0
2014,1,41.0,43.0,47.0


In [11]:
#정렬되지 않은 인덱스를 가진 데이터
index=pd.MultiIndex.from_product([['a','c','b'],[1,2]]) 
data=pd.Series(np.random.rand(6), index=index)
data.index.names=['char','int']
data

char  int
a     1      0.280961
      2      0.019246
c     1      0.900499
      2      0.503662
b     1      0.584043
      2      0.069956
dtype: float64

In [12]:
data['a':'b']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [45]:
try:
    data['a':'b']                  #인덱스가 사전적 정의대로 정렬이 되어 있지 않아서 발생하는 오류
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [48]:
#데이터의 인덱스를 사전적 정의대로 정렬
data=data.sort_index()
data

char  int
a     1      0.063056
      2      0.032148
b     1      0.979245
      2      0.478905
c     1      0.591350
      2      0.611046
dtype: float64

In [49]:
data['a':'b']

char  int
a     1      0.063056
      2      0.032148
b     1      0.979245
      2      0.478905
dtype: float64

In [12]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [11]:
pop.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [61]:
pop.unstack(level=0) #사용할 인덱스를 선택할 수 있다

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [51]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [62]:
pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [66]:
#인덱스 레이블을 열로, 내부의 데이터는 유지
pop_flat=pop.reset_index(name='population') 
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [68]:
#원시 데이터로 간단하게 다중 인덱스를 갖는 데이터프레임을 만들 수 있다.
pop_flat.set_index(['state','year']) 

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [69]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,58.0,38.4,37.0,38.6,41.0,37.0
2013,2,42.0,37.7,23.0,37.4,37.0,36.4
2014,1,41.0,37.1,43.0,38.7,47.0,35.0
2014,2,13.0,37.7,11.0,36.8,36.0,38.5


In [78]:
data_mean=health_data.mean(axis=0,level='year') #행,열 인덱스 중 행 인덱스
data_mean

  """Entry point for launching an IPython kernel.


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,50.0,38.05,30.0,38.0,39.0,36.7
2014,27.0,37.4,27.0,37.75,41.5,36.75


In [74]:
data_mean.mean(axis=1,level='type') #행,열 인덱스 중 열 인덱스

  """Entry point for launching an IPython kernel.


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,39.666667,37.583333
2014,31.833333,37.3


In [79]:
#데이터 세트 결합/Series와 DataFrame의 연결

In [2]:
def make_df(cols,ind):
    """빠르게 DataFrame 생성"""
    data={c: [str(c)+str(i) for i in ind]
         for c in cols}
    return pd.DataFrame(data, ind)

make_df('ABC',range(3)) #편의를 위해 사용자 지정 함수를 만듦

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [81]:
x=[1,2,3]
y=[4,5,6]
z=[7,8,9]
np.concatenate([x,y,z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [94]:
[x,x]

[[[1, 2], [3, 4]], [[1, 2], [3, 4]]]

In [97]:
x=[[1,2],[3,4]]
np.concatenate([x,x],axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [98]:
#pd.concat을 이용해서 간단하게 연결할 수 있다

In [99]:
ser1=pd.Series(['A','B','C'], index=[1,2,3])
ser2=pd.Series(['D','E','F'], index=[4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [100]:
df1=make_df('AB',[1,2]) 
df2=make_df('AB',[3,4])
print(df1);print(df2);print(pd.concat([df1,df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [101]:
df3=make_df('AB',[0,1])
df4=make_df('CD',[0,1])
#결합되는 방향 지정 가능
print(df3);print(df4);print(pd.concat([df3,df4],axis=1))

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [104]:
x=make_df('AB',[0,1])
y=make_df('AB',[2,3])
y.index=x.index
print(x);print(y);print(pd.concat([x,y]))

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [106]:
try:
    pd.concat([x,y],verify_integrity=True)
except ValueError as e:
        print("ValueError:",e)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [107]:
#반복되는 인덱스를 무시한다/새로운 인덱스를 지정해줌
print(x);print(y);print(pd.concat([x,y],ignore_index=True)) 

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3


In [108]:
#계층적 구조를 가지는 시리즈 생성
print(x);print(y);print(pd.concat([x,y],keys=['x','y'])) 

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [111]:
df5=make_df('ABC',[1,2])
df6=make_df('BDC',[3,4])
#채울 수 없는 값은 NAN처리
print(df5);print(df6);print(pd.concat([df5,df6])) 

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   D   C
3  B3  D3  C3
4  B4  D4  C4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [112]:
#(join=inner)->열의 교집합 처리
print(df5);print(df6);print(pd.concat([df5,df6],join='inner')) 

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   D   C
3  B3  D3  C3
4  B4  D4  C4
    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


In [125]:
print(df5);print(df6);print(pd.concat([df5,df6],join_axes=[df5.columns]))#삭제된 함수 
print(df5);print(df6);print(pd.concat([df5,df6]).reindex(df5.index))#reindex

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   D   C
3  B3  D3  C3
4  B4  D4  C4
    A   B   C    D
1  A1  B1  C1  NaN
2  A2  B2  C2  NaN


In [129]:
print(df1);print(df2);print(df1.append(df2))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [None]:
#append 함수
#원래의 객체를 변경하지 않고 새로운 객체를 만든다
#새 인덱스와 버퍼를 생성하기 때문에 반복 실행한다면
#concat 함수를 사용하는게 좋다