## 4.1 숫자와 문자 인덱스 처리

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Build an index from a list of integers and inspect its concrete class.
idx1 = pd.Index([1,2,3,4])
type(idx1)

pandas.core.indexes.numeric.Int64Index

In [3]:
# Display the generic Index class object itself.
pd.Index

pandas.core.indexes.base.Index

In [4]:
# Same constructor, but fed a range instead of a list; the recorded output
# shows a RangeIndex for this call.
idx2 = pd.Index(range(1,4))
type(idx2)

pandas.core.indexes.range.RangeIndex

In [5]:
# Compare the element dtypes of the list-built and range-built indexes.
idx1.dtype, idx2.dtype

(dtype('int64'), dtype('int64'))

In [6]:
# Index built from string labels.
idx_s = pd.Index(['a','b','c'])
idx_s

Index(['a', 'b', 'c'], dtype='object')

In [7]:
# Inspect the concrete class of the string index.
type(idx_s)

pandas.core.indexes.base.Index

In [8]:
# .values exposes the labels as a NumPy array.
idx_s.values

array(['a', 'b', 'c'], dtype=object)

In [10]:
# Force a floating-point dtype on integer input.
idx_f = pd.Index([1,2,3,4],dtype='float')
idx_f

Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')

In [12]:
idx1[0] # positional lookup with a single integer (not a slice): returns one label

1

In [13]:
idx1[[0]] # fancy indexing with a list of positions: returns an Index

Int64Index([1], dtype='int64')

In [14]:
idx1[idx1 < 3] # boolean-mask selection: keeps labels where the condition holds

Int64Index([1, 2], dtype='int64')

In [15]:
# Index used by the item-assignment demonstration that follows.
idx4 = pd.Index([4,5,6,7])

In [16]:
# Changing a single element of an Index is not allowed: Index objects are
# immutable. Only the TypeError pandas raises for item assignment is caught,
# so unrelated failures (for example `idx4` not being defined) still surface
# instead of being printed as if they were the expected message.
try:
    idx4[0] = 100
except TypeError as e:
    print(e)

Index does not support mutable operations


In [17]:
idx4 = pd.Index([1,2,3,4])
idx4 # rebinding the name to a whole new Index object is allowed (unlike item assignment)

Int64Index([1, 2, 3, 4], dtype='int64')

In [18]:
# Set difference: labels present in idx1 but not in idx2.
idx1.difference(idx2)

Int64Index([4], dtype='int64')

In [19]:
# Arithmetic with a scalar is applied to every label (see recorded output).
idx1 + 3

Int64Index([4, 5, 6, 7], dtype='int64')

In [20]:
# Series with a repeated label: three rows share 'a', the last row is 'b'.
s1 = pd.Series(np.arange(4), index=['a', 'a', 'a', 'b'])
s1

a    0
a    1
a    2
b    3
dtype: int64

In [21]:
# Same labels as s1 but in a different order: 'b' comes first.
s2 = pd.Series(np.arange(4), index=['b', 'a', 'a', 'a'])
s2

b    0
a    1
a    2
a    3
dtype: int64

In [22]:
s1 + s2 # the two indexes differ in order and contain duplicates, so every 'a' in s1 is paired with every 'a' in s2 (Cartesian product: 3 x 3 'a' rows)

a    1
a    2
a    3
a    2
a    3
a    4
a    3
a    4
a    5
b    3
dtype: int64

In [23]:
# After sorting, both indexes are identical, and the recorded output shows
# rows paired one-to-one (4 rows) instead of the cross-joined result above.
s1.sort_index().add(s2.sort_index())

a    1
a    3
a    5
b    3
dtype: int64

In [24]:
# s3 has exactly the same labels in the same order as s1, so the addition
# is done row by row (no Cartesian product in the recorded output).
s3 = pd.Series(index=list('aaab'), data=np.arange(4))
s1 + s3

a    0
a    2
a    4
b    6
dtype: int64

In [26]:
# s4 carries an extra label 'c' that s1 does not have.
s4 = pd.Series(index=list('aaabc'), data=np.arange(5))
s1 + s4 # Cartesian product for the duplicated 'a' labels; the unmatched 'c' becomes NaN

a    0.0
a    1.0
a    2.0
a    1.0
a    2.0
a    3.0
a    2.0
a    3.0
a    4.0
b    6.0
c    NaN
dtype: float64

## 4.2 날짜 및 범주형 인덱스의 처리

In [27]:
# Wrap a 3-day date range in pd.Index; the recorded output is a DatetimeIndex.
idx_d = pd.Index(pd.date_range('20130101', periods=3))
idx_d

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03'], dtype='datetime64[ns]', freq='D')

In [28]:
# Underlying NumPy array of the date index (datetime64[ns] in the recorded output).
idx_d.values

array(['2013-01-01T00:00:00.000000000', '2013-01-02T00:00:00.000000000',
       '2013-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

In [29]:
# Frequency attached to the index (one day in the recorded output).
idx_d.freq

<Day>

In [31]:
# Series indexed by explicit date strings parsed into a DatetimeIndex.
# NOTE: the names `index` and `data` are rebound to different objects in
# later cells of this notebook.
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04','2015-08-04'])
data = pd.Series([0,1,2,3], index=index)
data

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [32]:
# dtype of the DatetimeIndex; '<M8[ns]' is NumPy's short spelling of datetime64[ns].
index.dtype

dtype('<M8[ns]')

In [33]:
# Daily range given by start and end dates; both endpoints appear in the output.
dr1 = pd.date_range('2018-07-03','2018-07-10')
dr1

DatetimeIndex(['2018-07-03', '2018-07-04', '2018-07-05', '2018-07-06',
               '2018-07-07', '2018-07-08', '2018-07-09', '2018-07-10'],
              dtype='datetime64[ns]', freq='D')

In [34]:
# Same dates as dr1, specified as a start date plus a number of periods.
dr2 = pd.date_range('2018-07-03', periods=8)
dr2

DatetimeIndex(['2018-07-03', '2018-07-04', '2018-07-05', '2018-07-06',
               '2018-07-07', '2018-07-08', '2018-07-09', '2018-07-10'],
              dtype='datetime64[ns]', freq='D')

In [35]:
# 24 hourly timestamps starting at midnight on 2018-08-03.
# The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated in pandas 2.2; the generated timestamps are the same.
dr3 = pd.date_range('2018-08-03', periods=24, freq='h')
dr3

DatetimeIndex(['2018-08-03 00:00:00', '2018-08-03 01:00:00',
               '2018-08-03 02:00:00', '2018-08-03 03:00:00',
               '2018-08-03 04:00:00', '2018-08-03 05:00:00',
               '2018-08-03 06:00:00', '2018-08-03 07:00:00',
               '2018-08-03 08:00:00', '2018-08-03 09:00:00',
               '2018-08-03 10:00:00', '2018-08-03 11:00:00',
               '2018-08-03 12:00:00', '2018-08-03 13:00:00',
               '2018-08-03 14:00:00', '2018-08-03 15:00:00',
               '2018-08-03 16:00:00', '2018-08-03 17:00:00',
               '2018-08-03 18:00:00', '2018-08-03 19:00:00',
               '2018-08-03 20:00:00', '2018-08-03 21:00:00',
               '2018-08-03 22:00:00', '2018-08-03 23:00:00'],
              dtype='datetime64[ns]', freq='H')

In [36]:
dr3.to_period('D') # convert the timestamps into day-level periods (a PeriodIndex)

PeriodIndex(['2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03',
             '2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03',
             '2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03',
             '2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03',
             '2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03',
             '2018-08-03', '2018-08-03', '2018-08-03', '2018-08-03'],
            dtype='period[D]', freq='D')

In [37]:
# Monthly PeriodIndex: 13 consecutive months starting at 2018-01.
dr_m = pd.period_range('2018-01', periods=13, freq='M')
dr_m

PeriodIndex(['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
             '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12',
             '2019-01'],
            dtype='period[M]', freq='M')

In [38]:
# Ten hourly timedeltas starting at zero.
# The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated in pandas 2.2; the generated values are the same.
tm_1 = pd.timedelta_range(0, periods=10, freq='h')
tm_1

TimedeltaIndex(['00:00:00', '01:00:00', '02:00:00', '03:00:00', '04:00:00',
                '05:00:00', '06:00:00', '07:00:00', '08:00:00', '09:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [39]:
tm_1 - tm_1[1] # timedelta arithmetic: subtract the second element (one hour) from every element

TimedeltaIndex(['-1 days +23:00:00',          '00:00:00',          '01:00:00',
                         '02:00:00',          '03:00:00',          '04:00:00',
                         '05:00:00',          '06:00:00',          '07:00:00',
                         '08:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [40]:
# Re-display dr2 before using it in the subtraction that follows.
dr2

DatetimeIndex(['2018-07-03', '2018-07-04', '2018-07-05', '2018-07-06',
               '2018-07-07', '2018-07-08', '2018-07-09', '2018-07-10'],
              dtype='datetime64[ns]', freq='D')

In [41]:
# Subtracting the first timestamp yields a TimedeltaIndex of offsets from it.
dr2 - dr2[0]

TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days', '5 days',
                '6 days', '7 days'],
               dtype='timedelta64[ns]', freq=None)

In [42]:
# Nine values at one-minute intervals starting at 2018-01-01 00:00.
# 'min' is used because the single-letter 'T' alias is deprecated in
# pandas 2.2; the generated timestamps are the same.
# NOTE: this rebinds the name `index` used by an earlier cell.
index = pd.date_range('1/1/2018', periods=9, freq='min')
series = pd.Series(range(9), index=index)
series

2018-01-01 00:00:00    0
2018-01-01 00:01:00    1
2018-01-01 00:02:00    2
2018-01-01 00:03:00    3
2018-01-01 00:04:00    4
2018-01-01 00:05:00    5
2018-01-01 00:06:00    6
2018-01-01 00:07:00    7
2018-01-01 00:08:00    8
Freq: T, dtype: int64

In [43]:
# Group the minute-level series into 3-minute bins. This only builds the
# resampler; an aggregation such as .sum() must be called to get values.
# '3min' is used because the 'T' alias is deprecated in pandas 2.2.
s_3T = series.resample('3min')
s_3T

DatetimeIndexResampler [freq=<3 * Minutes>, axis=0, closed=left, label=left, convention=start, base=0]

In [44]:
# Aggregate each 3-minute bin by summing its values.
s_3T.sum()

2018-01-01 00:00:00     3
2018-01-01 00:03:00    12
2018-01-01 00:06:00    21
Freq: 3T, dtype: int64

In [45]:
# Load the hourly bridge-crossing counts, using the 'Date' column as a
# parsed datetime index. The file is cp949-encoded.
# The location can be overridden with the HANRIVER_CSV environment variable
# so the notebook is not tied to one machine's directory layout; the default
# is the original author's path.
csv_path = os.environ.get(
    'HANRIVER_CSV',
    '/mnt/c/Workspace/handson_pandas/data/hanriver_bridge.csv',
)
data = pd.read_csv(csv_path, index_col='Date', parse_dates=True, encoding='cp949')
data.head()

Unnamed: 0_level_0,한강 좌측 인도,한강 우측 인도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-10-03 00:00:00,4.0,9.0
2012-10-03 01:00:00,4.0,6.0
2012-10-03 02:00:00,1.0,1.0
2012-10-03 03:00:00,2.0,3.0
2012-10-03 04:00:00,6.0,1.0


In [46]:
# Column labels as loaded from the CSV.
data.columns

Index(['한강 좌측 인도', '한강 우측 인도'], dtype='object')

In [47]:
# Row index of the loaded frame (a DatetimeIndex named 'Date' in the recorded output).
data.index

DatetimeIndex(['2012-10-03 00:00:00', '2012-10-03 01:00:00',
               '2012-10-03 02:00:00', '2012-10-03 03:00:00',
               '2012-10-03 04:00:00', '2012-10-03 05:00:00',
               '2012-10-03 06:00:00', '2012-10-03 07:00:00',
               '2012-10-03 08:00:00', '2012-10-03 09:00:00',
               ...
               '2018-05-31 14:00:00', '2018-05-31 15:00:00',
               '2018-05-31 16:00:00', '2018-05-31 17:00:00',
               '2018-05-31 18:00:00', '2018-05-31 19:00:00',
               '2018-05-31 20:00:00', '2018-05-31 21:00:00',
               '2018-05-31 22:00:00', '2018-05-31 23:00:00'],
              dtype='datetime64[ns]', name='Date', length=49608, freq=None)

In [49]:
# Shorten the two column names with an explicit old -> new mapping.
# Assigning a two-item list to `data.columns` would fail with a length
# mismatch if this cell is run a second time (the frame then has three
# columns); rename() ignores keys that are no longer present, so the cell
# can be re-run safely.
data = data.rename(columns={'한강 좌측 인도': '좌측', '한강 우측 인도': '우측'})
data['합산'] = data.eval('좌측 + 우측') # the expression is given as a string and evaluated by DataFrame.eval
data.head()

Unnamed: 0_level_0,좌측,우측,합산
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03 00:00:00,4.0,9.0,13.0
2012-10-03 01:00:00,4.0,6.0,10.0
2012-10-03 02:00:00,1.0,1.0,2.0
2012-10-03 03:00:00,2.0,3.0,5.0
2012-10-03 04:00:00,6.0,1.0,7.0


In [50]:
# Count missing values per column.
data.isnull().sum()

좌측    8
우측    8
합산    8
dtype: int64

In [51]:
# Copy of `data` with the rows containing missing values removed.
# NOTE(review): none of the following visible cells use `data_dp`; they keep
# operating on `data` — confirm whether that is intended.
data_dp = data.dropna()

In [52]:
daily = data.resample('D').sum() # resample by calendar day and sum each day's rows

In [53]:
# Preview the first daily totals.
daily.head()

Unnamed: 0_level_0,좌측,우측,합산
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03,1760.0,1761.0,3521.0
2012-10-04,1708.0,1767.0,3475.0
2012-10-05,1558.0,1590.0,3148.0
2012-10-06,1080.0,926.0,2006.0
2012-10-07,1191.0,951.0,2142.0


In [54]:
weekly_r = data.resample('W').mean() # weekly resampling: mean of all rows in each week
weekly_r.head()

Unnamed: 0_level_0,좌측,우측,합산
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-07,60.808333,58.291667,119.1
2012-10-14,51.660714,48.309524,99.970238
2012-10-21,47.297619,45.017857,92.315476
2012-10-28,41.077381,38.904762,79.982143
2012-11-04,38.142857,34.440476,72.583333


In [55]:
weekly_f = data.asfreq('W') # change the frequency of the existing DataFrame
# Unlike resample, no aggregation happens: only the row found at each target
# timestamp is kept, and all other rows are dropped.
weekly_f.head()

Unnamed: 0_level_0,좌측,우측,합산
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-07,6.0,5.0,11.0
2012-10-14,3.0,3.0,6.0
2012-10-21,5.0,12.0,17.0
2012-10-28,5.0,5.0,10.0
2012-11-04,7.0,11.0,18.0


In [56]:
inx_i = pd.CategoricalIndex([1,2,3,4]) # built from a list of integers; the index labels are limited to this fixed set of categories
s = pd.Series([1,2,3,4], index=inx_i)
s

1    1
2    2
3    3
4    4
dtype: int64

In [57]:
s.index = s.index.add_categories(5) # register 5 as an allowed category; the recorded output shows no new row is added
s.index

CategoricalIndex([1, 2, 3, 4], categories=[1, 2, 3, 4, 5], ordered=False, dtype='category')

In [58]:
# Inserting a label makes the index one element longer than the Series, so
# assigning it back to `s.index` is rejected with a length-mismatch error.
# Only that ValueError is caught, so any unrelated failure still surfaces.
try:
    s.index = s.index.insert(4,5)
except ValueError as e:
    print(e)

Length mismatch: Expected axis has 4 elements, new values have 5 elements


In [59]:
# To add a label to the Series, first convert the categorical index to a plain list.
stl = s.index.tolist()
stl

[1, 2, 3, 4]

In [61]:
s.index = stl # assign the list to .index; the recorded output shows an integer index instead of a categorical one
s.index

Int64Index([1, 2, 3, 4], dtype='int64')

In [62]:
s[5] = 100 # with the integer index, assigning to a new label adds a row
s

1      1
2      2
3      3
4      4
5    100
dtype: int64

In [63]:
s.index = s.index.astype('category') # convert back to a categorical index
s

1      1
2      2
3      3
4      4
5    100
dtype: int64

## 4.3 멀티인덱스 처리

In [64]:
# Series whose labels are (region, year) tuples.
# The values come from a locally seeded RandomState so the cell produces the
# same numbers on every run. The previously recorded output came from an
# unseeded call, so the displayed values will differ after re-execution.
ind = [("서울", 2017), ("경기도", 2017)]
si = pd.Series(np.random.RandomState(0).randint(1,10,2), index=ind)
si

(서울, 2017)     5
(경기도, 2017)    2
dtype: int64

In [65]:
# Look up a value by its tuple label, written without parentheses.
si['서울',2017]

5

In [66]:
# Same lookup with the label written as an explicit tuple.
si[('서울',2017)]

5

In [67]:
# Build a MultiIndex from the list of (region, year) tuples.
# NOTE: this rebinds the name `index` used by earlier cells.
index = pd.MultiIndex.from_tuples(ind)
index

MultiIndex(levels=[['경기도', '서울'], [2017]],
           labels=[[1, 0], [0, 0]])