In [1]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
# NOTE: the from-import below rebinds the name `datetime` to the datetime
# class, so after these two lines the module imported by `import datetime`
# is no longer reachable under that name.
import datetime
from datetime import datetime, date

# Pandas display options: plain-text (non-HTML) reprs, plus limits on the
# number of columns and rows shown and on the output width.
display_options = {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 80,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# bring in matplotlib for graphics; the %matplotlib inline magic makes
# figures render inside the Jupyter notebook
import matplotlib.pyplot as plt
%matplotlib inline

# Read the S&P 500 CSV, keeping only the columns at positions 0, 2, 3 and 7
# and using the 'Symbol' column as the row index.
sp500 = pd.read_csv(
    "../data/sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

# The importance of indexes

In [2]:
# Seed NumPy's global generator so the random column is reproducible, then
# build a 10,000-row frame: random floats in 'foo' and a unique integer
# 'key' running from 100 through 10099.
np.random.seed(123456)
n_rows = 10000
first_key = 100
df = pd.DataFrame({
    'foo': np.random.random(n_rows),
    'key': range(first_key, first_key + n_rows),
})
df.head()

        foo  key
0  0.126970  100
1  0.966718  101
2  0.260476  102
3  0.897237  103
4  0.376750  104

In [4]:
# boolean-mask selection: keep only the rows whose 'key' equals 10099
df.loc[df['key'] == 10099]

           foo    key
9999  0.272283  10099

In [5]:
# time the boolean-mask select (the mask compares every value in the
# 'key' column against 10099)
%timeit df[df.key==10099]

619 µs ± 90 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [6]:
# move 'key' out of the columns and into the index so that rows can be
# looked up by label
df_with_index = df.set_index('key')
df_with_index.head()

          foo
key          
100  0.126970
101  0.966718
102  0.260476
103  0.897237
104  0.376750

In [7]:
# label-based lookup of the row whose index value is 10099
df_with_index.loc[10099]

foo    0.272283
Name: 10099, dtype: float64

In [8]:
# time the label-based lookup through the index
%timeit df_with_index.loc[10099]

105 µs ± 2.36 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# The fundamental index type: Index

In [9]:
# A two-row frame of cities and temperatures, used to show that a
# DataFrame's column labels are themselves stored in an index.
city_names = ['Missoula', 'Philadelphia']
city_temperatures = [70, 80]
temps = pd.DataFrame({
    "City": city_names,
    'Temperature': city_temperatures,
})
temps

           City  Temperature
0      Missoula           70
1  Philadelphia           80

In [10]:
# the column labels are held in an Index object
temps.columns

Index(['City', 'Temperature'], dtype='object')

# Integer index labels using Int64Index and RangeIndex

In [12]:
# Supply an explicit array of integer labels (0 through 9) as the index, so
# the frame carries an integer-valued index built from those labels.
int_labels = np.arange(0, 10)
df_i64 = pd.DataFrame(data=np.arange(10, 20), index=int_labels)
df_i64.head()

    0
0  10
1  11
2  12
3  13
4  14

In [14]:
# inspect the index that was built from the explicit integer labels
df_i64.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [15]:
# No index is passed here, so pandas supplies its default index for the
# five rows.
df_range = pd.DataFrame(data=np.arange(10, 15))
df_range.head()

    0
0  10
1  11
2  12
3  13
4  14

In [16]:
# inspect the default index pandas created when none was supplied
df_range.index

RangeIndex(start=0, stop=5, step=1)

# Floating point labels using Float64Index

In [17]:
# 200 rows holding the multiples of 5 below 1000, labelled by the floats
# 0.0, 0.5, ..., 99.5.
float_labels = np.arange(0.0, 100, 0.5)
df_f64 = pd.DataFrame(data=np.arange(0, 1000, 5), index=float_labels)
df_f64.head()

      0
0.0   0
0.5   5
1.0  10
1.5  15
2.0  20

In [18]:
# inspect the index built from the floating-point labels
df_f64.index

Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,  3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0, 98.5, 99.0, 99.5],
             dtype='float64', length=200)

# Representing discrete intervals using IntervalIndex

In [20]:
# A DataFrame whose rows are labelled by discrete intervals.
# IntervalIndex.from_breaks builds the index from an array of split points:
# each pair of neighbouring breaks becomes one interval, closed on the right
# by default.
interval_breaks = [0, 0.5, 1.0, 1.5, 2.0]
df_interval = pd.DataFrame(
    {"A": [1, 2, 3, 4]},
    index=pd.IntervalIndex.from_breaks(interval_breaks),
)
df_interval

            A
(0.0, 0.5]  1
(0.5, 1.0]  2
(1.0, 1.5]  3
(1.5, 2.0]  4

In [21]:
# inspect the IntervalIndex that labels the rows
df_interval.index

IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]],
              closed='right',
              dtype='interval[float64]')

In [22]:
# look up a scalar: this selects the row whose interval contains 1.3
df_interval.loc[1.3]

A    3
Name: (1.0, 1.5], dtype: int64

# Categorical values as an index: CategoricalIndex

In [23]:
# Build a six-row frame and convert the string column 'B' to the category
# dtype in the same expression.
df_categorical = pd.DataFrame({
    'A': np.arange(6),
    'B': list('aabbca'),
}).astype({'B': 'category'})
df_categorical

   A  B
0  0  a
1  1  a
2  2  b
3  3  b
4  4  c
5  5  a

In [24]:
# shift the categorical column to the index
# NOTE: this rebinds df_categorical to the re-indexed frame, so 'B' is no
# longer a column afterwards; running this cell a second time on its own
# fails in set_index because the 'B' column is gone.
df_categorical = df_categorical.set_index('B')
df_categorical.index # CategoricalIndex: an immutable index that implements an ordered, sliceable set of labels

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category')

In [25]:
# select every row whose index label is the category 'a'
df_categorical.loc['a']

   A
B   
a  0
a  1
a  5