In [1]:
import numpy as np
import pandas as pd

## 시리즈

In [2]:
# Create a Series from a list of values; np.nan marks a missing entry
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build a daily DatetimeIndex of six consecutive dates starting 2013-01-01
dates = pd.date_range(start='2013-01-01', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

## 데이터프레임

In [4]:
# Create a DataFrame: 6 rows indexed by `dates`, 4 columns (A-D) of standard-normal samples.
# A seeded generator makes the values identical on every Restart & Run All.
# The outputs recorded in this notebook came from an earlier, unseeded run;
# re-execute the notebook to refresh them.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976
2013-01-04,-0.843882,-0.66182,0.578336,-1.315972
2013-01-05,-1.757923,0.255046,0.46381,0.73763
2013-01-06,-2.077071,1.282669,-1.006562,0.158308


In [5]:
# Preview the first rows of the frame (the recorded output shows 5 of the 6 rows)
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976
2013-01-04,-0.843882,-0.66182,0.578336,-1.315972
2013-01-05,-1.757923,0.255046,0.46381,0.73763


In [6]:
# Inspect the row index
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Inspect the column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [8]:
# Inspect the underlying data as a NumPy array.
# to_numpy() is the accessor pandas recommends over the older .values attribute;
# for this all-float frame it yields the same 6x4 float64 array.
df.to_numpy()

array([[-0.46555204, -0.0923855 ,  0.73067862,  0.02552729],
       [-1.12345227, -1.06167337, -1.46917185,  0.58932898],
       [-1.56147908, -0.56794354, -0.21490133, -0.36497639],
       [-0.84388217, -0.66182023,  0.57833581, -1.31597206],
       [-1.75792347,  0.255046  ,  0.46380958,  0.73762961],
       [-2.07707114,  1.28266884, -1.00656169,  0.15830757]])

In [9]:
# Print a summary of the DataFrame: index range, columns, non-null counts, dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [10]:
# Summary statistics per column (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-1.304893,-0.141018,-0.152968,-0.028359
std,0.602873,0.835131,0.912239,0.74529
min,-2.077071,-1.061673,-1.469172,-1.315972
25%,-1.708812,-0.638351,-0.808647,-0.26735
50%,-1.342466,-0.330165,0.124454,0.091917
75%,-0.913775,0.168188,0.549704,0.481574
max,-0.465552,1.282669,0.730679,0.73763


In [11]:
# Sort the rows by the values in column B
df.sort_values(by='B', ascending=False)
# The ascending option switches between ascending and descending order; False gives descending.

Unnamed: 0,A,B,C,D
2013-01-06,-2.077071,1.282669,-1.006562,0.158308
2013-01-05,-1.757923,0.255046,0.46381,0.73763
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976
2013-01-04,-0.843882,-0.66182,0.578336,-1.315972
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329


In [12]:
# Select a single column by label
df['A']
# Selecting one column returns it as a Series.

2013-01-01   -0.465552
2013-01-02   -1.123452
2013-01-03   -1.561479
2013-01-04   -0.843882
2013-01-05   -1.757923
2013-01-06   -2.077071
Freq: D, Name: A, dtype: float64

In [13]:
# View the first three rows
# df.head(3)
df[:3]
# Either form works.

Unnamed: 0,A,B,C,D
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976


In [14]:
# Rows can also be sliced by index label; with labels the end point ('20130104') is included
df['20130101':'20130104']

Unnamed: 0,A,B,C,D
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976
2013-01-04,-0.843882,-0.66182,0.578336,-1.315972


In [15]:
# Label-based selection with loc: the row whose index label is dates[0]
df.loc[dates[0]]

A   -0.465552
B   -0.092386
C    0.730679
D    0.025527
Name: 2013-01-01 00:00:00, dtype: float64

In [16]:
# All rows of columns A and B
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.465552,-0.092386
2013-01-02,-1.123452,-1.061673
2013-01-03,-1.561479,-0.567944
2013-01-04,-0.843882,-0.66182
2013-01-05,-1.757923,0.255046
2013-01-06,-2.077071,1.282669


In [17]:
# Restrict both the row range (by label, end point included) and the columns
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-1.123452,-1.061673
2013-01-03,-1.561479,-0.567944
2013-01-04,-0.843882,-0.66182


In [18]:
# A single row label combined with a list of columns returns a Series
df.loc['20130102',['A','B']]

A   -1.123452
B   -1.061673
Name: 2013-01-02 00:00:00, dtype: float64

In [19]:
# A single row label and a single column label return a scalar
df.loc[dates[0], 'A']

-0.4655520359385534

In [20]:
# Position-based selection with iloc
# Row at position 3 (zero-based), i.e. the fourth row of the DataFrame
df.iloc[3]

A   -0.843882
B   -0.661820
C    0.578336
D   -1.315972
Name: 2013-01-04 00:00:00, dtype: float64

In [21]:
# Rows at positions 3 and 4, columns at positions 0 and 1 (positional slice end points are excluded)
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.843882,-0.66182
2013-01-05,-1.757923,0.255046


In [22]:
# Pick specific rows and columns by position with lists instead of ranges
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-1.123452,-1.469172
2013-01-03,-1.561479,-0.214901
2013-01-05,-1.757923,0.46381


In [23]:
# A bare ':' selects everything along that axis: here, all columns of the rows at positions 1 and 2
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976


In [24]:
# All rows of the columns at positions 1 and 2
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.092386,0.730679
2013-01-02,-1.061673,-1.469172
2013-01-03,-0.567944,-0.214901
2013-01-04,-0.66182,0.578336
2013-01-05,0.255046,0.46381
2013-01-06,1.282669,-1.006562


In [25]:
# View a single column; attribute access gives the same Series as the bracket form
# df['A']
df.A

2013-01-01   -0.465552
2013-01-02   -1.123452
2013-01-03   -1.561479
2013-01-04   -0.843882
2013-01-05   -1.757923
2013-01-06   -2.077071
Freq: D, Name: A, dtype: float64

In [26]:
# Keep only the rows that satisfy a condition
# (in the output recorded below no row has A > 0, so the result is empty)
df[df.A > 0]

Unnamed: 0,A,B,C,D


In [27]:
# Apply a condition to the whole DataFrame
df[df>0]
# Cells that do not satisfy the condition become NaN (missing values).

Unnamed: 0,A,B,C,D
2013-01-01,,,0.730679,0.025527
2013-01-02,,,,0.589329
2013-01-03,,,,
2013-01-04,,,0.578336,
2013-01-05,,0.255046,0.46381,0.73763
2013-01-06,,1.282669,,0.158308


In [28]:
# Why copy() is used when duplicating a DataFrame
df2 = df.copy()
# copy() duplicates the contents as well, so df2 can be changed without affecting df.

In [29]:
# What happens without copy()?
df3 = df2

# Add a new column
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
# NOTE: only the last expression of a cell is rendered, so the bare df3 and df2
# lines below produce no output; only df is shown.
df3
df2
df
# Plain assignment copies only the reference: df3 and df2 are the same object, so
# column E also appears in df2 (it is read through df2 in the following cells),
# while df, from which df2 was copied, is unchanged.

Unnamed: 0,A,B,C,D
2013-01-01,-0.465552,-0.092386,0.730679,0.025527
2013-01-02,-1.123452,-1.061673,-1.469172,0.589329
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976
2013-01-04,-0.843882,-0.66182,0.578336,-1.315972
2013-01-05,-1.757923,0.255046,0.46381,0.73763
2013-01-06,-2.077071,1.282669,-1.006562,0.158308


In [30]:
# Test whether each value in column E is 'two' or 'four' (returns a boolean Series)
df2['E'].isin(['two','four'])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [31]:
# Use the boolean mask to filter the DataFrame
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.561479,-0.567944,-0.214901,-0.364976,two
2013-01-05,-1.757923,0.255046,0.46381,0.73763,four


## 두 데이터프레임 병합하기

In [32]:
# Three frames with the same columns A-D and consecutive, non-overlapping row labels
# (0-3, 4-7, 8-11); every cell holds its column letter followed by its row label, e.g. 'C5'.
df1, df2, df3 = (
    pd.DataFrame(
        {col: [f'{col}{row}' for row in labels] for col in 'ABCD'},
        index=list(labels),
    )
    for labels in (range(0, 4), range(4, 8), range(8, 12))
)

In [33]:
# First frame: row labels 0-3
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [34]:
# Second frame: row labels 4-7
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [35]:
# Third frame: row labels 8-11
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


### concat

In [37]:
# Combine DataFrames with concat
# By default (axis=0) the frames are stacked along the row axis, one below another
result = pd.concat([df1,df2,df3])
result
# With no options, concat simply appends the rows of each frame in order.

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [38]:
# Use the keys parameter to add an outer index level, producing a MultiIndex
result = pd.concat([df1,df2,df3], keys=['x','y','z'])
result

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9
z,10,A10,B10,C10,D10
z,11,A11,B11,C11,D11


In [39]:
# Inspect the resulting MultiIndex
result.index

MultiIndex([('x',  0),
            ('x',  1),
            ('x',  2),
            ('x',  3),
            ('y',  4),
            ('y',  5),
            ('y',  6),
            ('y',  7),
            ('z',  8),
            ('z',  9),
            ('z', 10),
            ('z', 11)],
           )

In [41]:
# Without parentheses this only evaluates to the bound method itself;
# get_level_values must be called with a level to return that level's labels.
result.index.get_level_values

<bound method MultiIndex.get_level_values of MultiIndex([('x',  0),
            ('x',  1),
            ('x',  2),
            ('x',  3),
            ('y',  4),
            ('y',  5),
            ('y',  6),
            ('y',  7),
            ('z',  8),
            ('z',  9),
            ('z', 10),
            ('z', 11)],
           )>

In [42]:
# Labels of level 0: the keys passed to concat
result.index.get_level_values(0)

Index(['x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z'], dtype='object')

In [44]:
# Labels of level 1: the original row labels of the concatenated frames
result.index.get_level_values(1)

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [45]:
# Setting the axis parameter
# df4 has row labels 2, 3, 6, 7 and columns B, D, F
df4 = pd.DataFrame(
    [['B2', 'D2', 'F2'],
     ['B3', 'D3', 'F3'],
     ['B6', 'D6', 'F6'],
     ['B7', 'D7', 'F7']],
    index=[2, 3, 6, 7],
    columns=['B', 'D', 'F'],
)

# axis=1 places the two frames side by side instead of stacking their rows
result = pd.concat(objs=[df1, df4], axis=1)

In [46]:
# Left-hand input of the axis=1 concat
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [47]:
# Right-hand input of the axis=1 concat
df4

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6
7,B7,D7,F7


In [48]:
result
# The frames are aligned on the index; labels present in only one frame get NaN in the other's columns.

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [49]:
# Setting the join parameter
# inner join: keep only the row labels present in both frames
result = pd.concat([df1, df4], axis=1, join='inner')
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [57]:
# Concatenate while discarding the original row labels of both frames
# ignore_index=True renumbers the rows of the result from 0
result = pd.concat([df1, df4], ignore_index=True)
result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


### merge

In [58]:
# Two frames sharing a 'key' column; K4 exists only in left and K1 only in right
left = pd.DataFrame(
    [('K0', 'A0', 'B0'),
     ('K4', 'A1', 'B1'),
     ('K2', 'A2', 'B2'),
     ('K3', 'A3', 'B3')],
    columns=['key', 'A', 'B'],
)

right = pd.DataFrame(
    [('K0', 'C0', 'D0'),
     ('K1', 'C1', 'D1'),
     ('K2', 'C2', 'D2'),
     ('K3', 'C3', 'D3')],
    columns=['key', 'C', 'D'],
)

In [59]:
# Left-hand frame for the merge examples
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K4,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [60]:
# Right-hand frame for the merge examples
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


In [61]:
# Merge on the shared 'key' column; only keys present in both frames appear in the result
pd.merge(left, right, on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K2,A2,B2,C2,D2
2,K3,A3,B3,C3,D3


In [62]:
# Merge using one side as the reference
# how='left' keeps every row of left; unmatched keys get NaN in the columns from right
pd.merge(left, right, how='left', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K4,A1,B1,,
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [63]:
pd.merge(left, right, how='right', on='key')
# Where a key has no match on the other side, the missing values are filled with NaN.

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K2,A2,B2,C2,D2
2,K3,A3,B3,C3,D3
3,K1,,,C1,D1


In [64]:
# inner: keep only the keys present in both frames
pd.merge(left, right, how='inner', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K2,A2,B2,C2,D2
2,K3,A3,B3,C3,D3


In [73]:
# result is still the ignore_index concat of df1 and df4 built in the concat section;
# drop returns a new frame without the rows labelled 1, 2 and 3 (result is not reassigned here).
result.drop([1,2,3])

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7
