<a href="https://colab.research.google.com/github/JakeOh/202205_itw_bd34/blob/main/da09_merge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrame 합치기

In [1]:
# 필요한 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# merge

두 개의 DataFrame을 공통된 컬럼(들)을 기준으로 합치는 것. SQL의 join과 같은 기능.

In [2]:
# Employee table, one row per employee; deptno is the column it shares with
# the department table.
emp = pd.DataFrame(
    data=[
        (100, 'Scott', 10),
        (101, 'King', 20),
        (200, 'Allen', 10),
        (201, 'Tiger', 50),
    ],
    columns=['empno', 'ename', 'deptno'],
)

In [3]:
# Show the employee table as the cell output.
emp

Unnamed: 0,empno,ename,deptno
0,100,Scott,10
1,101,King,20
2,200,Allen,10
3,201,Tiger,50


In [4]:
# Department table, one row per department, keyed by deptno.
dept = pd.DataFrame(
    data=[
        (10, 'HR'),
        (20, 'IT'),
        (30, 'Sales'),
    ],
    columns=['deptno', 'dname'],
)

In [5]:
# Show the department table as the cell output.
dept

Unnamed: 0,deptno,dname
0,10,HR
1,20,IT
2,30,Sales


merge(join)

*   `pd.merge(df1, df2, how, on, ...)` 함수
*   `pd.DataFrame.merge(df, how, on, ...)` 메서드
*   `how` 파라미터: merge(join) 방식. inner, left, right, outer
    *   merge 방식의 기본값은 inner. `how='inner'`는 생략 가능
*   `on` 파라미터: merge(join)의 기준이 되는 컬럼(컬럼들의 리스트).
    *   merge하려는 2개의 데이터 프레임에서 기준 컬럼의 이름이 같은 경우에는 on 파라미터를 생략할 수 있음. 생략하면 두 데이터 프레임에 공통으로 있는 모든 컬럼이 기준이 됨.


## inner join

In [6]:
# Inner join on deptno: only department numbers present in both frames survive.
# Both keyword arguments are optional here: 'inner' is the default for `how`,
# and `on` may be dropped because deptno has the same name in emp and dept.
pd.merge(left=emp, right=dept, on='deptno', how='inner')

Unnamed: 0,empno,ename,deptno,dname
0,100,Scott,10,HR
1,200,Allen,10,HR
2,101,King,20,IT


In [7]:
# Method form of the inner join, with `how` and `on` left at their defaults.
emp.merge(right=dept)

Unnamed: 0,empno,ename,deptno,dname
0,100,Scott,10,HR
1,200,Allen,10,HR
2,101,King,20,IT


## left (outer) join

In [8]:
# Left join: every emp row is kept; an employee whose deptno is missing from
# dept gets an empty dname.
pd.merge(left=emp, right=dept, how='left')

Unnamed: 0,empno,ename,deptno,dname
0,100,Scott,10,HR
1,101,King,20,IT
2,200,Allen,10,HR
3,201,Tiger,50,


## right (outer) join

In [9]:
# Right join: every dept row is kept; a department with no employee gets
# empty empno and ename values.
pd.merge(left=emp, right=dept, how='right')

Unnamed: 0,empno,ename,deptno,dname
0,100.0,Scott,10,HR
1,200.0,Allen,10,HR
2,101.0,King,20,IT
3,,,30,Sales


## (full) outer join

In [10]:
# Full outer join: rows from both frames are kept, with the unmatched side
# left empty.
pd.merge(left=emp, right=dept, how='outer')

Unnamed: 0,empno,ename,deptno,dname
0,100.0,Scott,10,HR
1,200.0,Allen,10,HR
2,101.0,King,20,IT
3,201.0,Tiger,50,
4,,,30,Sales


## join의 기준이 되는 컬럼 이름이 다른 경우

In [11]:
# Employee table whose join key is named deptno.
employees = pd.DataFrame(dict(
    empno=[100, 101, 200, 201],
    ename=['Scott', 'King', 'Allen', 'Tiger'],
    deptno=[10, 20, 10, 50],
))
employees

Unnamed: 0,empno,ename,deptno
0,100,Scott,10
1,101,King,20
2,200,Allen,10
3,201,Tiger,50


In [12]:
# Department table whose key column is named dno (not deptno).
departments = pd.DataFrame(
    data=[
        (10, 'HR'),
        (20, 'IT'),
        (30, 'Sales'),
    ],
    columns=['dno', 'dname'],
)
departments

Unnamed: 0,dno,dname
0,10,HR
1,20,IT
2,30,Sales


In [13]:
# The key columns have different names, so each side names its own key.
# Both key columns (deptno and dno) appear in the result.
pd.merge(
    left=employees,
    right=departments,
    left_on='deptno',
    right_on='dno',
)

Unnamed: 0,empno,ename,deptno,dno,dname
0,100,Scott,10,10,HR
1,200,Allen,10,10,HR
2,101,King,20,20,IT


## 인덱스를 사용한 merge

In [15]:
# Seed NumPy's global generator so the random values drawn below repeat.
np.random.seed(1)

# val_1 is drawn from [0, 10) first, then val_2 from [0, 100); the rows are
# labelled 'a' through 'd'.
df1 = pd.DataFrame(
    {
        'val_1': np.random.randint(0, 10, size=4),
        'val_2': np.random.randint(0, 100, size=4),
    },
    index=list('abcd'),
)
df1

Unnamed: 0,val_1,val_2
a,5,79
b,8,64
c,9,16
d,5,1


In [18]:
# Two columns of standard-normal samples drawn from NumPy's global generator
# (data_1 first, then data_2); the row labels are deliberately not in
# alphabetical order.
df2 = pd.DataFrame(
    {
        'data_1': np.random.standard_normal(4),
        'data_2': np.random.standard_normal(4),
    },
    index=list('dbca'),
)
df2

Unnamed: 0,data_1,data_2
d,-0.528172,1.744812
b,-1.072969,-0.761207
c,0.865408,0.319039
a,-2.301539,-0.24937


In [19]:
# Join on the row labels of both frames instead of on a column.
pd.merge(left=df1, right=df2, left_index=True, right_index=True)

Unnamed: 0,val_1,val_2,data_1,data_2
a,5,79,-2.301539,-0.24937
b,8,64,-1.072969,-0.761207
c,9,16,0.865408,0.319039
d,5,1,-0.528172,1.744812


## 인덱스와 컬럼을 사용한 merge

In [20]:
# Here the labels 'd', 'b', 'c', 'a' are stored in an ordinary column (name)
# rather than in the index; data holds random integers from [0, 100).
df3 = pd.DataFrame(
    {
        'name': list('dbca'),
        'data': np.random.randint(0, 100, size=4),
    },
)
df3

Unnamed: 0,name,data
0,d,29
1,b,14
2,c,50
3,a,68


In [24]:
# Match df1's row labels against the values in df3's name column.
pd.merge(left=df1, right=df3, left_index=True, right_on='name')

Unnamed: 0,val_1,val_2,name,data
3,5,79,a,68
1,8,64,b,14
2,9,16,c,50
0,5,1,d,29
