# Database-like operations

In [1]:
# Uncomment the lines below to install the notebook's dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install numpy
# %pip install pandas

In [2]:
import numpy as np
import pandas as pd

## Concatenate

In this example, we take three DataFrames that share the same column names (`A`, `B`, `C`) but have different index labels, and stack them with the `concat()` function.

In [3]:
# Three-row frame in which every column (A, B, C) repeats the index labels 0-2.
df1 = pd.DataFrame({col: [0, 1, 2] for col in ('A', 'B', 'C')},
                   index=[0, 1, 2])
df1

Unnamed: 0,A,B,C
0,0,0,0
1,1,1,1
2,2,2,2


In [4]:
# Four-row frame continuing the sequence: columns A, B, C hold 3-6, indexed 3-6.
df2 = pd.DataFrame({col: [3, 4, 5, 6] for col in ('A', 'B', 'C')},
                   index=[3, 4, 5, 6])
df2

Unnamed: 0,A,B,C
3,3,3,3
4,4,4,4
5,5,5,5
6,6,6,6


In [5]:
# Last of the three frames: columns A, B, C hold 7-9, indexed 7-9.
df3 = pd.DataFrame({col: [7, 8, 9] for col in ('A', 'B', 'C')},
                   index=[7, 8, 9])
df3

Unnamed: 0,A,B,C
7,7,7,7
8,8,8,8
9,9,9,9


In [6]:
# Stack the three frames on top of each other (row-wise, axis=0).
frames = [df1, df2, df3]
res = pd.concat(frames, axis=0)

In [7]:
# Print the (rows, columns) shape, then render the concatenated frame.
print(tuple(res.shape))
res

(10, 3)


Unnamed: 0,A,B,C
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4
5,5,5,5
6,6,6,6
7,7,7,7
8,8,8,8
9,9,9,9


In [8]:
# Label each source frame with a key ('a', 'b', 'c'); the keys become the
# outer level of a hierarchical (two-level) row index.
res1 = pd.concat(frames, axis=0, keys=['a', 'b', 'c'])
res1

Unnamed: 0,Unnamed: 1,A,B,C
a,0,0,0,0
a,1,1,1,1
a,2,2,2,2
b,3,3,3,3
b,4,4,4,4
b,5,5,5,5
b,6,6,6,6
c,7,7,7,7
c,8,8,8,8
c,9,9,9,9


In [9]:
# Select every row stored under the outer index key 'c'.
# The result is a DataFrame indexed by the remaining (inner) level only.
res1.loc['c']

Unnamed: 0,A,B,C
7,7,7,7
8,8,8,8
9,9,9,9


In [10]:
# Select the single row identified by the full (outer, inner) key ('c', 9);
# the row comes back as a Series whose labels are the column names.
res1.loc[('c', 9)]

A    9
B    9
C    9
Name: (c, 9), dtype: int64

Next, we take two DataFrames whose indexes only partially overlap (and whose columns differ) and concatenate them using the `concat()` function.

In [11]:
# Frame with columns E and F, labelled 1, 2, 3, 5 (label 4 is skipped).
df4 = pd.DataFrame({col: [1, 2, 3, 5] for col in ('E', 'F')},
                   index=[1, 2, 3, 5])
df4

Unnamed: 0,E,F
1,1,1
2,2,2
3,3,3
5,5,5


In [12]:
# No axis is given, so concat() falls back to its default, axis=0:
# rows are stacked and the result carries the union of the columns.
pd.concat(objs=[df1, df4])

Unnamed: 0,A,B,C,E,F
0,0.0,0.0,0.0,,
1,1.0,1.0,1.0,,
2,2.0,2.0,2.0,,
1,,,,1.0,1.0
2,,,,2.0,2.0
3,,,,3.0,3.0
5,,,,5.0,5.0


Notice the missing values are filled with `NaN`

In [13]:
# Concatenate along the second axis (columns): frames are placed side by side
# and rows are aligned on their index labels.
res2 = pd.concat([df1, df4], axis='columns')
res2

Unnamed: 0,A,B,C,E,F
0,0.0,0.0,0.0,,
1,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0
3,,,,3.0,3.0
5,,,,5.0,5.0


In [14]:
# Replace the index with 0..n-1 and keep the old labels in a new 'index' column.
res2.reset_index(drop=False)

Unnamed: 0,index,A,B,C,E,F
0,0,0.0,0.0,0.0,,
1,1,1.0,1.0,1.0,1.0,1.0
2,2,2.0,2.0,2.0,2.0,2.0
3,3,,,,3.0,3.0
4,5,,,,5.0,5.0


In [15]:
# Replace the index with 0..n-1 and discard the old labels instead of
# keeping them as an extra column; res2 itself is not modified.
res2.reset_index(drop=True, inplace=False)

Unnamed: 0,A,B,C,E,F
0,0.0,0.0,0.0,,
1,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0
3,,,,3.0,3.0
4,,,,5.0,5.0


The following examples show how to execute something equivalent to an INNER JOIN between two database tables. In this case, it will involve two DataFrames.

`join = 'inner'` produces the following result:

In [16]:
# Side-by-side concatenation that keeps only the index labels found in both frames.
pd.concat(objs=[df1, df4], axis='columns', join='inner')

Unnamed: 0,A,B,C,E,F
1,1,1,1,1,1
2,2,2,2,2,2


You only get the rows belonging to both DataFrames.

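If you do not specify `join = 'inner'`, `concat()` uses its default, `join = 'outer'`, which keeps the union of the labels instead of their intersection, so you get a bigger DataFrame (comparable to a SQL FULL OUTER JOIN, not a CROSS JOIN). Notice that missing values are filled with NaN. Note that the next two cells also omit `axis=1`, so the DataFrames are stacked row-wise (`axis=0`) and the union of their columns is kept.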

In [17]:
# No join argument: the default join is used. Rows are stacked along axis=0.
pd.concat([df1, df4], axis=0)

Unnamed: 0,A,B,C,E,F
0,0.0,0.0,0.0,,
1,1.0,1.0,1.0,,
2,2.0,2.0,2.0,,
1,,,,1.0,1.0
2,,,,2.0,2.0
3,,,,3.0,3.0
5,,,,5.0,5.0


In [18]:
# 'outer' is the default option for the join parameter, so spelling it out
# gives the same result as omitting it.
pd.concat([df1, df4], axis=0, join='outer')

Unnamed: 0,A,B,C,E,F
0,0.0,0.0,0.0,,
1,1.0,1.0,1.0,,
2,2.0,2.0,2.0,,
1,,,,1.0,1.0
2,,,,2.0,2.0
3,,,,3.0,3.0
5,,,,5.0,5.0


## Merge

The different types of join or merge in Pandas:
- `Inner Join`: To keep only the rows that match in both DataFrames, specify `how='inner'`.
- `Outer Join` or Full outer join: To keep all rows from both DataFrames, specify `how='outer'`.
- `Left Join` or Left outer join: To include all the rows of your DataFrame x and only those from y that match, specify `how='left'`.
- `Right Join` or Right outer join: To include all the rows of your DataFrame y and only those from x that match, specify `how='right'`.

![image info](./Merge.png)

In [19]:
# Customer -> product table used by the merge examples.
# NOTE: this rebinds df1, which held the A/B/C frame in the Concatenate section.
d1 = {
    'Customer_id': pd.Series([1, 2, 3, 4, 5, 6]),
    'Product': pd.Series(['Radio'] * 3 + ['Television'] * 3),
}
df1 = pd.DataFrame(data=d1)
df1

Unnamed: 0,Customer_id,Product
0,1,Radio
1,2,Radio
2,3,Radio
3,4,Television
4,5,Television
5,6,Television


In [20]:
# Customer -> state table used by the merge examples.
# NOTE: this rebinds df2, which held an A/B/C frame in the Concatenate section.
d2 = {
    'Customer_id': pd.Series(list(range(2, 11, 2))),
    'State': pd.Series(['Nevada', 'Nevada', 'Texas', 'Florida', 'Florida']),
}
df2 = pd.DataFrame(data=d2)
df2

Unnamed: 0,Customer_id,State
0,2,Nevada
1,4,Nevada
2,6,Texas
3,8,Florida
4,10,Florida


**Inner Join**

In [21]:
# Inner join: keep only the Customer_id values present in both frames.
df1.merge(df2, on='Customer_id', how='inner')

Unnamed: 0,Customer_id,Product,State
0,2,Radio,Nevada
1,4,Television,Nevada
2,6,Television,Texas


**Outer Join**

In [22]:
# Outer join: keep every Customer_id from either frame; values missing on
# one side are left as NaN.
df1.merge(df2, on='Customer_id', how='outer')

Unnamed: 0,Customer_id,Product,State
0,1,Radio,
1,2,Radio,Nevada
2,3,Radio,
3,4,Television,Nevada
4,5,Television,
5,6,Television,Texas
6,8,,Florida
7,10,,Florida


**Left Join**

In [23]:
# Left join: keep every row of df1 and attach State where df2 has a match.
df1.merge(df2, on='Customer_id', how='left')

Unnamed: 0,Customer_id,Product,State
0,1,Radio,
1,2,Radio,Nevada
2,3,Radio,
3,4,Television,Nevada
4,5,Television,
5,6,Television,Texas


**Right Join**

In [24]:
# Right join: keep every row of df2 and attach Product where df1 has a match.
df1.merge(df2, on='Customer_id', how='right')

Unnamed: 0,Customer_id,Product,State
0,2,Radio,Nevada
1,4,Television,Nevada
2,6,Television,Texas
3,8,,Florida
4,10,,Florida


## Join

`join()` is a DataFrame method that combines the columns of two or more DataFrames by matching their row labels.

The `df.join()` method joins columns with another DataFrame either on the index or on a key column.

`DataFrame.join()` is a convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.

In [25]:
# Left-hand frame for the join examples: columns Col_A/Col_B, labelled a, b, c.
left = pd.DataFrame(
    data={
        'Col_A': ['A0', 'A1', 'A2'],
        'Col_B': ['B0', 'B1', 'B2'],
    },
    index=['a', 'b', 'c'],
)
left

Unnamed: 0,Col_A,Col_B
a,A0,B0
b,A1,B1
c,A2,B2


In [26]:
# Right-hand frame for the join examples: columns Col_C/Col_D, labelled a, c, d
# (it shares only labels 'a' and 'c' with the left-hand frame).
right = pd.DataFrame(
    data={
        'Col_C': ['C0', 'C1', 'C2'],
        'Col_D': ['D0', 'D1', 'D2'],
    },
    index=['a', 'c', 'd'],
)
right

Unnamed: 0,Col_C,Col_D
a,C0,D0
c,C1,D1
d,C2,D2


In [27]:
# Default join is a left join on the index: every row of `left` is kept and
# columns from `right` are NaN where `right` has no matching label.
left.join(right, how='left')

Unnamed: 0,Col_A,Col_B,Col_C,Col_D
a,A0,B0,C0,D0
b,A1,B1,,
c,A2,B2,C1,D1


In [28]:
# Same default (left) join with the roles swapped: every row of `right` is
# kept and columns from `left` are NaN where `left` has no matching label.
right.join(left, how='left')

Unnamed: 0,Col_C,Col_D,Col_A,Col_B
a,C0,D0,A0,B0
c,C1,D1,A2,B2
d,C2,D2,,


In [29]:
# Inner join on the index: keep only the labels present in both frames.
left.join(other=right, how='inner')

Unnamed: 0,Col_A,Col_B,Col_C,Col_D
a,A0,B0,C0,D0
c,A2,B2,C1,D1


In [30]:
# Inner join with the roles swapped: same rows as before, but the columns of
# `right` come first.
right.join(other=left, how='inner')

Unnamed: 0,Col_C,Col_D,Col_A,Col_B
a,C0,D0,A0,B0
c,C1,D1,A2,B2


Reference:
- VanderPlas, J. (2017) Python Data Science Handbook: Essential Tools for Working with Data. USA: O’Reilly Media, Inc. chapter 3