## How to concat and merge dataframes?

### Selecting a Merge-Function
#### df1.append(df2): stacking vertically (deprecated since pandas 1.4 and removed in pandas 2.0; use pd.concat([df1,df2]) instead)
#### pd.concat([df1,df2],axis=0 or 1): stacking many dataframes vertically (axis=0) or horizontally (axis=1) (simple inner/outer joins on indexes)
#### df1.join(df2): inner/outer/left/right joins on indexes
#### pd.merge(df1,df2): SQL-style joins (inner/outer/left/right) on one or more cols and/or indexes

#### best practice: concat and merge

In [4]:
import pandas as pd

In [5]:
#build the lookup table `movies`: one row per film, with movie_id running from 1 to 5
import numpy as np

movie_ids = np.arange(1, 6)
movie_titles = [
    'Toy Story (1995)',
    'Golden Eye (1995)',
    'Four Rooms (1995)',
    'Get Shorty (1995)',
    'Copycat (1995)',
]
movies = pd.DataFrame({'movie_id': movie_ids, 'movie': movie_titles})
movies

Unnamed: 0,movie_id,movie
0,1,Toy Story (1995)
1,2,Golden Eye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
#build the dataframe `ratings` from s rows of random integers
#(no seed is set, so the values differ on every run)
#randint excludes the upper bound: user_id 0..99, movie_id 1..5, rating 1..3
s = 10
ratings = pd.DataFrame(dict(
    user_id=np.random.randint(0, 100, size=s),
    movie_id=np.random.randint(1, 6, size=s),
    rating=np.random.randint(1, 4, size=s),
    timestamp=np.random.randint(878871216, 890387596, size=s),
))
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,5,5,2,880209267
1,56,5,1,885802805
2,5,5,1,882365327
3,36,1,1,889641582
4,34,5,3,884916453
5,87,5,2,879133726
6,59,5,1,887803789
7,60,5,1,882977380
8,0,4,3,889875874
9,11,2,2,881365858


### Show details of the dataframes

In [7]:
#show all columns of every rating that belongs to movie_id 1
is_first_movie = ratings['movie_id'] == 1
ratings.loc[is_first_movie, :]

Unnamed: 0,user_id,movie_id,rating,timestamp
3,36,1,1,889641582


In [8]:
#list the column labels of movies (to compare them with those of ratings)
movies.columns

Index(['movie_id', 'movie'], dtype='object')

In [9]:
#list the column labels of ratings (to compare them with those of movies)
ratings.columns

Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')

### How to use merge

In [10]:
#IMPORTANT: PANDAS
#merge #pandas

#without an `on` argument pd.merge joins on the cols that carry the same name
#in both dataframes (here: movie_id); that key col appears only once in the result
#col order of the result: first the cols of the left dataframe, then the remaining cols of the right one
movie_ratings = pd.merge(left=movies, right=ratings)
movie_ratings.columns

Index(['movie_id', 'movie', 'user_id', 'rating', 'timestamp'], dtype='object')

In [11]:
#the (default) inner merge goes through the movie_ids of the left dataframe (movies) in order
#and emits one row per matching row of ratings (first movie_id first, then the 2nd, 3rd,...);
#a movie_id that has no matching row in ratings produces no row in the result
movie_ratings

Unnamed: 0,movie_id,movie,user_id,rating,timestamp
0,1,Toy Story (1995),36,1,889641582
1,2,Golden Eye (1995),11,2,881365858
2,4,Get Shorty (1995),0,3,889875874
3,5,Copycat (1995),5,2,880209267
4,5,Copycat (1995),56,1,885802805
5,5,Copycat (1995),5,1,882365327
6,5,Copycat (1995),34,3,884916453
7,5,Copycat (1995),87,2,879133726
8,5,Copycat (1995),59,1,887803789
9,5,Copycat (1995),60,1,882977380


In [12]:
#print the shape of both input dataframes, followed by the shape of the merged result
for frame in (movies, ratings, movie_ratings):
    print(frame.shape)

(5, 2)
(10, 4)
(10, 5)


### How to merge when the key cols have different names

In [13]:
#rename the cols of movies so that movies and ratings no longer share a col-name,
#then print both sets of col-names to compare them
movies.columns = ['m_id', 'title']
print(movies.columns, ratings.columns, sep='\n')

Index(['m_id', 'title'], dtype='object')
Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')


In [15]:
#IMPORTANT: PANDAS
#merge #mergeoncols #pandas

#if the key col is named differently in the two dataframes, name it once per side;
#both key cols (m_id and movie_id) are kept in the result
pd.merge(
    left=movies,
    right=ratings,
    left_on='m_id',
    right_on='movie_id',
)

Unnamed: 0,m_id,title,user_id,movie_id,rating,timestamp
0,1,Toy Story (1995),36,1,1,889641582
1,2,Golden Eye (1995),11,2,2,881365858
2,4,Get Shorty (1995),0,4,3,889875874
3,5,Copycat (1995),5,5,2,880209267
4,5,Copycat (1995),56,5,1,885802805
5,5,Copycat (1995),5,5,1,882365327
6,5,Copycat (1995),34,5,3,884916453
7,5,Copycat (1995),87,5,2,879133726
8,5,Copycat (1995),59,5,1,887803789
9,5,Copycat (1995),60,5,1,882977380


### How to join on indexes

In [16]:
#move the col m_id into the index so that movies can be joined on its index
movies = movies.set_index('m_id')
movies

Unnamed: 0_level_0,title
m_id,Unnamed: 1_level_1
1,Toy Story (1995)
2,Golden Eye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)


In [17]:
#IMPORTANT: PANDAS
#merge #mergeonindex #pandas

#merge (one index): the index of movies (left) is matched against the col movie_id of ratings (right)
#the result carries the index of the right dataframe
pd.merge(
    left=movies,
    right=ratings,
    left_index=True,
    right_on='movie_id',
)

Unnamed: 0,title,user_id,movie_id,rating,timestamp
3,Toy Story (1995),36,1,1,889641582
9,Golden Eye (1995),11,2,2,881365858
8,Get Shorty (1995),0,4,3,889875874
0,Copycat (1995),5,5,2,880209267
1,Copycat (1995),56,5,1,885802805
2,Copycat (1995),5,5,1,882365327
4,Copycat (1995),34,5,3,884916453
5,Copycat (1995),87,5,2,879133726
6,Copycat (1995),59,5,1,887803789
7,Copycat (1995),60,5,1,882977380


### How to join on two indexes

In [18]:
#move the col movie_id into the index so that ratings can be joined on its index as well
#(the index is not unique: a movie_id may occur in several rows)
ratings = ratings.set_index('movie_id')
ratings

Unnamed: 0_level_0,user_id,rating,timestamp
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,5,2,880209267
5,56,1,885802805
5,5,1,882365327
1,36,1,889641582
5,34,3,884916453
5,87,2,879133726
5,59,1,887803789
5,60,1,882977380
4,0,3,889875874
2,11,2,881365858


In [19]:
#IMPORTANT: PANDAS
#merge #mergeontwoindexes #pandas

#merge (two indexes): the rows are matched on the index values of both dataframes
#the matched index values (the join keys) form the index of the result
pd.merge(
    left=movies,
    right=ratings,
    left_index=True,
    right_index=True,
)

Unnamed: 0,title,user_id,rating,timestamp
1,Toy Story (1995),36,1,889641582
2,Golden Eye (1995),11,2,881365858
4,Get Shorty (1995),0,3,889875874
5,Copycat (1995),5,2,880209267
5,Copycat (1995),56,1,885802805
5,Copycat (1995),5,1,882365327
5,Copycat (1995),34,3,884916453
5,Copycat (1995),87,2,879133726
5,Copycat (1995),59,1,887803789
5,Copycat (1995),60,1,882977380


### Inner, Outer, Left and Right Merge

In [20]:
#two small example dataframes that share the col `color`:
#green and yellow occur in both, red only in A, pink only in B
A = pd.DataFrame(dict(
    color=['green', 'yellow', 'red'],
    num=[1, 2, 3],
))
B = pd.DataFrame(dict(
    color=['green', 'yellow', 'pink'],
    size=['S', 'M', 'L'],
))

### Inner

In [21]:
#IMPORTANT: PANDAS
#merge #inner #pandas
#inner: keep only the rows whose key (color) is found in both A and B
pd.merge(left=A, right=B, how='inner')

Unnamed: 0,color,num,size
0,green,1,S
1,yellow,2,M


### Outer

In [22]:
#IMPORTANT: PANDAS
#merge #outer #pandas
#outer: keep the rows whose key (color) is found in A or in B (or in both);
#values missing on one side are filled with NaN
pd.merge(left=A, right=B, how='outer')

Unnamed: 0,color,num,size
0,green,1.0,S
1,yellow,2.0,M
2,red,3.0,
3,pink,,L


### Left

In [23]:
#IMPORTANT: PANDAS
#merge #left #pandas
#left: keep every row of A; cols coming from B are NaN where B has no matching key
pd.merge(left=A, right=B, how='left')

Unnamed: 0,color,num,size
0,green,1,S
1,yellow,2,M
2,red,3,


### Right

In [24]:
#IMPORTANT: PANDAS
#merge #right #pandas
#right: keep every row of B; cols coming from A are NaN where A has no matching key
pd.merge(left=A, right=B, how='right')

Unnamed: 0,color,num,size
0,green,1.0,S
1,yellow,2.0,M
2,pink,,L
