In [6]:
#Merging Datasets

import pandas as pd

# Two small sample tables sharing an 'id' key column.
# ids 2, 3 and 4 occur in both tables; id 1 is only in df1, id 5 only in df2.
data1 = dict(
    id=[1, 2, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'David'],
)
data2 = dict(
    id=[2, 3, 4, 5],
    age=[24, 30, 22, 28],
)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# All four joins below use 'id' as the common key; only `how` changes.

# how='inner' -> keep only the ids present in BOTH frames (2, 3, 4).
merged_df = df1.merge(df2, on='id', how='inner')
print(f'Merged DataFrame:\n{merged_df}')
# Expected result:
#    id     name  age
# 0   2      Bob   24
# 1   3  Charlie   30
# 2   4    David   22

# how='left' -> keep every row of df1 and attach df2's columns where the id matches.
# id 1 has no partner in df2, so its 'age' is NaN (which also turns the
# 'age' column into floats in the printed result).
merged_df_left = df1.merge(df2, on='id', how='left')
print(f'Left Merged DataFrame:\n{merged_df_left}')

# how='right' -> mirror image of the left join: keep every row of df2 and
# attach df1's columns where the id matches. id 5 has no partner in df1,
# so its 'name' is NaN.
merged_df_right = df1.merge(df2, on='id', how='right')
print(f'Right Merged DataFrame:\n{merged_df_right}')

# how='outer' -> keep every id found in either frame; whichever side is
# missing for a given id is filled with NaN.
merged_df_outer = df1.merge(df2, on='id', how='outer')
print(f'Outer Merged DataFrame:\n{merged_df_outer}')


Merged DataFrame:
   id     name  age
0   2      Bob   24
1   3  Charlie   30
2   4    David   22
Left Merged DataFrame:
   id     name   age
0   1    Alice   NaN
1   2      Bob  24.0
2   3  Charlie  30.0
3   4    David  22.0
Right Merged DataFrame:
   id     name  age
0   2      Bob   24
1   3  Charlie   30
2   4    David   22
3   5      NaN   28
Outer Merged DataFrame:
   id     name   age
0   1    Alice   NaN
1   2      Bob  24.0
2   3  Charlie  30.0
3   4    David  22.0
4   5      NaN  28.0


In [None]:
# Concatenating DataFrames
# Concatenating along rows (axis=0, the default)
# This stacks the DataFrames on top of each other.
# Note: columns are aligned by name. A column that one frame lacks
# ('age' in df1, 'name' in df2) is filled with NaN for that frame's rows.
union_df = pd.concat([df1, df2], axis=0, ignore_index=True) # ignore_index=True resets the index, so no duplicate indices.
print(f'Concatenated DataFrame:\n{union_df}')

# Concatenating along columns (axis=1)
# This places DataFrames side by side.
# Note: rows are aligned by index label, not by the values in the 'id' column.
# Here df1 is concatenated with itself, so every row lines up, and the result
# carries duplicate column labels ('id' and 'name' each appear twice).
union_df_col = pd.concat([df1, df1], axis=1)
print(f'Concatenated DataFrame along columns:\n{union_df_col}')
# Note: frames with different numbers of rows can still be concatenated this way.
# With the default join='outer', index labels missing from one frame are filled
# with NaN in that frame's columns (use join='inner' to keep only shared labels).

Concatenated DataFrame:
   id     name   age
0   1    Alice   NaN
1   2      Bob   NaN
2   3  Charlie   NaN
3   4    David   NaN
4   2      NaN  24.0
5   3      NaN  30.0
6   4      NaN  22.0
7   5      NaN  28.0
Concatenated DataFrame along columns:
   id     name  id     name
0   1    Alice   1    Alice
1   2      Bob   2      Bob
2   3  Charlie   3  Charlie
3   4    David   4    David
