In [1]:
import pandas as pd

# Combining DataFrames

In [2]:
# Two small frames that share the default index (0, 1) but have different columns.
df1 = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame(data={'C': [5, 6], 'D': [7, 8]})

# display() renders each object it is given, in order.
display(df1, df2)

Unnamed: 0,A,B
0,1,3
1,2,4


Unnamed: 0,C,D
0,5,7
1,6,8


In [3]:
# axis='columns' (equivalent to axis=1) concatenates along the column axis:
# the columns of df1 and df2 end up side by side in the resulting DataFrame.
result = pd.concat(objs=[df1, df2], axis='columns')
display(result)

Unnamed: 0,A,B,C,D
0,1,3,5,7
1,2,4,6,8


In [4]:
# Both frames use the same index labels ('X', 'Y').
df1 = pd.DataFrame(data={'A': [1, 2]}, index=['X', 'Y'])
df2 = pd.DataFrame(data={'A': [3, 4]}, index=['X', 'Y'])
display(df1, df2)

# ignore_index=True drops the existing labels and gives the result a fresh
# default index, which prevents duplicate index labels when stacking frames.
result = pd.concat(objs=[df1, df2], ignore_index=True)
display(result)

Unnamed: 0,A
X,1
Y,2


Unnamed: 0,A
X,3
Y,4


Unnamed: 0,A
0,1
1,2
2,3
3,4


In [34]:

# Two frames with identical columns and identical default indices.
df1 = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame(data={'A': [5, 6], 'B': [7, 8]})
display(df1, df2)

# keys=... adds an outer index level ('df1' / 'df2') so every row of the
# concatenated result still shows which original DataFrame it came from.
result = pd.concat(objs=[df1, df2], keys=['df1', 'df2'])
display(result)

Unnamed: 0,A,B
0,1,3
1,2,4


Unnamed: 0,A,B
0,5,7
1,6,8


Unnamed: 0,Unnamed: 1,A,B
df1,0,1,3
df1,1,2,4
df2,0,5,7
df2,1,6,8


In [7]:
# The two frames have no index labels in common ('X', 'Y' vs 'Z', 'W').
df1 = pd.DataFrame(data={'A': [1, 2]}, index=['X', 'Y'])
df2 = pd.DataFrame(data={'A': [3, 4]}, index=['Z', 'W'])
display(df1, df2)

# Column-wise concat aligns rows on the index, so a label that exists in only
# one frame gets NaN in the other frame's column. keys=... adds an outer column
# level naming the source frame of each 'A' column.
result = pd.concat(objs=[df1, df2], axis='columns', keys=['df1', 'df2'])
display(result)


Unnamed: 0,A
X,1
Y,2


Unnamed: 0,A
Z,3
W,4


Unnamed: 0_level_0,df1,df2
Unnamed: 0_level_1,A,A
X,1.0,
Y,2.0,
Z,,3.0
W,,4.0


In [8]:

# Two sample frames sharing the key column 'ID' (IDs 2 and 3 occur in both).
df1 = pd.DataFrame(data={'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
df2 = pd.DataFrame(data={'ID': [2, 3, 4], 'Age': [25, 30, 22]})
display(df1, df2)

# Inner join (the default for merge): keep only the IDs present in both frames.
result_inner = df1.merge(df2, how='inner', on='ID')
display(result_inner)



Unnamed: 0,ID,Name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,ID,Age
0,2,25
1,3,30
2,4,22


Unnamed: 0,ID,Name,Age
0,2,Bob,25
1,3,Charlie,30


In [9]:
# Left join: keep every row of df1; IDs with no match in df2 get NaN for 'Age'.
result_left = df1.merge(df2, how='left', on='ID')
display(result_left)


Unnamed: 0,ID,Name,Age
0,1,Alice,
1,2,Bob,25.0
2,3,Charlie,30.0


In [10]:
# Outer join: keep the IDs of both frames; values missing on either side become NaN.
result_outer = df1.merge(df2, how='outer', on='ID')
display(result_outer)

Unnamed: 0,ID,Name,Age
0,1,Alice,
1,2,Bob,25.0
2,3,Charlie,30.0
3,4,,22.0


# Functions

In [39]:

# Small numeric frame used to demonstrate applying functions
df = pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]})
df


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [40]:
# Custom function to be applied element-wise
def square(x):
    """Return x raised to the power of 2."""
    return pow(x, 2)


In [41]:

# Series.apply calls square once for each element of column 'A';
# the results are stored in the new column 'A_squared'.
df['A_squared'] = df['A'].apply(func=square)

# Show the frame with the added column
display(df)

Unnamed: 0,A,B,A_squared
0,1,4,1
1,2,5,4
2,3,6,9


In [43]:
# Frame with a single text column of fruit names
df = pd.DataFrame(data={'A': ['apple', 'banana', 'cherry']})
df

Unnamed: 0,A
0,apple
1,banana
2,cherry


In [44]:
# Lookup table from fruit name to color (there is no entry for 'cherry')
fruit_colors = {
    'apple': 'red',
    'banana': 'yellow',
    'test': 'red',
}

# Series.map looks up every value of column 'A' in the dictionary and stores
# the matches in the new column 'Color'; values without a key become NaN.
df['Color'] = df['A'].map(fruit_colors)
df

Unnamed: 0,A,Color
0,apple,red
1,banana,yellow
2,cherry,


In [16]:
# Small numeric frame for the element-wise example
df = pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df)


# Custom function to be applied to every cell of a DataFrame
def double(x):
    """Return x multiplied by 2."""
    doubled = x * 2
    return doubled

# Apply the function element-wise to the entire DataFrame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is
# deprecated from that version on, so prefer map and only fall back to
# applymap on older pandas. The check is made on the class (not on df) because
# attribute access on an instance can also resolve to a column name.
if hasattr(pd.DataFrame, 'map'):
    df_doubled = df.map(double)
else:
    df_doubled = df.applymap(double)

# Display the result
display(df_doubled)

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


Unnamed: 0,A,B
0,2,8
1,4,10
2,6,12


In [45]:
# Small numeric frame, shown before the change
df = pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df)

# `lambda value: value**2` is an anonymous function that squares its argument.
# Series.apply calls it once for each element of column 'A', and the squared
# values overwrite the original column.
df['A'] = df['A'].apply(func=lambda value: value**2)

# Show the frame after the change
display(df)

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


Unnamed: 0,A,B
0,1,4
1,4,5
2,9,6


In [46]:
# Frame with a single text column of fruit names
df = pd.DataFrame(data={'A': ['apple', 'banana', 'cherry']})
display(df)

# The lambda returns the number of characters of each fruit name;
# the lengths are stored in the new column 'Name_Length'.
df['Name_Length'] = df['A'].apply(func=lambda fruit: len(fruit))

# Show the frame with the added column
display(df)

Unnamed: 0,A
0,apple
1,banana
2,cherry


Unnamed: 0,A,Name_Length
0,apple,5
1,banana,6
2,cherry,6


# Grouping

In [19]:

# Sample data: six rows alternating between categories 'A' and 'B'
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Value': [10, 15, 20, 25, 30, 35],
}
df = pd.DataFrame(data=data)
display(df)

# Split the rows by 'Category' and take the mean of 'Value' within each group
grouped_df = df.groupby(by='Category').mean()

display(grouped_df)


Unnamed: 0,Category,Value
0,A,10
1,B,15
2,A,20
3,B,25
4,A,30
5,B,35


Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
A,20.0
B,25.0


In [20]:
# Group by 'Category' and calculate the sum for each group
grouped_df_sum = df.groupby('Category').sum()

display(grouped_df_sum)

Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
A,60
B,75


In [48]:

# Sample data with two grouping columns ('Category' and 'Type')
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Type': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
    'Value': [10, 15, 20, 25, 30, 35],
}
df = pd.DataFrame(data=data)
display(df)

# Passing a list of columns forms one group per distinct (Category, Type)
# pair; 'Value' is then summed within each pair.
grouped_df = df.groupby(by=['Category', 'Type']).sum()

display(grouped_df)


Unnamed: 0,Category,Type,Value
0,A,X,10
1,B,Y,15
2,A,X,20
3,B,Y,25
4,A,X,30
5,B,Y,35


Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Category,Type,Unnamed: 2_level_1
A,X,60
B,Y,75


In [22]:

# Sample data: six rows alternating between categories 'A' and 'B'
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Value': [10, 15, 20, 25, 30, 35],
}
df = pd.DataFrame(data=data)
display(df)

# agg() with a {column: [functions]} mapping computes several statistics at
# once: here both the mean and the sum of 'Value' for every category.
grouped_df = (
    df.groupby(by='Category')
      .agg({'Value': ['mean', 'sum']})
)

display(grouped_df)


Unnamed: 0,Category,Value
0,A,10
1,B,15
2,A,20
3,B,25
4,A,30
5,B,35


Unnamed: 0_level_0,Value,Value
Unnamed: 0_level_1,mean,sum
Category,Unnamed: 1_level_2,Unnamed: 2_level_2
A,20.0,60
B,25.0,75


In [23]:
# Create a sample DataFrame
data = {'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
        'Value': [10, 15, 20, 25, 30, 35]}
df = pd.DataFrame(data)
display(df)


# Define a function to calculate Z-scores
def z_score(x):
    """Standardize a group of values: (x - mean of x) / standard deviation of x.

    x is the Series of one group's values handed over by transform().
    Series.std() uses the sample standard deviation (ddof=1) by default.
    """
    return (x - x.mean()) / x.std()


# Use transform to calculate Z-scores within each group. transform returns a
# result aligned with the original rows, so it can be assigned as a new column.
df['Z_Score'] = df.groupby('Category')['Value'].transform(z_score)

display(df)

Unnamed: 0,Category,Value
0,A,10
1,B,15
2,A,20
3,B,25
4,A,30
5,B,35


Unnamed: 0,Category,Value,Z_Score
0,A,10,-1.0
1,B,15,-1.0
2,A,20,0.0
3,B,25,0.0
4,A,30,1.0
5,B,35,1.0


In [24]:

# Four rows in two groups ('A' and 'B')
df = pd.DataFrame(data={'Group': ['A', 'A', 'B', 'B'], 'Values': [1, 2, 3, 4]})
display(df)

# transform('mean') computes the mean of 'Values' per group and repeats it on
# every row of that group, so the result has the same length as the frame.
df['Group_Mean'] = df.groupby(by='Group')['Values'].transform('mean')

# Show the frame with the added column
display(df)


Unnamed: 0,Group,Values
0,A,1
1,A,2
2,B,3
3,B,4


Unnamed: 0,Group,Values,Group_Mean
0,A,1,1.5
1,A,2,1.5
2,B,3,3.5
3,B,4,3.5


# Examples

In [50]:

# Example frames sharing the key column 'ID'; only ID 3 occurs in both.
df1 = pd.DataFrame(data={'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
df2 = pd.DataFrame(data={'ID': [3, 4, 5], 'Age': [25, 30, 22]})

display(df1, df2)

Unnamed: 0,ID,Name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,ID,Age
0,3,25
1,4,30
2,5,22


In [51]:
# Outer join on 'ID': keep the IDs of both frames and fill the gaps with NaN.
result_df = df1.merge(df2, on='ID', how='outer')
display(result_df)

Unnamed: 0,ID,Name,Age
0,1,Alice,
1,2,Bob,
2,3,Charlie,25.0
3,4,,30.0
4,5,,22.0


In [54]:
# Two frames with the same default index but different columns
df3 = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
df4 = pd.DataFrame(data={'C': [5, 6], 'D': [7, 8]})

display(df3, df4)

# Place the columns of df3 and df4 side by side (axis='columns' is axis=1).
result = pd.concat(objs=[df3, df4], axis='columns')
result

Unnamed: 0,A,B
0,1,3
1,2,4


Unnamed: 0,C,D
0,5,7
1,6,8


Unnamed: 0,A,B,C,D
0,1,3,5,7
1,2,4,6,8


In [27]:
# Two frames with the same columns ('ID', 'Name') and disjoint IDs
df1 = pd.DataFrame(data={'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
df2 = pd.DataFrame(data={'ID': [4, 5, 6], 'Name': ['David', 'Eve', 'Frank']})

display(df1, df2)

Unnamed: 0,ID,Name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,ID,Name
0,4,David
1,5,Eve
2,6,Frank


In [56]:
# Stack df1 and df2 vertically (axis=0 appends the rows of df2 below df1).
# This example expects both frames to have the same columns. If they differ,
# df1/df2 were most likely overwritten by another cell run out of order, so
# fail loudly instead of silently producing NaN-padded columns.
assert list(df1.columns) == list(df2.columns), (
    "df1 and df2 have different columns; re-run the cell that defines them"
)
result = pd.concat([df1, df2], axis=0)
result

Unnamed: 0,ID,Name
0,1,Alice
1,2,Bob
2,3,Charlie
0,4,David
1,5,Eve
2,6,Frank


In [58]:
# Frame with a single numeric column 'Values'
df1 = pd.DataFrame(data={'Values': [10, 15, 20, 25, 30]})
df1

Unnamed: 0,Values
0,10
1,15
2,20
3,25
4,30


In [59]:
# Double every element of 'Values' with a lambda and store the results
# in the new column 'Double_Values'.
df1['Double_Values'] = df1['Values'].apply(func=lambda value: value*2)
df1

Unnamed: 0,Values,Double_Values
0,10,20
1,15,30
2,20,40
3,25,50
4,30,60


In [60]:
# Frame with a single numeric column 'Scores'
df2 = pd.DataFrame(data={'Scores': [90, 60, 40, 75, 55]})
df2

Unnamed: 0,Scores
0,90
1,60
2,40
3,75
4,55


In [61]:
def categorize_scores(score, threshold=50):
    """Label a score as 'High' or 'Low'.

    Parameters
    ----------
    score : number
        The score to classify.
    threshold : number, default 50
        Smallest score (inclusive) that is labelled 'High'.

    Returns
    -------
    str
        'High' if score >= threshold, otherwise 'Low'.
    """
    if score >= threshold:
        return 'High'
    return 'Low'



In [63]:
# Classify every score with categorize_scores and store the labels
# in the new column 'Score_Category'.
df2['Score_Category'] = df2['Scores'].apply(func=categorize_scores)

df2

Unnamed: 0,Scores,Score_Category
0,90,High
1,60,High
2,40,Low
3,75,High
4,55,High


In [30]:
# Sales figures for two products, three records each
sales_data = pd.DataFrame(
    data={
        'Product': ['A', 'B', 'A', 'B', 'A', 'B'],
        'Sales': [100, 150, 120, 200, 180, 220],
    }
)
sales_data

Unnamed: 0,Product,Sales
0,A,100
1,B,150
2,A,120
3,B,200
4,A,180
5,B,220


In [64]:
# Average 'Sales' per product
sales_data.groupby(by='Product').mean()

Unnamed: 0_level_0,Sales
Product,Unnamed: 1_level_1
A,133.333333
B,190.0
