# ChatGPT Pandas GroupBy, Agg, Pivot_Table and Transform

## 📊 Exercise 1: Multiple aggregations on grouped data

Instruction:

Group transactions by customer_id and calculate both the total and average transaction amount per customer.

Data:

In [1]:
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1,1,2,2,2,3,3,4],
    "transaction_id": [101,102,201,202,203,301,302,401],
    "amount": [200,150,300,100,400,250,250,500]
})
print(df)


   customer_id  transaction_id  amount
0            1             101     200
1            1             102     150
2            2             201     300
3            2             202     100
4            2             203     400
5            3             301     250
6            3             302     250
7            4             401     500


In [2]:
group = df.groupby(df['customer_id']).agg(
    total = ("amount", "sum"),
    average = ("amount", "mean")
).reset_index()

group

Unnamed: 0,customer_id,total,average
0,1,350,175.0
1,2,800,266.666667
2,3,500,250.0
3,4,500,500.0


Exercise 1 solution

In [3]:
result = df.groupby("customer_id")["amount"].agg(["sum", "mean"])
print(result)


             sum        mean
customer_id                 
1            350  175.000000
2            800  266.666667
3            500  250.000000
4            500  500.000000


## 📊 Exercise 2: Grouping by multiple keys

Instruction:

Group transactions by both customer_id and transaction_type, and find the total amount.

Data:

In [8]:
df = pd.DataFrame({
    "customer_id": [1,1,2,2,2,3,3,4],
    "transaction_type": ["cash","card","card","cash","card","cash","cash","card"],
    "amount": [200,150,300,100,400,250,250,500]
})
df

Unnamed: 0,customer_id,transaction_type,amount
0,1,cash,200
1,1,card,150
2,2,card,300
3,2,cash,100
4,2,card,400
5,3,cash,250
6,3,cash,250
7,4,card,500


In [18]:
total_amount = df['amount'].groupby(['customer_id', 'transaction_type']).sum()
total_amount

KeyError: 'customer_id'

Exercise 2 solution

In [17]:
result = df.groupby(["customer_id","transaction_type"])["amount"].sum()
result

customer_id  transaction_type
1            card                150
             cash                200
2            card                700
             cash                100
3            cash                500
4            card                500
Name: amount, dtype: int64

## 📊 Exercise 3: Sorting grouped results

Instruction:

Find the total spend per customer, sorted by highest spender.

Data: (reuse from above)

Solution:

In [20]:
df = pd.DataFrame({
    "customer_id": [1,1,2,2,2,3,3,4],
    "transaction_type": ["cash","card","card","cash","card","cash","cash","card"],
    "amount": [200,150,300,100,400,250,250,500]
})
df

Unnamed: 0,customer_id,transaction_type,amount
0,1,cash,200
1,1,card,150
2,2,card,300
3,2,cash,100
4,2,card,400
5,3,cash,250
6,3,cash,250
7,4,card,500


In [24]:
total_spend = df.groupby(['customer_id'])['amount'].sum().sort_values(ascending=False)
total_spend

customer_id
2    800
3    500
4    500
1    350
Name: amount, dtype: int64

Exercise 3 solution

In [25]:
result = df.groupby("customer_id")["amount"].sum().sort_values(ascending=False)
print(result)


customer_id
2    800
3    500
4    500
1    350
Name: amount, dtype: int64


## 📊 Exercise 4: Handling missing values in groupby

Instruction:

Group by customer_id, but fill missing transaction amounts with 0 before summing.

Data:

In [26]:
df = pd.DataFrame({
    "customer_id": [1,1,2,3,3,4],
    "amount": [200,None,300,250,None,500]
})
df

Unnamed: 0,customer_id,amount
0,1,200.0
1,1,
2,2,300.0
3,3,250.0
4,3,
5,4,500.0


In [31]:
df.fillna(0)
group2 = df.groupby(['customer_id'])['amount'].fillna(0).sum()
group2

  group2 = df.groupby(['customer_id'])['amount'].fillna(0).sum()


Unnamed: 0,customer_id,amount
0,1,200.0
1,1,
2,2,300.0
3,3,250.0
4,3,
5,4,500.0


Exercise 4 solution

In [32]:
result = df.fillna({"amount":0}).groupby("customer_id")["amount"].sum()
print(result)


customer_id
1    200.0
2    300.0
3    250.0
4    500.0
Name: amount, dtype: float64


## 📊 Exercise 5: Using groupby().apply() with custom fraud flagging

Instruction:

Flag customers whose average transaction amount is above 300.

Data:

In [33]:
df = pd.DataFrame({
    "customer_id": [1,1,2,2,2,3,3,4],
    "amount": [200,150,300,100,400,250,250,500]
})
df

Unnamed: 0,customer_id,amount
0,1,200
1,1,150
2,2,300
3,2,100
4,2,400
5,3,250
6,3,250
7,4,500


In [35]:
flag = df.groupby(['customer_id'])['amount'].mean() > 300

In [36]:
flag

customer_id
1    False
2    False
3    False
4     True
Name: amount, dtype: bool

Exercise 5 solution

In [38]:
def flag_fraud(group):
    avg = group["amount"].mean()
    return pd.Series({"avg_amount": avg, "fraud_flag": avg > 300})

result = df.groupby("customer_id").apply(flag_fraud)
print(result)


             avg_amount  fraud_flag
customer_id                        
1            175.000000       False
2            266.666667       False
3            250.000000       False
4            500.000000        True


  result = df.groupby("customer_id").apply(flag_fraud)


## 📊 Exercise 6: Multi-aggregation with rename

Instruction:

For each customer_id, calculate total amount, average amount, and transaction count. Rename the columns clearly.

Data:

In [39]:
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1,1,2,2,2,3,4,4,4],
    "transaction_id": [101,102,201,202,203,301,401,402,403],
    "amount": [200,150,300,100,400,250,500,100,200],
    "transaction_type": ["card","cash","cash","card","cash","card","cash","card","cash"]
})

df

Unnamed: 0,customer_id,transaction_id,amount,transaction_type
0,1,101,200,card
1,1,102,150,cash
2,2,201,300,cash
3,2,202,100,card
4,2,203,400,cash
5,3,301,250,card
6,4,401,500,cash
7,4,402,100,card
8,4,403,200,cash


In [40]:
rename = df.groupby(['customer_id']).agg(
    total_amount = ('amount', 'sum'),
    average_amount = ('amount', 'mean'),
    txn_count = ('amount', 'count')
).reset_index()

rename

Unnamed: 0,customer_id,total_amount,average_amount,txn_count
0,1,350,175.0,2
1,2,800,266.666667,3
2,3,250,250.0,1
3,4,800,266.666667,3


Exercise 6 solution

In [47]:
result = df.groupby("customer_id").agg(
    total_amount=("amount","sum"),
    avg_amount=("amount","mean"),
    txn_count=("transaction_id","count")
).reset_index()
result

Unnamed: 0,customer_id,total_amount,avg_amount,txn_count
0,1,350,175.0,2
1,2,800,266.666667,3
2,3,250,250.0,1
3,4,800,266.666667,3


In [48]:
pivot = pd.pivot_table(df, values="amount", index="customer_id",
                       aggfunc=["sum","mean","count"]).reset_index()
pivot


Unnamed: 0_level_0,customer_id,sum,mean,count
Unnamed: 0_level_1,Unnamed: 1_level_1,amount,amount,amount
0,1,350,175.0,2
1,2,800,266.666667,3
2,3,250,250.0,1
3,4,800,266.666667,3


## 📊 Exercise 7: Multi-key grouping with multiple aggregations

Instruction:

Find average and maximum amount by customer_id and transaction_type.

Solution:

In [49]:
pivot = pd.pivot_table(df, values="amount", index=["customer_id", "transaction_type"],
                      aggfunc=["mean", "max"]).reset_index()
pivot

Unnamed: 0_level_0,customer_id,transaction_type,mean,max
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amount,amount
0,1,card,200.0,200
1,1,cash,150.0,150
2,2,card,100.0,100
3,2,cash,350.0,400
4,3,card,250.0,250
5,4,card,100.0,100
6,4,cash,350.0,500


In [50]:
exercise7 = df.groupby(['customer_id', 'transaction_type']).agg(
    average = ('amount', 'mean'),
    maximum = ('amount', 'max')
).reset_index()

exercise7

Unnamed: 0,customer_id,transaction_type,average,maximum
0,1,card,200.0,200
1,1,cash,150.0,150
2,2,card,100.0,100
3,2,cash,350.0,400
4,3,card,250.0,250
5,4,card,100.0,100
6,4,cash,350.0,500


Exercise 7 solution

In [52]:
result = df.groupby(["customer_id","transaction_type"]).agg(
    avg_amount=("amount","mean"),
    max_amount=("amount","max")
)
result


Unnamed: 0_level_0,Unnamed: 1_level_0,avg_amount,max_amount
customer_id,transaction_type,Unnamed: 2_level_1,Unnamed: 3_level_1
1,card,200.0,200
1,cash,150.0,150
2,card,100.0,100
2,cash,350.0,400
3,card,250.0,250
4,card,100.0,100
4,cash,350.0,500


In [53]:
pivot = pd.pivot_table(df, values="amount",
                       index="customer_id", columns="transaction_type",
                       aggfunc=["mean","max"])
pivot


Unnamed: 0_level_0,mean,mean,max,max
transaction_type,card,cash,card,cash
customer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,200.0,150.0,200.0,150.0
2,100.0,350.0,100.0,400.0
3,250.0,,250.0,
4,100.0,350.0,100.0,500.0


## 📊 Exercise 8: Aggregation with multiple functions on same column

Instruction:

For each customer, compute min, max, mean, and std deviation of amount.

In [57]:
exercise8 = pd.pivot_table(df, values='amount', index='customer_id',
                          aggfunc=['min', 'max', 'mean', 'std'], fill_value=0).reset_index()
exercise8

Unnamed: 0_level_0,customer_id,min,max,mean,std
Unnamed: 0_level_1,Unnamed: 1_level_1,amount,amount,amount,amount
0,1,150,200,175.0,35.355339
1,2,100,400,266.666667,152.752523
2,3,250,250,250.0,
3,4,100,500,266.666667,208.1666


In [60]:
exercise8 = df.groupby('customer_id').agg(
    mininum = ('amount', 'min'),
    maximum = ('amount', 'max'),
    mean = ('amount', 'mean'),
    standard_deviation = ('amount', 'std')
).reset_index()

exercise8

Unnamed: 0,customer_id,mininum,maximum,mean,standard_deviation
0,1,150,200,175.0,35.355339
1,2,100,400,266.666667,152.752523
2,3,250,250,250.0,
3,4,100,500,266.666667,208.1666


Exercise 8 solution

In [76]:
result = df.groupby("customer_id")["amount"].agg(["min","max","mean","std"])
result


Unnamed: 0_level_0,min,max,mean,std
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,150,200,175.0,35.355339
2,100,400,266.666667,152.752523
3,250,250,250.0,
4,100,500,266.666667,208.1666


In [63]:
pivot = pd.pivot_table(df, values="amount", index="customer_id",
                       aggfunc=["min","max","mean","std"])
pivot


Unnamed: 0_level_0,min,max,mean,std
Unnamed: 0_level_1,amount,amount,amount,amount
customer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,150,200,175.0,35.355339
2,100,400,266.666667,152.752523
3,250,250,250.0,
4,100,500,266.666667,208.1666


## 📊 Exercise 9: Aggregation with conditional logic

Instruction:

For each customer, calculate total amount but only for cash transactions.

In [66]:
exercise9 = pd.pivot_table(df[df['transaction_type'] == 'cash'], values='amount', index='customer_id',
                          aggfunc='sum', fill_value=0).reset_index()
exercise9

Unnamed: 0,customer_id,amount
0,1,150
1,2,700
2,4,700


Exercise 9 solution

In [69]:
result = df[df["transaction_type"]=="cash"].groupby("customer_id")["amount"].agg("sum")
result

customer_id
1    150
2    700
4    700
Name: amount, dtype: int64

In [68]:
pivot = pd.pivot_table(df[df["transaction_type"]=="cash"], 
                       values="amount", index="customer_id", aggfunc="sum")
pivot


Unnamed: 0_level_0,amount
customer_id,Unnamed: 1_level_1
1,150
2,700
4,700


## 📊 Exercise 10: Aggregation across multiple columns

Instruction:

For each customer, compute:

Total amount spent

Number of distinct transaction types

In [77]:
exercise10 = df.groupby('customer_id', as_index=False).agg(
    total_spent = ('amount', 'sum'),
    number = ('transaction_type', 'nunique')
).reset_index()

exercise10

Unnamed: 0,index,customer_id,total_spent,number
0,0,1,350,2
1,1,2,800,2
2,2,3,250,1
3,3,4,800,2


Exercise 10 solution

In [74]:
pivot = pd.pivot_table(df, values="amount", index="customer_id",
                       aggfunc="sum")
pivot

Unnamed: 0_level_0,amount
customer_id,Unnamed: 1_level_1
1,350
2,800
3,250
4,800


In [79]:
df = pd.DataFrame({
    "customer_id": [1,1,2,2,3],
    "amount": [200,300,100,400,250]
})
df

Unnamed: 0,customer_id,amount
0,1,200
1,1,300
2,2,100
3,2,400
4,3,250


In [82]:
df["avg_amount"] = df.groupby("customer_id")["amount"].mean()
df["diff_from_avg"] = df["amount"] - df["avg_amount"]
df

Unnamed: 0,customer_id,amount,avg_amount,diff_from_avg
0,1,200,,
1,1,300,250.0,50.0
2,2,100,250.0,-150.0
3,2,400,250.0,150.0
4,3,250,,


In [80]:
df["avg_amount"] = df.groupby("customer_id")["amount"].transform("mean")
df["diff_from_avg"] = df["amount"] - df["avg_amount"]
df


Unnamed: 0,customer_id,amount,avg_amount,diff_from_avg
0,1,200,250.0,-50.0
1,1,300,250.0,50.0
2,2,100,250.0,-150.0
3,2,400,250.0,150.0
4,3,250,250.0,0.0
