In [1]:
import pandas as pd
import warnings
# Ignore every warning raised from this point on, regardless of category or module.
# NOTE(review): this also hides pandas deprecation notices (e.g. any raised by the
# groupby calls in this notebook) — consider narrowing the filter; confirm intent.
warnings.filterwarnings('ignore')

In [2]:
# Sample employee records, one (Name, Age, Salary) tuple per row.
employee_rows = [
    ('John', 28, 50000),
    ('Anna', 24, 60000),
    ('Peter', 35, 55000),
    ('Linda', 32, 65000),
]
df = pd.DataFrame(employee_rows, columns=['Name', 'Age', 'Salary'])

# Label rows with Age >= 30 as 'Senior' and every other row as 'Junior'.
is_senior = df['Age'] >= 30
df['Category'] = is_senior.map({True: 'Senior', False: 'Junior'})
print(df)

    Name  Age  Salary Category
0   John   28   50000   Junior
1   Anna   24   60000   Junior
2  Peter   35   55000   Senior
3  Linda   32   65000   Senior


In [3]:
# Split the frame by Category and print each group's rows beneath its label.
grouped = df.groupby('Category')
for name, group in grouped:
    # One print call: the label line, a newline separator, then the group's rows.
    print(f"\nGroup: {name}", group, sep="\n")


Group: Junior
   Name  Age  Salary Category
0  John   28   50000   Junior
1  Anna   24   60000   Junior

Group: Senior
    Name  Age  Salary Category
2  Peter   35   55000   Senior
3  Linda   32   65000   Senior


In [4]:
# Show all rows arranged by group: each group's rows are stacked under its
# Category label, which becomes the outer index level (the original row labels
# stay as the inner level and the Category column is kept).
#
# pd.concat over the (label, rows) pairs is used instead of
# `grouped.apply(lambda x: x)`: the identity apply relies on apply() handing the
# grouping column to the function, which pandas has deprecated, so its result
# is not stable across pandas versions.
df_grouped = pd.concat(
    {label: rows for label, rows in grouped},
    names=['Category'],
)
print(df_grouped)

             Name  Age  Salary Category
Category                               
Junior   0   John   28   50000   Junior
         1   Anna   24   60000   Junior
Senior   2  Peter   35   55000   Senior
         3  Linda   32   65000   Senior


In [5]:
# Total Salary per Category; as_index=False keeps Category as a regular column
# so the result is a flat two-column frame.
df_grouped = df.groupby('Category', as_index=False)['Salary'].sum()
print(df_grouped)

  Category  Salary
0   Junior  110000
1   Senior  120000


그룹화 후 여러 함수로 집계

In [6]:
# Per Category: the mean of Age and the sum of Salary, with Category kept as a
# regular column.
aggregations = {'Age': 'mean', 'Salary': 'sum'}
df_grouped = df.groupby('Category', as_index=False).agg(aggregations)
print(df_grouped)

  Category   Age  Salary
0   Junior  26.0  110000
1   Senior  33.5  120000


그룹별 사용자 정의 함수 적용

In [7]:
# Per-group summary: the maximum Salary plus a derived figure combining Age and Salary.
def custom_agg(group: pd.DataFrame) -> pd.Series:
    """Summarise one group of rows.

    Args:
        group: Rows of a single group; the 'Age' and 'Salary' columns are read.

    Returns:
        A Series with two entries:
        'Max Salary' - the largest Salary in the group.
        'Age Ratio'  - mean Age divided by mean Salary.
        Both values live in one Series, so an integer maximum is stored
        alongside the float ratio as a float.
    """
    salaries = group['Salary']
    mean_age = group['Age'].mean()
    summary = {
        'Max Salary': salaries.max(),
        'Age Ratio': mean_age / salaries.mean(),
    }
    return pd.Series(summary)

# Run custom_agg once per Category and flatten the result into regular columns.
# Age and Salary are selected explicitly so each group passed to custom_agg holds
# only the columns it reads; letting apply() operate on the grouping column
# itself is deprecated in pandas.
df_grouped = (
    df.groupby('Category')[['Age', 'Salary']]
    .apply(custom_agg)
    .reset_index()
)
print(df_grouped)


  Category  Max Salary  Age Ratio
0   Junior     60000.0   0.000473
1   Senior     65000.0   0.000558


그룹별 필터링

In [8]:
# Keep only the rows of groups whose mean Age is above 30; rows of every other
# group are dropped.
by_category = df.groupby('Category')
df_filtered = by_category.filter(lambda members: members['Age'].mean() > 30)
print(df_filtered)


    Name  Age  Salary Category
2  Peter   35   55000   Senior
3  Linda   32   65000   Senior


In [9]:
# Compute the mean Salary of each Category and write it onto every row of that
# Category as a new column of the original frame.
salary_by_category = df.groupby('Category')['Salary']
df['Avg Salary by Category'] = salary_by_category.transform('mean')
print(df)

    Name  Age  Salary Category  Avg Salary by Category
0   John   28   50000   Junior                 55000.0
1   Anna   24   60000   Junior                 55000.0
2  Peter   35   55000   Senior                 60000.0
3  Linda   32   65000   Senior                 60000.0


In [10]:
# Rank each row's Salary within its Category; ascending=False makes rank 1 the
# highest salary of the group.
salary_by_category = df.groupby('Category')['Salary']
df['Salary Rank in Category'] = salary_by_category.rank(ascending=False)
print(df)

    Name  Age  Salary Category  Avg Salary by Category  \
0   John   28   50000   Junior                 55000.0   
1   Anna   24   60000   Junior                 55000.0   
2  Peter   35   55000   Senior                 60000.0   
3  Linda   32   65000   Senior                 60000.0   

   Salary Rank in Category  
0                      2.0  
1                      1.0  
2                      2.0  
3                      1.0  
