In [42]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')
pd.set_option('display.max_columns', None)
print(df)

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  alo

In [2]:
def add_10(x):
    return x + 10

df['age_plus_10'] = df['age'].apply(add_10)
print(df[['age', 'age_plus_10']])

      age  age_plus_10
0    22.0         32.0
1    38.0         48.0
2    26.0         36.0
3    35.0         45.0
4    35.0         45.0
..    ...          ...
886  27.0         37.0
887  19.0         29.0
888   NaN          NaN
889  26.0         36.0
890  32.0         42.0

[891 rows x 2 columns]


In [4]:
df['age_mul_5'] = df['age'].apply(lambda x: x * 5)
print(df[['age', 'age_mul_5']])

      age  age_mul_5
0    22.0      110.0
1    38.0      190.0
2    26.0      130.0
3    35.0      175.0
4    35.0      175.0
..    ...        ...
886  27.0      135.0
887  19.0       95.0
888   NaN        NaN
889  26.0      130.0
890  32.0      160.0

[891 rows x 2 columns]


In [None]:
def max_min_diff(x):
    return x.max() - x.min()

diff = df[['age', 'fare']].apply(max_min_diff)
print(diff)

age      79.5800
fare    512.3292
dtype: float64


In [None]:
def calculate_stats(x):
    return pd.Series({
        'max': x.max(),
        'min': x.min(),
        'mean': x.mean(),
        'median': x.median()
    })

# 각 열에 대해 함수 적용 (axis=0)
col_stats = df.select_dtypes(include=['number']).apply(calculate_stats, axis=0)
print(col_stats)

# 각 행별로 최댓값과 최솟값의 차이 계산 (axis=1)
row_diff = df.select_dtypes(include=['number']).apply(lambda x: x.max() - x.min(), axis=1)
print(row_diff)


각 열의 통계량:
        survived    pclass        age     sibsp     parch        fare  \
max     1.000000  3.000000  80.000000  8.000000  6.000000  512.329200   
min     0.000000  1.000000   0.420000  0.000000  0.000000    0.000000   
mean    0.383838  2.308642  29.699118  0.523008  0.381594   32.204208   
median  0.000000  3.000000  28.000000  0.000000  0.000000   14.454200   

        age_plus_10   age_mul_5  
max       90.000000  400.000000  
min       10.420000    2.100000  
mean      39.699118  148.495588  
median    38.000000  140.000000  
각 행별 최댓값과 최솟값의 차이:
0      110.00
1      190.00
2      130.00
3      175.00
4      175.00
        ...  
886    135.00
887     95.00
888     23.45
889    130.00
890    160.00
Length: 891, dtype: float64


In [18]:
mean_filter = df.select_dtypes(include=['number']).loc[:, df.select_dtypes(include=['number']).mean() > 30]
mean_filter

Unnamed: 0,fare,age_plus_10,age_mul_5
0,7.2500,32.0,110.0
1,71.2833,48.0,190.0
2,7.9250,36.0,130.0
3,53.1000,45.0,175.0
4,8.0500,45.0,175.0
...,...,...,...
886,13.0000,37.0,135.0
887,30.0000,29.0,95.0
888,23.4500,,
889,30.0000,36.0,130.0


In [21]:
mean_filter['Level'] = mean_filter.mean(axis=1).apply(lambda x: 'High' if x > 50 else 'Low')

In [22]:
mean_filter['Level'].unique

<bound method Series.unique of 0       Low
1      High
2      High
3      High
4      High
       ... 
886    High
887    High
888     Low
889    High
890    High
Name: Level, Length: 891, dtype: object>

In [31]:
# 1. 결측치를 중앙값으로 대체하는 함수 생성 및 pipe 적용
def fillna_with_median(df):
    # 수치형 컬럼만 중앙값으로 결측치 대체
    num_cols = df.select_dtypes(include='number').columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    return df

# 2. 총 결측치 개수를 반환하는 함수 생성 및 체이닝
def count_total_missing(df):
    return df.isnull().sum().sum()

# 3. 문자열 데이터의 첫 글자를 추출 및 확인
def extract_first_char(df):
    for col in df.select_dtypes(include='object'):
        df[col + '_first'] = df[col].astype(str).str[0]
    return df

def check_first_chars(df):
    for col in df.columns:
        if col.endswith('_first'):
            print(f"{col} :", df[col].unique())
    return df

# pipe를 사용하여 결측치 대체 후 결측치 개수 확인
df_filled = df.pipe(fillna_with_median)
missing_count = df_filled.pipe(count_total_missing)
print("총 결측치 개수(중앙값 대체 후):", missing_count)

# 결측치가 대체된 데이터프레임에 대해 pipe 연속 적용
df_filled.pipe(extract_first_char).pipe(check_first_chars)


총 결측치 개수(중앙값 대체 후): 692
sex_first : ['m' 'f']
embarked_first : ['S' 'C' 'Q' 'n']
who_first : ['m' 'w' 'c']
embark_town_first : ['S' 'C' 'Q' 'n']
alive_first : ['n' 'y']


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,sex_first,embarked_first,who_first,embark_town_first,alive_first
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,m,S,m,S,n
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,f,C,w,C,y
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,f,S,w,S,y
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,f,S,w,S,y
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,m,S,m,S,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,m,S,m,S,n
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,f,S,w,S,y
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,f,S,w,S,n
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,m,C,m,C,y


In [None]:
# 1. 열 이름 알파벳 순 정렬
df_alpha = df[sorted(df.columns)]

# 2. 열 역순 정렬 후, 사용자 정의 순서로 재정렬
df_reverse = df[df.columns[::-1]]
df_custom = df_reverse[['survived', 'sex', 'pclass', 'age']]

In [60]:
grouped = df.groupby('class').agg(
    count=('age', 'count'),
    mean_age=('age', 'mean'),
    mean_fare=('fare', 'mean')
)

print(grouped)

        count   mean_age  mean_fare
class                              
First     186  38.233441  84.154687
Second    173  29.877630  20.662183
Third     355  25.140620  13.675550


  grouped = df.groupby('class').agg(


In [62]:
grouped_survived = df.groupby(['class', 'sex'])['survived'].mean()
grouped_survived

  grouped_survived = df.groupby(['class', 'sex'])['survived'].mean()


class   sex   
First   female    0.968085
        male      0.368852
Second  female    0.921053
        male      0.157407
Third   female    0.500000
        male      0.135447
Name: survived, dtype: float64

In [64]:
grouped2 = df.groupby('class').agg(
    mean_age=('age', 'mean'),
    std_age=('age', 'std'),
    max_fare=('fare', 'max'),
    min_fare=('fare', 'min')
)
grouped2

  grouped2 = df.groupby('class').agg(


Unnamed: 0_level_0,mean_age,std_age,max_fare,min_fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,38.233441,14.802856,512.3292,0.0
Second,29.87763,14.001077,73.5,0.0
Third,25.14062,12.495398,69.55,0.0


In [69]:
grouped3 = df.groupby(['class'], observed=True)
grouped3['fare'].cumsum()
df['fare_cumsum']=grouped3['fare'].cumsum()
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,fare_cumsum
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,7.2500
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,71.2833
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,15.1750
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,124.3833
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,23.2250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,3801.8417
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,18147.4125
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,6706.9451
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,18177.4125


In [70]:
df['age_zscore'] = df.groupby('class')['age'].transform(lambda x : (x - x.mean()) / x.std())
df

  df['age_zscore'] = df.groupby('class')['age'].transform(lambda x : (x - x.mean()) / x.std())


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,fare_cumsum,age_zscore
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,7.2500,-0.251342
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,71.2833,-0.015770
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,15.1750,0.068776
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,124.3833,-0.218434
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,23.2250,0.789041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,3801.8417,-0.205529
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,18147.4125,-1.299306
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,6706.9451,
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,18177.4125,-0.826424


In [None]:
filter_count = df.groupby('class').filter(lambda x : len(x) >= 200)
filter_count['class'].unique()

  filter_count = df.groupby('class').filter(lambda x : len(x) >= 200)


['Third', 'First']
Categories (3, object): ['First', 'Second', 'Third']

In [85]:
mul_index_df = df.groupby(['class', 'sex'])['age'].mean().round(2)
first_female_age_avg = mul_index_df.loc[('First', 'female')]
print('First Class Female Age Avg : ',first_female_age_avg)
male_group = mul_index_df.xs('male', level='sex')
print('\nMale Group\n',male_group)

First Class Female Age Avg :  34.61

Male Group
 class
First     41.28
Second    30.74
Third     26.51
Name: age, dtype: float64


  mul_index_df = df.groupby(['class', 'sex'])['age'].mean().round(2)


In [86]:
pivot_table = df.pivot_table(
    index='class',
    columns='sex',
    values='age',
    aggfunc='mean'
)
print(pivot_table)

sex        female       male
class                       
First   34.611765  41.281386
Second  28.722973  30.740707
Third   21.750000  26.507589


  pivot_table = df.pivot_table(


In [87]:
stacked = pivot_table.stack()
unstacked = pivot_table.unstack()
print('stacked\n', stacked)
print('\nunstacked\n', unstacked)

stacked
 class   sex   
First   female    34.611765
        male      41.281386
Second  female    28.722973
        male      30.740707
Third   female    21.750000
        male      26.507589
dtype: float64

unstacked
 sex     class 
female  First     34.611765
        Second    28.722973
        Third     21.750000
male    First     41.281386
        Second    30.740707
        Third     26.507589
dtype: float64


In [91]:
melted = df.melt(
    id_vars='class',
    value_vars=['sex', 'age'],
    var_name='type',
    value_name='value'
)
print('melted\n', melted)

melted
        class type   value
0      Third  sex    male
1      First  sex  female
2      Third  sex  female
3      First  sex  female
4      Third  sex    male
...      ...  ...     ...
1777  Second  age    27.0
1778   First  age    19.0
1779   Third  age     NaN
1780   First  age    26.0
1781   Third  age    32.0

[1782 rows x 3 columns]
