In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# 10.1 HOw to Think About Grouping Data and Operations

df = pd.DataFrame({'key1' : ['a', 'a', None, 'b', 'b', 'a', 
                            None],
                   'key2' : pd.Series([1, 2, 1, 2, 1, None, 1],
                                       dtype='Int64'),
                   'data1' : np.random.standard_normal(7),
                   'data2' : np.random.standard_normal(7)}
                   ).set_index(['key1', 'key2'])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,-0.585981,0.205379
a,2.0,0.539226,0.081889
,1.0,-0.763522,0.768955
b,2.0,0.322188,-0.556899
b,1.0,-0.189294,-0.576052
a,,-1.503336,-0.667041
,1.0,1.074801,1.196778


In [3]:
grouped = df['data1'].groupby(df.index.get_level_values('key1'))

grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000281F4550B60>

In [4]:
grouped.mean()

key1
a   -0.516697
b    0.066447
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df.index.get_level_values('key1'),
                             df.index.get_level_values('key2')]).mean()
means

key1  key2
a     1      -0.585981
      2       0.539226
b     1      -0.189294
      2       0.322188
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.585981,0.539226
b,-0.189294,0.322188


In [7]:
states = np.array(['Chiori','Yae Miko','Yae Miko','Chiori',
                  'Chiori','Yae Miko','Chiori']
                  )
years = [2024, 2023, 2024, 2024, 2025, 2024, 2025]

df['data1'].groupby([states, years]).mean()

Chiori    2024   -0.131897
          2025    0.442754
Yae Miko  2023    0.539226
          2024   -1.133429
Name: data1, dtype: float64

In [8]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.516697,-0.126591
b,0.066447,-0.566476


In [9]:
df.groupby('key2').mean()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.115999,0.398765
2,0.430707,-0.237505


In [10]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.585981,0.205379
a,2,0.539226,0.081889
b,1,-0.189294,-0.576052
b,2,0.322188,-0.556899


In [11]:
df.groupby([states, years]).mean()

Unnamed: 0,Unnamed: 1,data1,data2
Chiori,2024,-0.131897,-0.17576
Chiori,2025,0.442754,0.310363
Yae Miko,2023,0.539226,0.081889
Yae Miko,2024,-1.133429,0.050957


In [12]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [13]:
df.groupby('key1', dropna=False ).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [14]:
df.groupby(['key1', 'key2'], dropna=False ).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [15]:
df.groupby('key1').count()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,3
b,2,2


In [16]:
df.groupby('key2').count()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,4
2,2,2


In [17]:
df.groupby(['key1', 'key2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,1
a,2,1,1
b,1,1,1
b,2,1,1


In [18]:
# Iteration over groups
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
              data1     data2
key1 key2                    
a    1    -0.585981  0.205379
     2     0.539226  0.081889
     <NA> -1.503336 -0.667041
b
              data1     data2
key1 key2                    
b    2     0.322188 -0.556899
     1    -0.189294 -0.576052


In [19]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', np.int64(1))
              data1     data2
key1 key2                    
a    1    -0.585981  0.205379
('a', np.int64(2))
              data1     data2
key1 key2                    
a    2     0.539226  0.081889
('b', np.int64(1))
              data1     data2
key1 key2                    
b    1    -0.189294 -0.576052
('b', np.int64(2))
              data1     data2
key1 key2                    
b    2     0.322188 -0.556899


In [20]:
pieces = {name: group for name, group in df.groupby('key1')}

pieces['b']

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
b,2,0.322188,-0.556899
b,1,-0.189294,-0.576052


In [21]:
grouped = df.groupby({'key1': 'key', 
                      'key2': 'key',
                      'data1': 'value',
                      'data2': 'value'}, axis='columns')


  grouped = df.groupby({'key1': 'key',


In [22]:
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

value
              data1     data2
key1 key2                    
a    1    -0.585981  0.205379
     2     0.539226  0.081889
NaN  1    -0.763522  0.768955
b    2     0.322188 -0.556899
     1    -0.189294 -0.576052
a    <NA> -1.503336 -0.667041
NaN  1     1.074801  1.196778


In [23]:
# Selection of columns
# Subset of columns
grouped = df.groupby(['key1', 'key2'])[['data1', 'data2']]
grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.585981,0.205379
a,2,0.539226,0.081889
b,1,-0.189294,-0.576052
b,2,0.322188,-0.556899


In [24]:
df.groupby('key1')['data1']
df.groupby('key1')['data2']  

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000281A7DB8EC0>

In [25]:
df['data1'].groupby(df.index.get_level_values('key1'))
df[['data2']].groupby(df.index.get_level_values('key1'))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000281A7DBAD80>

In [26]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.205379
a,2,0.081889
b,1,-0.576052
b,2,-0.556899


In [27]:
s_grouped = df.groupby(['key1', 'key2'])[['data2']]

s_grouped 

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000281A7DBA870>

In [28]:
s_grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.205379
a,2,0.081889
b,1,-0.576052
b,2,-0.556899


In [29]:
# Grouping with Dictionaries and Series
wifes = pd.DataFrame(np.random.standard_normal ((5, 5)),
                     columns=['istri pertama', 'istri kedua',
                              'istri ketiga', 'istri keempat', 'istri kelima'],
                     index = ['Ning', 'Ei', 'Navia', 'Chiori','Yae Miko'])

In [30]:
wifes.iloc[2:3, [1,2]] = np.nan # add NaN values for testing

wifes

Unnamed: 0,istri pertama,istri kedua,istri ketiga,istri keempat,istri kelima
Ning,-1.414274,1.277294,0.332248,-0.166088,-0.194009
Ei,-0.577407,-2.675079,-1.182742,-1.850849,0.480561
Navia,1.254306,,,0.054553,0.760515
Chiori,0.305259,-0.27973,0.344075,-0.916953,-1.337501
Yae Miko,-1.244987,-0.405918,0.684434,-0.133024,0.4876


In [31]:
mapping = {'istri pertama': 'golden',
           'istri kedua'  : 'thunder',
           'istri ketiga' : 'platinum',
           'istri keempat': 'diamond',
           'istri kelima' : 'emerald'}

wifes.rename(columns=mapping)

Unnamed: 0,golden,thunder,platinum,diamond,emerald
Ning,-1.414274,1.277294,0.332248,-0.166088,-0.194009
Ei,-0.577407,-2.675079,-1.182742,-1.850849,0.480561
Navia,1.254306,,,0.054553,0.760515
Chiori,0.305259,-0.27973,0.344075,-0.916953,-1.337501
Yae Miko,-1.244987,-0.405918,0.684434,-0.133024,0.4876


In [32]:
mapping1 = {'istri pertama': 'golden',
            'istri kedua'  : 'golden',
            'istri ketiga' : 'emerald',
            'istri keempat': 'emerald',
            'istri kelima' : 'golden',
            'istri keenam' : 'diamond'}
wifes.rename(columns=mapping1)

Unnamed: 0,golden,golden.1,emerald,emerald.1,golden.2
Ning,-1.414274,1.277294,0.332248,-0.166088,-0.194009
Ei,-0.577407,-2.675079,-1.182742,-1.850849,0.480561
Navia,1.254306,,,0.054553,0.760515
Chiori,0.305259,-0.27973,0.344075,-0.916953,-1.337501
Yae Miko,-1.244987,-0.405918,0.684434,-0.133024,0.4876


In [33]:
by_column = wifes.groupby(mapping1, axis='columns').sum()

  by_column = wifes.groupby(mapping1, axis='columns').sum()


In [34]:
by_column

Unnamed: 0,emerald,golden
Ning,0.16616,-0.330989
Ei,-3.033591,-2.771925
Navia,0.054553,2.01482
Chiori,-0.572879,-1.311972
Yae Miko,0.551411,-1.163305


In [35]:
map_series = pd.Series(mapping1)

map_series
# Grouping with DataFrame

istri pertama     golden
istri kedua       golden
istri ketiga     emerald
istri keempat    emerald
istri kelima      golden
istri keenam     diamond
dtype: object

In [36]:
wifes.groupby(map_series, axis='columns').count()

  wifes.groupby(map_series, axis='columns').count()


Unnamed: 0,emerald,golden
Ning,2,3
Ei,2,3
Navia,1,2
Chiori,2,3
Yae Miko,2,3


In [37]:
wifes.groupby(len).sum()


Unnamed: 0,istri pertama,istri kedua,istri ketiga,istri keempat,istri kelima
2,-0.577407,-2.675079,-1.182742,-1.850849,0.480561
4,-1.414274,1.277294,0.332248,-0.166088,-0.194009
5,1.254306,0.0,0.0,0.054553,0.760515
6,0.305259,-0.27973,0.344075,-0.916953,-1.337501
8,-1.244987,-0.405918,0.684434,-0.133024,0.4876


In [38]:
key_list = ['mendesah', 'mendesah', 'mendesah',
            'menjepit', 'menjepit']

wifes.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,istri pertama,istri kedua,istri ketiga,istri keempat,istri kelima
2,mendesah,-0.577407,-2.675079,-1.182742,-1.850849,0.480561
4,mendesah,-1.414274,1.277294,0.332248,-0.166088,-0.194009
5,mendesah,1.254306,,,0.054553,0.760515
6,menjepit,0.305259,-0.27973,0.344075,-0.916953,-1.337501
8,menjepit,-1.244987,-0.405918,0.684434,-0.133024,0.4876


In [39]:
wifes.groupby([len, key_list], axis='columns').min()

  wifes.groupby([len, key_list], axis='columns').min()


Unnamed: 0_level_0,11,12,12,13,13
Unnamed: 0_level_1,mendesah,mendesah,menjepit,mendesah,menjepit
Ning,1.277294,0.332248,-0.194009,-1.414274,-0.166088
Ei,-2.675079,-1.182742,0.480561,-0.577407,-1.850849
Navia,,,0.760515,1.254306,0.054553
Chiori,-0.27973,0.344075,-1.337501,0.305259,-0.916953
Yae Miko,-0.405918,0.684434,0.4876,-1.244987,-0.133024


In [40]:
# Grouping by Index Levels

ranjang_istri = pd.MultiIndex.from_arrays(
    [['mendesah', 'mendesah', 'mendesah','menjepit', 'menjepit'],
     ['Navia', 'Ei', 'Chiori','Ning', 'Yae Miko']],
      names=['sex appeals', 'wifes'])
 

In [41]:
ranjang_df = pd.DataFrame(np.random.standard_normal((4, 5)),
                            columns=ranjang_istri)

In [42]:
ranjang_df  
# Grouping by Index Levels

sex appeals,mendesah,mendesah,mendesah,menjepit,menjepit
wifes,Navia,Ei,Chiori,Ning,Yae Miko
0,2.077612,-1.035141,-0.689685,0.118969,1.779661
1,1.375787,1.560302,1.442555,0.295963,-0.787344
2,0.739539,-0.489014,-0.33056,0.085492,-0.76108
3,0.515582,-0.60649,1.17655,0.562697,1.147461


In [43]:
ranjang_df.groupby(level='sex appeals', axis='columns').count()

  ranjang_df.groupby(level='sex appeals', axis='columns').count()


sex appeals,mendesah,menjepit
0,3,2
1,3,2
2,3,2
3,3,2


In [44]:
# 10.2 Data Aggregation and Group Operations

# any, all - Returns True if any or all of 
#            the values in the group are True

# count - Returns the number of non-NA values in the group
#         (equivalent to len(group))

# cummin, cummax - Returns the cumulative minimum or maximum
#                  of the group

# cumsum - Returns the cumulative sum of the group
# cumprod - Returns the cumulative product of the group
# first, last - Returns the first or last value in the group
# max, min - Returns the maximum or minimum value in the group
# mean, median - Returns the mean or median of the group
# nunique - Returns the number of unique values in the group
# prod - Returns the product of the values in the group
# size - Returns the size of the group (equivalent to len(group))
# nth - Returns the nth row of the group
# ohlc - Returns the open, high, low, and close values of the group
# std, var - Returns the standard deviation or variance of the group
# sum - Returns the sum of the values in the group
# quantile - Returns the nth quantile of the group
# sem - Returns the standard error of the mean of the group
# rank - Returns the ranks of the values in the group

In [45]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,-0.585981,0.205379
a,2.0,0.539226,0.081889
,1.0,-0.763522,0.768955
b,2.0,0.322188,-0.556899
b,1.0,-0.189294,-0.576052
a,,-1.503336,-0.667041
,1.0,1.074801,1.196778


In [46]:
grouped = df.groupby('key1')
grouped['data1'].nsmallest(2)

key1  key1  key2
a     a     <NA>   -1.503336
            1      -0.585981
b     b     1      -0.189294
            2       0.322188
Name: data1, dtype: float64

In [47]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.042563,0.87242
b,0.511482,0.019153


In [48]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.516697,1.023042,-1.503336,-1.044659,-0.585981,-0.023377,0.539226,3.0,-0.126591,0.472099,-0.667041,-0.292576,0.081889,0.143634,0.205379
b,2.0,0.066447,0.361672,-0.189294,-0.061423,0.066447,0.194318,0.322188,2.0,-0.566476,0.013543,-0.576052,-0.571264,-0.566476,-0.561688,-0.556899


In [49]:
tips = pd.read_csv('tips.csv')

In [50]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [51]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [52]:
grouped1 = tips.groupby(['day', 'smoker'])

grouped_pct = grouped1['tip_pct']

grouped_pct.agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [53]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [54]:
grouped_pct.agg([('average', 'mean'),
                 ('std_dev', np.std),])

  grouped_pct.agg([('average', 'mean'),


Unnamed: 0_level_0,Unnamed: 1_level_0,average,std_dev
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [55]:
function = ['count', 'mean', 'max']

result = grouped1[['tip_pct', 'total_bill']].agg(function)

result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [56]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [57]:
ftupple = [('average', 'mean'),
           ('Variance', np.var)]

grouped1[['tip_pct', 'total_bill']].agg(ftupple)

  grouped1[['tip_pct', 'total_bill']].agg(ftupple)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,average,Variance,average,Variance
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [58]:
grouped1.agg({'tip' : np.max,
             'size': 'sum'})

  grouped1.agg({'tip' : np.max,


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [59]:
grouped1.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
              'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [60]:
# Returning Aggregated Data Without Row Indexes

# Select only numeric columns for aggregation
numeric_cols = tips.select_dtypes(include='number').columns
tips.groupby(['day', 'smoker'], as_index=False)[numeric_cols].mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [61]:
# 10.3 Apply: General split-apply-combine
# This section is a placeholder for 
# the application of the split-apply-combine strategy.

In [62]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(column, ascending=False)[:n]

# Example usage of the top function
top(tips, n=5, column='tip_pct')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535


In [63]:
tips.groupby('smoker').apply(top, n=3, column='tip_pct')

  tips.groupby('smoker').apply(top, n=3, column='tip_pct')


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733


In [64]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

  tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


In [65]:
result1 = tips.groupby('smoker')['tip_pct'].describe()

result1

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [66]:
result1.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [67]:
def f(group):
    return group.describe()

grouped1.apply(f)

# this is a placeholder for the application of 
# the split-apply-combine strategy
# when you invoke the apply method like described above,
# it is a shortcut for applying a function to each group
# and returning a DataFrame with the results

  grouped1.apply(f)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,No,count,4.000000,4.000000,4.00,4.000000
Fri,No,mean,18.420000,2.812500,2.25,0.151650
Fri,No,std,5.059282,0.898494,0.50,0.028123
Fri,No,min,12.460000,1.500000,2.00,0.120385
Fri,No,25%,15.100000,2.625000,2.00,0.137239
...,...,...,...,...,...,...
Thur,Yes,min,10.340000,2.000000,2.00,0.090014
Thur,Yes,25%,13.510000,2.000000,2.00,0.148038
Thur,Yes,50%,16.470000,2.560000,2.00,0.153846
Thur,Yes,75%,19.810000,4.000000,2.00,0.194837


In [68]:
# Suppressing the Group Keys
# If you want to suppress the group keys in the result, 
# you can use the `as_index=False` parameter in the groupby method

tips.groupby('smoker', group_keys=False).apply(top, n=3, 
                                               column='tip_pct')

  tips.groupby('smoker', group_keys=False).apply(top, n=3,


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733


In [69]:
# Quantile & Bucket Analysis

frame = pd.DataFrame({'data1': np.random.standard_normal(100),
                      'data2': np.random.standard_normal(100)
                        })

frame.head()

Unnamed: 0,data1,data2
0,0.128991,-0.034256
1,0.833791,0.663276
2,1.937529,-1.239211
3,1.053727,-0.991841
4,-0.606454,-0.399486


In [70]:
quartile_1 = pd.cut(frame['data1'], 4)
quartile_1.head(10)

# pd.cut is used to segment and sort data values into bins.
# It is often used to convert continuous data into categorical data.
# You can use pd.cut to create quantiles or buckets of data.

# pd.cut(frame['data1'], 4) creates 4 equal-width bins
# based on the values in the 'data1' column of the DataFrame 'frame'.



0    (-0.169, 1.112]
1    (-0.169, 1.112]
2     (1.112, 2.393]
3    (-0.169, 1.112]
4    (-1.45, -0.169]
5    (-0.169, 1.112]
6    (-1.45, -0.169]
7    (-1.45, -0.169]
8    (-0.169, 1.112]
9    (-2.736, -1.45]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.736, -1.45] < (-1.45, -0.169] < (-0.169, 1.112] < (1.112, 2.393]]

In [71]:
def get_stat(group):
    return pd.DataFrame(
        {'min': group.min(), 'max': group.max(),
           'count': group.count(), 'mean': group.mean()})

grouped2 = frame.groupby(quartile_1)

  grouped2 = frame.groupby(quartile_1)


In [72]:
grouped2.apply(get_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-2.736, -1.45]",data1,-2.731047,-1.486938,8,-1.916766
"(-2.736, -1.45]",data2,-1.475669,1.963106,8,0.040416
"(-1.45, -0.169]",data1,-1.407033,-0.274652,38,-0.667735
"(-1.45, -0.169]",data2,-1.98562,1.916023,38,0.01074
"(-0.169, 1.112]",data1,-0.166359,1.110294,38,0.459772
"(-0.169, 1.112]",data2,-3.492017,1.791813,38,-0.092361
"(1.112, 2.393]",data1,1.160443,2.392897,16,1.702143
"(1.112, 2.393]",data2,-2.555724,2.005471,16,0.053587


In [73]:
grouped.agg(['min', 'max', 'count','mean'])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,-1.503336,0.539226,3,-0.516697,-0.667041,0.205379,3,-0.126591
b,-0.189294,0.322188,2,0.066447,-0.576052,-0.556899,2,-0.566476


In [74]:
# Example: Filing Missing Values with Group-Specific Values

# This section is a placeholder for the example of 
# filling missing values

s= pd.Series(np.random.standard_normal(6))
s[::2] = np.nan  # introduce NaN values for testing

In [75]:
s

0         NaN
1    1.096930
2         NaN
3    0.264391
4         NaN
5   -0.968794
dtype: float64

In [76]:
s.fillna(s.mean())

0    0.130842
1    1.096930
2    0.130842
3    0.264391
4    0.130842
5   -0.968794
dtype: float64

In [77]:
wifes = ['Chiori','Yae Miko','Fischl','Furina',
          'Barbara','Kuki Shinobu','Lan Yan', 'Mizuki']

group_key = ['Inazuma', 'Inazuma', 'Mondstadt', 'Inazuma',
             'Mondstadt', 'Mondstadt', 'Mondstadt', 'Mondstadt']

data = pd.Series(np.random.standard_normal(8), index=wifes)

In [78]:
data

Chiori          0.420044
Yae Miko        1.417904
Fischl         -0.016121
Furina         -2.102161
Barbara         1.462583
Kuki Shinobu   -0.472003
Lan Yan        -0.368176
Mizuki         -1.129890
dtype: float64

In [79]:
data[['Fischl', 'Kuki Shinobu', 'Mizuki']] = np.nan  
# introduce NaN values for testing

data

Chiori          0.420044
Yae Miko        1.417904
Fischl               NaN
Furina         -2.102161
Barbara         1.462583
Kuki Shinobu         NaN
Lan Yan        -0.368176
Mizuki               NaN
dtype: float64

In [80]:
data.groupby(group_key).size()

# this is a placeholder for the example of 
# filling missing values with group-specific values

# data.groupby(group_key).size() is used to count the number of
# occurrences of each group in the 'group_key' list.


Inazuma      3
Mondstadt    5
dtype: int64

In [81]:
data.groupby(group_key).count()

# data.groupby(group_key).count() is used to count the number of 
# non-NaN values in each group defined by 'group_key'.



Inazuma      3
Mondstadt    2
dtype: int64

In [82]:
data.groupby(group_key).mean()

Inazuma     -0.088071
Mondstadt    0.547204
dtype: float64

In [83]:
def fill_mean(group):
    return group.fillna(group.mean()) # This function fills NaN values in 
                                      # a group with the mean of that group
  
data.groupby(group_key).apply(fill_mean)

Inazuma    Chiori          0.420044
           Yae Miko        1.417904
           Furina         -2.102161
Mondstadt  Fischl          0.547204
           Barbara         1.462583
           Kuki Shinobu    0.547204
           Lan Yan        -0.368176
           Mizuki          0.547204
dtype: float64

In [84]:
fill_values = {'Inazuma': 0.5, 'Mondstadt': -1}

def fill_with_values(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_with_values)

Inazuma    Chiori          0.420044
           Yae Miko        1.417904
           Furina         -2.102161
Mondstadt  Fischl         -1.000000
           Barbara         1.462583
           Kuki Shinobu   -1.000000
           Lan Yan        -0.368176
           Mizuki         -1.000000
dtype: float64

In [85]:
suits = ['Hearts', 'Diamonds', 'Clubs', 'Spades']
card_val = (list(range(1, 11)) + [10]*3) *4 
# this creates a list of card values

base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:
    cards.extend(str(num) + ' of ' + suit for num in base_names)

deck = pd.Series(card_val, index=cards)

In [86]:
deck.head(13)

A of Hearts      1
2 of Hearts      2
3 of Hearts      3
4 of Hearts      4
5 of Hearts      5
6 of Hearts      6
7 of Hearts      7
8 of Hearts      8
9 of Hearts      9
10 of Hearts    10
J of Hearts     10
K of Hearts     10
Q of Hearts     10
dtype: int64

In [87]:
def draw(deck, n=5):
    return deck.sample(n=n, replace=False)
# Example usage of the draw function
draw(deck, n=5)

6 of Hearts       6
J of Hearts      10
2 of Clubs        2
6 of Diamonds     6
3 of Diamonds     3
dtype: int64

In [88]:
def get_suit(card):
    return card.split(' ')[-1]  
# Extract the suit from the card string
deck.groupby(get_suit).apply(draw, n=2)

Clubs     Q of Clubs        10
          6 of Clubs         6
Diamonds  4 of Diamonds      4
          10 of Diamonds    10
Hearts    8 of Hearts        8
          K of Hearts       10
Spades    2 of Spades        2
          10 of Spades      10
dtype: int64

In [89]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

8 of Clubs        8
4 of Clubs        4
6 of Diamonds     6
3 of Diamonds     3
10 of Hearts     10
8 of Hearts       8
A of Spades       1
6 of Spades       6
dtype: int64

In [90]:
# Example: Group Wieghted Average and Correlation
# This section is a placeholder for the example of
# group weighted average and correlation



In [91]:
df_1 = pd.DataFrame({'category' : ['a', 'a', 'a', 'a',
                                   'b', 'b', 'b', 'b'],
                     'values' : np.random.standard_normal(8),
                     'weights' : np.random.uniform(size=8)})

df_1

Unnamed: 0,category,values,weights
0,a,-0.429768,0.911791
1,a,0.215446,0.281583
2,a,0.446725,0.516517
3,a,1.064104,0.331937
4,b,0.906385,0.622519
5,b,-0.287269,0.026086
6,b,0.778797,0.713553
7,b,-0.075672,0.650329


In [92]:
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,-0.585981,0.205379
a,2.0,0.539226,0.081889
b,2.0,0.322188,-0.556899
b,1.0,-0.189294,-0.576052
a,,-1.503336,-0.667041


In [93]:
grouped1.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
19,20.65,3.35,Male,No,Sat,Dinner,3,0.162228
56,38.01,3.0,Male,Yes,Sat,Dinner,4,0.078927
77,27.2,4.0,Male,No,Thur,Lunch,4,0.147059
80,19.44,3.0,Male,Yes,Thur,Lunch,2,0.154321
90,28.97,3.0,Male,Yes,Fri,Dinner,2,0.103555
91,22.49,3.5,Male,No,Fri,Dinner,2,0.155625
164,17.51,3.0,Female,Yes,Sun,Dinner,2,0.171331


In [94]:
grouped2.head(2)

Unnamed: 0,data1,data2
0,0.128991,-0.034256
1,0.833791,0.663276
2,1.937529,-1.239211
4,-0.606454,-0.399486
6,-0.693589,1.102326
9,-2.294898,0.206131
24,-1.685811,-1.475669
32,1.501688,1.712146


In [95]:
s_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.205379
a,2,0.081889
b,2,-0.556899
b,1,-0.576052


In [96]:
grouped = df_1.groupby('category')

In [97]:
def get_wavg(group):
    return np.average(group['values'],
                       weights=group['weights']) 

grouped.apply(get_wavg)   

  grouped.apply(get_wavg)


category
a    0.123793
b    0.528326
dtype: float64

In [98]:
close_px = pd.read_csv('stock_px.csv',
                       parse_dates=True, index_col=0)

In [99]:
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [100]:
close_px.tail()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-10,388.81,26.94,76.28,1194.89
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [101]:
def spx_corr(group):
    return group.corrwith(group['SPX'])

rets = close_px.pct_change().dropna()

In [102]:
def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(lambda x: x.corrwith(x['SPX']))

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [103]:
# other methods like 
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [104]:
def corr_aapl_msft(group):
    return group['AAPL'].corr(group['MSFT'])

by_year.apply(corr_aapl_msft)
# this is computing the correlation between AAPL and MSFT
# for each year in the 'by_year' group

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [105]:
# Example: Group-Wise Linear Regression

import statsmodels.api as sm 

In [106]:
def regress(data, yvar=None, xvars=None):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params

In [107]:
by_year.apply(regress, yvar='AAPL', xvars =['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


In [108]:
df_2 = pd.DataFrame({'key': ['a', 'b', 'c']*4,
                     'value' : np.arange(12.)})

df_2

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [109]:
g = df_2.groupby('key')['value']

g.mean()

# copilot where are you inline?


key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [110]:
def get_mean(group):
    return group.mean()

g.transform(get_mean)

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [111]:
def times_two(group):
    return group*2

g.transform(times_two)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

In [112]:
def ge_ranks(group):
    return group.rank(ascending=False)

g.transform(ge_ranks)

0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

In [113]:
def normalize(x):
    return (x-x.mean()) / x.std()

g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [114]:
g.apply(normalize)

key    
a    0    -1.161895
     3    -0.387298
     6     0.387298
     9     1.161895
b    1    -1.161895
     4    -0.387298
     7     0.387298
     10    1.161895
c    2    -1.161895
     5    -0.387298
     8     0.387298
     11    1.161895
Name: value, dtype: float64

In [115]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [116]:
normalized = (df_2['value'] - g.transform('mean')) / g.transform('std')

normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [117]:
# 10.5 Pivot Tables and Cross-Tabulation

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [118]:
tips.pivot_table(index=['day', 'smoker'], values=numeric_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [119]:
tips.pivot_table(index=['time', 'day'], columns='smoker',
                 values=['tip_pct', 'size'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [120]:
tips.pivot_table(index=['time', 'day'], columns='smoker',
                 values=['tip_pct', 'size'], margins=True)

# This passing margin = True si Adding all row & columns labels, 
# with corresponding values being the group statistics for 
# all data within single tier

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [121]:
tips.pivot_table(index=['time', 'smoker'], columns='day',
                 values= 'tip_pct', aggfunc=len, margins=True)

# The all values are means without taking into account
# smoker vs non-smoker (the All columns) or any of the 2 levels
# of grouping on the rows (the All rows)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106
Dinner,Yes,9.0,42.0,19.0,,70
Lunch,No,1.0,,,44.0,45
Lunch,Yes,6.0,,,17.0,23
All,,19.0,87.0,76.0,62.0,244


In [122]:
tips.pivot_table(index=['time', 'size','smoker'], columns='day',
                 values= 'tip_pct', fill_value=0)

# if some combinations are empty (NA, Nan) 
# you may wish to pass a fill_value

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.0,0.137931,0.0,0.0
Dinner,1,Yes,0.0,0.325733,0.0,0.0
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.0
Dinner,3,No,0.0,0.154661,0.152663,0.0
Dinner,3,Yes,0.0,0.144995,0.15266,0.0
Dinner,4,No,0.0,0.150096,0.148143,0.0
Dinner,4,Yes,0.11775,0.124515,0.19337,0.0
Dinner,5,No,0.0,0.0,0.206928,0.0
Dinner,5,Yes,0.0,0.106572,0.06566,0.0


In [123]:
# Cross-Tabulations:Crosstab

from io import StringIO


In [124]:
data_genshin = """ Sample Teyvat_State Elements
1.  Mondstadt Anemo
2.  Liyue Geo
3.  Mondstadt Anemo
4.  Liyue Anemo
5.  Liyue Geo
6.  Liyue Anemo
7.  Mondstadt Anemo
8.  Mondstadt Geo
9.  Liyue Anemo
10. Mondstadt Anemo
"""

In [125]:
data_genshin = pd.read_table(StringIO(data_genshin),
                             sep = "\s+")
data_genshin

  sep = "\s+")


Unnamed: 0,Sample,Teyvat_State,Elements
0,1.0,Mondstadt,Anemo
1,2.0,Liyue,Geo
2,3.0,Mondstadt,Anemo
3,4.0,Liyue,Anemo
4,5.0,Liyue,Geo
5,6.0,Liyue,Anemo
6,7.0,Mondstadt,Anemo
7,8.0,Mondstadt,Geo
8,9.0,Liyue,Anemo
9,10.0,Mondstadt,Anemo


In [126]:
pd.crosstab(data_genshin['Teyvat_State'],
            data_genshin['Elements'],
            margins=True)

Elements,Anemo,Geo,All
Teyvat_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Liyue,3,2,5
Mondstadt,4,1,5
All,7,3,10


In [127]:
pd.crosstab([tips['time'],tips['day']],
             tips['smoker'],
             margins=True)

# The first 2 arguments to crosstab can each be an 
# array or Series or list of arrays

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
