In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame: two string key columns plus two columns of random noise.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,data1,data2,key1,key2
0,1.19579,0.577098,a,one
1,1.661197,-1.79617,a,two
2,0.947784,0.304866,b,one
3,1.540399,-0.805227,b,two
4,-0.146255,-0.788528,a,one


In [3]:
# Per-key1 means of the numeric columns only. data1/data2 are selected
# explicitly so the string column key2 never reaches mean(): pandas >= 2.0 no
# longer drops non-numeric columns silently and raises TypeError instead.
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.903577,-0.6692
b,1.244091,-0.25018


In [4]:
# Attach the group means to every row by matching df.key1 against the index
# of k1_means.
df.merge(k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,1.19579,0.577098,a,one,0.903577,-0.6692
1,1.661197,-1.79617,a,two,0.903577,-0.6692
4,-0.146255,-0.788528,a,one,0.903577,-0.6692
2,0.947784,0.304866,b,one,1.244091,-0.25018
3,1.540399,-0.805227,b,two,1.244091,-0.25018


In [6]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out columns b and c for the third row ('Wes'). The row is selected by
# label: .loc is label-based, so an integer slice such as 2:3 raises TypeError
# on this string index in pandas >= 1.0.
people.loc[['Wes'], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.077144,0.943046,-0.529196,0.051181,0.062384
Steve,0.798176,-0.13765,-1.399049,1.61826,1.519064
Wes,0.870827,,,-1.14495,0.200465
Jim,0.563671,1.43651,-1.383661,-0.729096,-0.413723
Travis,2.142531,0.394038,1.290055,0.542811,0.965633


In [8]:
# One group label per row of `people`, in row order.
key = ['one', 'two'] * 2 + ['one']
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.645405,0.668542,0.38043,-0.183653,0.409494
two,0.680923,0.64943,-1.391355,0.444582,0.55267


In [9]:
# Broadcast each group's mean back onto the rows of that group. The 'mean'
# string alias is used because passing the NumPy callable (np.mean) to
# transform is deprecated and emits a FutureWarning in recent pandas.
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,0.645405,0.668542,0.38043,-0.183653,0.409494
Steve,0.680923,0.64943,-1.391355,0.444582,0.55267
Wes,0.645405,0.668542,0.38043,-0.183653,0.409494
Jim,0.680923,0.64943,-1.391355,0.444582,0.55267
Travis,0.645405,0.668542,0.38043,-0.183653,0.409494


In [10]:
def demean(arr):
    """Return `arr` shifted so that its own mean is subtracted from every element."""
    center = arr.mean()
    return arr - center

# Apply demean() within each group of `key`, so every value has its own
# group's column mean subtracted.
demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-1.722549,0.274504,-0.909625,0.234834,-0.34711
Steve,0.117253,-0.78708,-0.007694,1.173678,0.966393
Wes,0.225422,,,-0.961298,-0.209029
Jim,-0.117253,0.78708,0.007694,-1.173678,-0.966393
Travis,1.497127,-0.274504,0.909625,0.726463,0.556139


In [11]:
# Sanity check: the group means of the demeaned data should be zero, up to
# floating-point rounding.
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.0,0.0,0.0,0.0,0.0
two,0.0,0.0,-1.110223e-16,0.0,0.0


In [13]:
# Load the tips table and add the tip as a fraction of the total bill.
tips = pd.read_csv('../plot/tips.csv')
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.head(6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


In [19]:
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default 'tip_pct').

    Returns
    -------
    DataFrame with the selected rows in ascending order of `column`, so the
    largest value comes last.
    """
    # tail(n) rather than [-n:]: the slice [-0:] is the same as [0:], so a
    # request for zero rows used to return the entire frame.
    return df.sort_values(by=column).tail(n)

# The six rows with the highest tip_pct in the whole table, smallest first.
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [20]:
# Run top() on each smoker group separately; the results are stacked with the
# group key as the outer index level.
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [23]:
# Arguments given after the function are forwarded to top() for every group:
# here, the single row with the largest total_bill per (smoker, day) pair.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


In [24]:
# Summary statistics of tip_pct for each smoker group.
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [25]:
# Pivot the 'smoker' index level out of the rows; the displayed result is a
# Series indexed by (statistic, smoker).
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [27]:
# group_keys=False keeps the original row index instead of prepending the
# smoker key as an extra index level.
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [29]:
# 1000 rows of two random-normal columns; data1 is then cut into four
# equal-width bins.
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000),
})
factor = pd.cut(frame['data1'], 4)
factor.head(10)

0    (-3.108, -1.511]
1     (0.0799, 1.671]
2    (-1.511, 0.0799]
3    (-1.511, 0.0799]
4    (-1.511, 0.0799]
5     (0.0799, 1.671]
6    (-1.511, 0.0799]
7     (0.0799, 1.671]
8    (-1.511, 0.0799]
9     (0.0799, 1.671]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.108, -1.511] < (-1.511, 0.0799] < (0.0799, 1.671] < (1.671, 3.262]]

In [30]:
def get_stats(group):
    """Summarise `group` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Group data2 by the data1 bins, compute the statistics per bin, and spread
# the statistic names into columns with unstack().
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.108, -1.511]",70.0,2.749815,-0.079409,-2.129837
"(-1.511, 0.0799]",465.0,3.219554,0.005468,-4.576888
"(0.0799, 1.671]",415.0,2.594198,0.010229,-2.649347
"(1.671, 3.262]",50.0,1.815761,-0.178571,-2.058149


In [31]:
# Ten quantile buckets of data1; labels=False returns integer bucket numbers
# instead of interval labels.
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.749815,-0.048593,-2.129837
1,100.0,2.280131,0.001064,-2.380575
2,100.0,2.798677,0.048163,-2.741319
3,100.0,2.506101,-0.197846,-4.576888
4,100.0,2.844536,0.009474,-2.68746
5,100.0,3.219554,0.158449,-2.199504
6,100.0,2.594198,0.178848,-1.880239
7,100.0,1.937399,-0.066087,-2.33077
8,100.0,2.504578,-0.005526,-2.649347
9,100.0,1.815761,-0.154942,-2.143217


In [33]:
# Six random draws with every other entry (positions 0, 2, 4) set to missing.
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan
s

0         NaN
1    0.309346
2         NaN
3   -0.200963
4         NaN
5   -0.305468
dtype: float64

In [34]:
# Replace the missing entries with the mean of the non-missing values.
s.fillna(s.mean())

0   -0.065695
1    0.309346
2   -0.065695
3   -0.200963
4   -0.065695
5   -0.305468
dtype: float64

In [35]:
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
# Region label for each state, aligned by position: four East, then four West.
group_key = [region for region in ('East', 'West') for _ in range(4)]
data = pd.Series(np.random.randn(8), index=states)
# Mark three of the states as missing.
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.242541
New York     -1.370111
Vermont            NaN
Florida      -0.438021
Oregon        1.296947
Nevada             NaN
California    0.734265
Idaho              NaN
dtype: float64

In [36]:
# Mean per region; the missing entries are left out of each group's average.
data.groupby(group_key).mean()

East   -0.683558
West    1.015606
dtype: float64