In [2]:
import pandas as pd
import numpy as np

Sometimes we want to select data based on groups and understand aggregated data on a group level. We have seen that even though Pandas allows us to iterate over every row in a dataframe, it is generally very slow to do so. Fortunately, Pandas has a groupby() function to speed up such tasks. The idea behind the groupby() function is that it takes some dataframe, splits it into chunks based on some key values, applies computation on those chunks, then combines the results back together into another dataframe. In pandas this is referred to as the split-apply-combine pattern.

In [3]:
# Now compare one operation implemented two ways: first by looping over the states and filtering the
# whole dataframe on every pass, and second by splitting the dataframe into chunks with groupby()

# Load the census data and keep only the rows whose SUMLEV is 50
df = (
    pd.read_csv('../resources/week-3/datasets/census.csv')
    .loc[lambda frame: frame['SUMLEV'] == 50]
)
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [None]:
# In the first example for groupby() I want to use the census data. Let's get a list of the unique states,
# then we can iterate over all the states and for each state we reduce the data frame and calculate the
# average.

# Let's run this task 3 times and time it. For this we'll use the cell magic function %%timeit

In [4]:
%%timeit -n 3
# Baseline (deliberately naive) approach: for every state, mask the whole dataframe, drop the
# masked-out rows, and average that state's CENSUS2010POP values
state_means = np.array([])
for state_name in df['STNAME'].unique():
    state_rows = df.where(df['STNAME'] == state_name).dropna()
    state_means = np.append(state_means, np.average(state_rows['CENSUS2010POP']))
print(state_means)
    

[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
  35626.86363636 125790.50980392  70476.10869565  30771.26262626
  27172.55238095  36161.39166667  70833.9375      83022.5625
 240564.66666667 467687.78571429 119080.          60964.65517241
  36186.54878049  52077.62608696  17668.125       19638.07526882
 158855.94117647 131647.         418661.61904762  62399.36363636
 312550.03225806  95354.83        12690.39622642 131096.63636364
  48718.84415584 106418.72222222 189587.74626866 210513.4
 100551.39130435  12336.06060606  66801.10526316  98998.27165354
  95306.37931034  44695.78571429  60111.29323308 172424.1025641
  33690.8         78985.91666667  24505.47826087]
[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
 

[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
  35626.86363636 125790.50980392  70476.10869565  30771.26262626
  27172.55238095  36161.39166667  70833.9375      83022.5625
 240564.66666667 467687.78571429 119080.          60964.65517241
  36186.54878049  52077.62608696  17668.125       19638.07526882
 158855.94117647 131647.         418661.61904762  62399.36363636
 312550.03225806  95354.83        12690.39622642 131096.63636364
  48718.84415584 106418.72222222 189587.74626866 210513.4
 100551.39130435  12336.06060606  66801.10526316  98998.27165354
  95306.37931034  44695.78571429  60111.29323308 172424.1025641
  33690.8         78985.91666667  24505.47826087]
[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
 

In [5]:
%%timeit -n 3
# groupby approach: pandas splits the dataframe by STNAME once and hands us one sub-frame per state;
# the group label itself is not needed here, only the frame
state_means = np.array([])
for _, state_frame in df.groupby('STNAME'):
    state_means = np.append(state_means, np.average(state_frame['CENSUS2010POP']))
print(state_means)
    

[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
  35626.86363636 125790.50980392  70476.10869565  30771.26262626
  27172.55238095  36161.39166667  70833.9375      83022.5625
 240564.66666667 467687.78571429 119080.          60964.65517241
  36186.54878049  52077.62608696  17668.125       19638.07526882
 158855.94117647 131647.         418661.61904762  62399.36363636
 312550.03225806  95354.83        12690.39622642 131096.63636364
  48718.84415584 106418.72222222 189587.74626866 210513.4
 100551.39130435  12336.06060606  66801.10526316  98998.27165354
  95306.37931034  44695.78571429  60111.29323308 172424.1025641
  33690.8         78985.91666667  24505.47826087]
[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
 

[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
  35626.86363636 125790.50980392  70476.10869565  30771.26262626
  27172.55238095  36161.39166667  70833.9375      83022.5625
 240564.66666667 467687.78571429 119080.          60964.65517241
  36186.54878049  52077.62608696  17668.125       19638.07526882
 158855.94117647 131647.         418661.61904762  62399.36363636
 312550.03225806  95354.83        12690.39622642 131096.63636364
  48718.84415584 106418.72222222 189587.74626866 210513.4
 100551.39130435  12336.06060606  66801.10526316  98998.27165354
  95306.37931034  44695.78571429  60111.29323308 172424.1025641
  33690.8         78985.91666667  24505.47826087]
[ 71339.34328358  24490.72413793 426134.46666667  38878.90666667
 642309.5862069   78581.1875     446762.125      299311.33333333
 601723.         280616.56716418  60928.63522013 272060.2
 

In [None]:
# So there is a huge difference in execution time between the two approaches above; the second one (groupby) is much faster


In [25]:
# Read the census data into its own variable, again keeping only the rows whose SUMLEV is 50
df1 = pd.read_csv('../resources/week-3/datasets/census.csv').query('SUMLEV == 50')
df1.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [26]:
# Build the GroupBy object once (one group per distinct STNAME value) so the cells below can reuse it
grps = df1.groupby('STNAME')

In [30]:
# Per-group count of non-null values in every column (one row per STNAME)
grps.count()

Unnamed: 0_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,67,67,67,67,67,67,67,67,67,67,...,67,67,67,67,67,67,67,67,67,67
Alaska,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
Arizona,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
Arkansas,75,75,75,75,75,75,75,75,75,75,...,75,75,75,75,75,75,75,75,75,75
California,58,58,58,58,58,58,58,58,58,58,...,58,58,58,58,58,58,58,58,58,58
Colorado,64,64,64,64,64,64,64,64,64,64,...,64,64,64,64,64,64,64,64,64,64
Connecticut,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
Delaware,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
District of Columbia,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Florida,67,67,67,67,67,67,67,67,67,67,...,67,67,67,67,67,67,67,67,67,67


In [33]:
# Pull out the sub-dataframe belonging to a single group key
grps.get_group('Alabama')

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.369970,1.859511,-0.848580,-1.402476,-1.577232,-0.884411
6,50,3,6,1,11,Alabama,Bullock County,10914,10915,10887,...,-30.953709,-5.180127,-1.130263,14.354290,-16.167247,-29.001673,-2.825524,1.507017,17.243790,-13.193961
7,50,3,6,1,13,Alabama,Butler County,20947,20946,20944,...,-14.032727,-11.684234,-5.655413,1.085428,-6.529805,-13.936612,-11.586865,-5.557058,1.184103,-6.430868
8,50,3,6,1,15,Alabama,Calhoun County,118572,118586,118437,...,-6.155670,-4.611706,-5.524649,-4.463211,-3.376322,-5.791579,-4.092677,-5.062836,-3.912834,-2.806406
9,50,3,6,1,17,Alabama,Chambers County,34215,34170,34098,...,-2.731639,3.849092,2.872721,-2.287222,1.349468,-1.821092,4.701181,3.781439,-1.290228,2.346901
10,50,3,6,1,19,Alabama,Cherokee County,25989,25986,25976,...,6.339327,1.113180,5.488706,-0.076806,-3.239866,6.416167,1.420264,5.757384,0.230419,-2.931307


So far we have learnt how to split up the data and apply some very basic operations or functions. Now we will look
in more depth at the apply step, which covers operations such as aggregation, transformation, and filtering. Let's look at each of them.

# Aggregation

In [None]:
# The most straightforward apply step is the aggregation of data, and uses the method agg() on the groupby
# object. Thus far we have only iterated through the groupby object, unpacking it into a label (the group
# name) and a dataframe. But with agg we can pass in a dictionary of the columns we are interested in
# aggregating along with the function we are looking to apply to aggregate.

In [8]:
# Load the listings dataset. NOTE: this rebinds `df`, which held the census data in the cells above.
df = pd.read_csv('../resources/week-3/datasets/listings.csv')


In [12]:
# List the column names so we can see what is available to group by and aggregate
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [22]:
# Average review_scores_value per cancellation policy, using named aggregation:
#   output_column=(input_column, aggregation)
# The string alias 'mean' selects pandas' own groupby mean, which skips NaN values (the same result
# np.nanmean gave) and avoids the FutureWarning newer pandas versions emit when a NumPy function is
# passed to agg().
df.groupby('cancellation_policy').agg(review_scores_value_avg=('review_scores_value', 'mean'))

Unnamed: 0_level_0,review_scores_value_avg
cancellation_policy,Unnamed: 1_level_1
flexible,9.237421
moderate,9.307398
strict,9.081441
super_strict_30,8.537313


In [24]:
# Several named aggregations can be computed in one agg() call; each keyword becomes an output column.
# The string alias 'mean' skips NaN values (the same result np.nanmean gave) and avoids the
# FutureWarning newer pandas versions emit when a NumPy function is passed to agg().
df.groupby('cancellation_policy').agg(
    review_scores_value_avg=('review_scores_value', 'mean'),
    reviews_per_month_avg=('reviews_per_month', 'mean'),
)

Unnamed: 0_level_0,review_scores_value_avg,reviews_per_month_avg
cancellation_policy,Unnamed: 1_level_1,Unnamed: 2_level_1
flexible,9.237421,1.82921
moderate,9.307398,2.391922
strict,9.081441,1.873467
super_strict_30,8.537313,0.340143


# Transformation

In [None]:
# Transformation is different from aggregation. Where agg() returns a single value per column, so one row per
# group, transform() returns an object that is the same size as the group. Essentially, it broadcasts the
# function you supply over the grouped dataframe, returning a new dataframe. This makes combining data later
# easy.

In [34]:
cols=['cancellation_policy','review_scores_value']
# Now let's transform it and store the result in its own dataframe. The string alias 'mean' uses
# pandas' own NaN-skipping groupby mean (the same values np.nanmean produced) and avoids the
# FutureWarning newer pandas versions emit when a NumPy function is passed to transform().
transform_df=df[cols].groupby('cancellation_policy').transform('mean')
transform_df.head()

Unnamed: 0,review_scores_value
0,9.307398
1,9.307398
2,9.307398
3,9.307398
4,9.237421


In [35]:
# Check the size of the transformed frame: transform() returns one row per input row, and only the
# non-grouping column we selected remains
transform_df.shape

(3585, 1)

In [36]:
# Give the broadcast column a descriptive name. rename() returns a new frame, so we rebind the name
# instead of mutating the existing object with inplace=True.
transform_df = transform_df.rename(columns={'review_scores_value': 'mean_review_scores'})
# Join the group means back onto the listings by index. Dropping any 'mean_review_scores' column left
# over from an earlier run keeps this cell safe to re-execute; without it a second run would produce
# 'mean_review_scores_x' / 'mean_review_scores_y' columns.
df = (
    df.drop(columns='mean_review_scores', errors='ignore')
    .merge(transform_df, left_index=True, right_index=True)
)
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,f,,,f,moderate,f,f,1,,9.307398
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,f,,,t,moderate,f,f,1,1.3,9.307398
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,f,,,f,moderate,t,f,1,0.47,9.307398
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,f,,,f,moderate,f,f,1,1.0,9.307398
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,f,,,f,flexible,f,f,1,2.25,9.237421


# Filtering

In [None]:
# The GroupBy object has built-in support for filtering groups as well. It's often that you'll want to group
# by some feature, then make some transformation to the groups, then drop certain groups as part of your
# cleaning routines. The filter() function takes in a function which it applies to each group dataframe and
# returns either a True or a False, depending upon whether that group should be included in the results.

In [37]:
# For instance, keep only those groups whose mean rating is above 9.2. The function is called once per
# group with that group's dataframe and must return a single True/False for the whole group;
# Series.mean() skips NaN values, so listings without a rating do not affect the group mean.
df.groupby('cancellation_policy').filter(lambda x: x['review_scores_value'].mean() > 9.2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,f,,,f,moderate,f,f,1,,9.307398
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,f,,,t,moderate,f,f,1,1.30,9.307398
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,f,,,f,moderate,t,f,1,0.47,9.307398
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,f,,,f,moderate,f,f,1,1.00,9.307398
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,f,,,f,flexible,f,f,1,2.25,9.237421
5,12386020,https://www.airbnb.com/rooms/12386020,20160906204935,2016-09-07,Private Bedroom + Great Coffee,Super comfy bedroom plus your own bathroom in ...,Our sunny condo is located on the second and t...,Super comfy bedroom plus your own bathroom in ...,none,We love our corner of Roslindale! For quiet wa...,...,f,,,f,flexible,f,f,1,1.70,9.237421
7,2843445,https://www.airbnb.com/rooms/2843445,20160906204935,2016-09-07,"""Tranquility"" on ""Top of the Hill""","We can accommodate guests who are gluten-free,...",We provide a bedroom and full shared bath. Ra...,"We can accommodate guests who are gluten-free,...",none,Our neighborhood is residential with friendly ...,...,f,,,f,moderate,t,t,2,2.38,9.307398
8,753446,https://www.airbnb.com/rooms/753446,20160906204935,2016-09-07,6 miles away from downtown Boston!,Nice and cozy apartment about 6 miles away to ...,Nice and cozy apartment about 6 miles away to ...,Nice and cozy apartment about 6 miles away to ...,none,Roslindale is a primarily residential neighbor...,...,f,,,f,moderate,f,f,1,5.36,9.307398
10,12023024,https://www.airbnb.com/rooms/12023024,20160906204935,2016-09-07,Cozy room in a well located house,The room is in a single family house located i...,,The room is in a single family house located i...,none,,...,f,,,f,flexible,f,f,1,0.36,9.237421
11,1668313,https://www.airbnb.com/rooms/1668313,20160906204935,2016-09-07,Room in Rozzie-Twin Bed-Full Bath,Quiet second floor bedroom sleeps one in comfo...,,Quiet second floor bedroom sleeps one in comfo...,none,Our neighborhood is quiet and relaxed. There i...,...,f,,,f,flexible,f,f,2,0.48,9.237421


# Applying

In [39]:
# The apply() function is by far the operation I invoke most often on groupby objects. It lets you run
# an arbitrary function on each group and stitches the per-group results back together into a single
# dataframe where the index is preserved.

# Let's look at an example using our airbnb data. Start from a clean copy of the dataframe and keep
# just the columns we were interested in previously
df = pd.read_csv("../resources/week-3/datasets/listings.csv").loc[
    :, ['cancellation_policy', 'review_scores_value']
]
df.head()

Unnamed: 0,cancellation_policy,review_scores_value
0,moderate,
1,moderate,9.0
2,moderate,10.0
3,moderate,10.0
4,flexible,10.0


In [40]:
# In previous work we wanted to find the average review score of a listing and its deviation from the group
# mean. This was a two step process: first we used transform() on the groupby object and then we had to
# broadcast to create a new column. With apply() we can wrap this logic in one place
def calc_mean_review_scores(group):
    """Add each listing's absolute deviation from its group's mean review score.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of whatever we have grouped by (e.g. one cancellation policy). It can be treated as
        a complete dataframe and must contain a "review_scores_value" column.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same rows and index as ``group`` plus a "review_scores_mean"
        column. Despite its name, that column holds ``abs(group mean - review_scores_value)``,
        not the mean itself; rows with a missing score get NaN.
    """
    # Series.mean() skips NaN, so listings without a score do not affect the group average
    avg = group["review_scores_value"].mean()
    # assign() broadcasts the formula into a new column on a copy, so the frame that groupby handed
    # us is left untouched
    return group.assign(review_scores_mean=(avg - group["review_scores_value"]).abs())

# Now just apply this to the groups. group_keys=False keeps the original row index on the result;
# without it, pandas >= 2.0 prepends the cancellation_policy key as an extra index level and the rows
# come back ordered by group instead of in their original order.
df.groupby('cancellation_policy', group_keys=False).apply(calc_mean_review_scores).head()

Unnamed: 0,cancellation_policy,review_scores_value,review_scores_mean
0,moderate,,
1,moderate,9.0,0.307398
2,moderate,10.0,0.692602
3,moderate,10.0,0.692602
4,flexible,10.0,0.762579
