# The GroupBy Object

In [2]:
import pandas as pd

## The Fortune 1000 dataset
- The **Fortune 1000** is a listing of the 1000 largest American companies as ranked by Fortune magazine
- The **DataFrame** includes each company's name, sector, industry, revenue, profits and number of employees

In [26]:
# Read the Fortune 1000 listing with "Rank" as the index and store the
# Sector and Industry columns as categoricals.
fortune_df = pd.read_csv("fortune1000.csv", index_col="Rank").astype(
    {"Sector": "category", "Industry": "category"}
)
# Keep fortune_df pristine; later cells work on copies of it.
fortune = fortune_df.copy()
fortune

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400
...,...,...,...,...,...,...
996,New York Community Bancorp,Financials,Commercial Banks,1902,-47,3448
997,Portland General Electric,Energy,Utilities: Gas and Electric,1898,172,2646
997,Portland General Electric,Energy,Utilities: Gas and Electric,1898,172,2646
999,Wendy’s,"Hotels, Resturants & Leisure",Food Services,1896,161,21200


## The groupby Method

In [19]:
# Bucket the companies by their Sector value; observed=False keeps every
# category of the categorical column as a group, even one with no rows.
sectors = fortune.groupby(by="Sector", observed=False)
sectors

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7984493e3a80>

In [20]:
# Number of groups in the GroupBy object (one per sector)
len(sectors)

21

In [22]:
# Number of rows (companies) that fall into each sector
sectors.size()

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
dtype: int64

In [23]:
# First non-null value of every column within each sector
sectors.first()

Unnamed: 0_level_0,Rank,Company,Industry,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,24,Boeing,Aerospace and Defense,96114,5176,161400
Apparel,91,Nike,Apparel,30601,3273,62600
Business Services,144,ManpowerGroup,Temporary Help,19330,419,27000
Chemicals,56,Dow Chemical,Chemicals,48778,7685,49495
Energy,2,Exxon Mobil,Petroleum Refining,246204,16150,75600
Engineering & Construction,155,Fluor,"Engineering, Construction",18114,413,38758
Financials,4,Berkshire Hathaway,Insurance: Property and Casualty (Stock),210821,24083,331000
Food and Drug Stores,7,CVS Health,Food and Drug Stores,153290,5237,199000
"Food, Beverages & Tobacco",41,Archer Daniels Midland,Food Production,67702,1849,32300
Health Care,5,McKesson,Wholesalers: Health Care,181241,1476,70400


In [24]:
# Last non-null value of every column within each sector
sectors.last()

Unnamed: 0_level_0,Rank,Company,Industry,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,987,Delta Tucker Holdings,Aerospace and Defense,1923,-133,12000
Apparel,917,Guess,Apparel,2204,82,13500
Business Services,993,DeVry Education Group,Education,1910,140,11770
Chemicals,949,H.B. Fuller,Chemicals,2084,87,4425
Energy,997,Portland General Electric,Utilities: Gas and Electric,1898,172,2646
Engineering & Construction,994,MDC Holdings,Homebuilders,1909,66,1225
Financials,996,New York Community Bancorp,Commercial Banks,1902,-47,3448
Food and Drug Stores,928,Fred’s,Food and Drug Stores,2151,-7,7103
"Food, Beverages & Tobacco",954,Alliance One International,Tobacco,2066,-15,6835
Health Care,978,Providence Service,Health Care: Pharmacy and Other Services,1987,84,9072


## Retrieve a Group with the get_group Method

In [31]:
# Reset to a fresh copy of the loaded data and rebuild the per-sector grouping.
fortune = fortune_df.copy()
sectors = fortune.groupby(by="Sector", observed=False)
fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


In [32]:
sectors.get_group("Energy")  # Return the rows of a single group as their own DataFrame

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
14,Chevron,Energy,Petroleum Refining,131118,4587,61500
30,Phillips 66,Energy,Petroleum Refining,87169,4227,14000
32,Valero Energy,Energy,Petroleum Refining,81824,3990,10103
42,Marathon Petroleum,Energy,Petroleum Refining,64566,2852,45440
...,...,...,...,...,...,...
981,WPX Energy,Energy,"Mining, Crude-Oil Production",1958,-1727,1040
983,Adams Resources & Energy,Energy,Petroleum Refining,1944,-1,809
995,EP Energy,Energy,"Mining, Crude-Oil Production",1908,-3748,665
997,Portland General Electric,Energy,Utilities: Gas and Electric,1898,172,2646


## Methods on the GroupBy Object
- Use square brackets on the `DataFrameGroupBy` object to extract a column from the original DataFrame
- The resulting `SeriesGroupBy` object provides aggregation methods such as `sum` and `mean`

In [33]:
# Reset to a fresh copy of the loaded data and rebuild the per-sector grouping.
fortune = fortune_df.copy()
sectors = fortune.groupby(by="Sector", observed=False)
fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


In [36]:
# Selecting one column gives a SeriesGroupBy; sum() totals Revenue per sector
sectors["Revenue"].sum()

Sector
Aerospace & Defense              357940
Apparel                           95968
Business Services                272195
Chemicals                        243897
Energy                          1517809
Engineering & Construction       153983
Financials                      2217159
Food and Drug Stores             483769
Food, Beverages & Tobacco        555967
Health Care                     1614707
Hotels, Resturants & Leisure     169546
Household Products               234737
Industrials                      497581
Materials                        259145
Media                            220764
Motor Vehicles & Parts           482540
Retailing                       1465076
Technology                      1377600
Telecommunications               461834
Transportation                   408508
Wholesalers                      444800
Name: Revenue, dtype: int64

In [38]:
# Average Revenue of the companies in each sector
sectors["Revenue"].mean()

Sector
Aerospace & Defense             17897.000000
Apparel                          6397.866667
Business Services                5337.156863
Chemicals                        8129.900000
Energy                          12441.057377
Engineering & Construction       5922.423077
Financials                      15950.784173
Food and Drug Stores            32251.266667
Food, Beverages & Tobacco       12929.465116
Health Care                     21529.426667
Hotels, Resturants & Leisure     6781.840000
Household Products               8383.464286
Industrials                     10816.978261
Materials                        6026.627907
Media                            8830.560000
Motor Vehicles & Parts          20105.833333
Retailing                       18313.450000
Technology                      13505.882353
Telecommunications              30788.933333
Transportation                  11347.444444
Wholesalers                     11120.000000
Name: Revenue, dtype: float64

In [39]:
# A list of column names selects several columns; each one is summed per sector
sectors[["Revenue", "Profits", "Employees"]].sum()

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,357940,28742,968057
Apparel,95968,8236,346397
Business Services,272195,28227,1361050
Chemicals,243897,22628,463651
Energy,1517809,-73447,1188927
Engineering & Construction,153983,5304,406708
Financials,2217159,260209,3359948
Food and Drug Stores,483769,16759,1395398
"Food, Beverages & Tobacco",555967,51417,1211632
Health Care,1614707,106114,2678289


## Grouping by Multiple Columns
- Pass a list of columns to the **groupby** method to group by pairings of values across columns
- Target a column to retrieve the **SeriesGroupBy** object, then perform an aggregation with a method.
- Pandas will return a **MultiIndex Series** where the levels will be the original groups

In [53]:
# Fresh copy of the data, grouped two ways: by sector alone, and by each
# (sector, industry) pairing. observed=True limits the groups to category
# combinations that actually occur in the data.
fortune = fortune_df.copy()
sectors = fortune.groupby(by="Sector", observed=True)
sector_industries = fortune.groupby(by=["Sector", "Industry"], observed=True)

fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


In [50]:
# With several grouping columns, a group is addressed by a tuple holding one value per column
sector_industries.get_group(("Energy", "Petroleum Refining"))

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
14,Chevron,Energy,Petroleum Refining,131118,4587,61500
30,Phillips 66,Energy,Petroleum Refining,87169,4227,14000
32,Valero Energy,Energy,Petroleum Refining,81824,3990,10103
42,Marathon Petroleum,Energy,Petroleum Refining,64566,2852,45440
98,Tesoro,Energy,Petroleum Refining,28150,1540,6016
214,HollyFrontier,Energy,Petroleum Refining,13238,740,2704
217,PBF Energy,Energy,Petroleum Refining,13124,146,2270
289,Western Refining,Energy,Petroleum Refining,9787,407,7347
394,Hess,Energy,Petroleum Refining,6575,-3056,2770


In [51]:
# Total Revenue for every (sector, industry) pair; the result is indexed by both levels
sector_industries["Revenue"].sum()

Sector               Industry                                     
Aerospace & Defense  Aerospace and Defense                            357940
Apparel              Apparel                                           95968
Business Services    Advertising, marketing                            22748
                     Diversified Outsourcing Services                  64829
                     Education                                          7485
                                                                       ...  
Transportation       Trucking, Truck Leasing                           35950
Wholesalers          Miscellaneous                                      8982
                     Wholesalers: Diversified                         176138
                     Wholesalers: Electronics and Office Equipment    147906
                     Wholesalers: Food and Grocery                    111774
Name: Revenue, Length: 79, dtype: int64

In [52]:
# Number of companies in every (sector, industry) pair
sector_industries.size()

Sector               Industry                                     
Aerospace & Defense  Aerospace and Defense                            20
Apparel              Apparel                                          15
Business Services    Advertising, marketing                            2
                     Diversified Outsourcing Services                 14
                     Education                                         3
                                                                      ..
Transportation       Trucking, Truck Leasing                           9
Wholesalers          Miscellaneous                                     1
                     Wholesalers: Diversified                         25
                     Wholesalers: Electronics and Office Equipment     8
                     Wholesalers: Food and Grocery                     6
Length: 79, dtype: int64

## The agg Method
- The `agg` method applies different aggregation methods on different columns.
- Invoke the `agg` method directly on the **DataFrameGroupBy** object.
- Pass the method a dict where the keys are the columns and the values are the aggregation methods


In [54]:
# Reset to a fresh copy of the loaded data and rebuild the per-sector grouping.
fortune = fortune_df.copy()
sectors = fortune.groupby(by="Sector", observed=True)

fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


In [55]:
# Per sector: total revenue, total profits, and the average headcount.
sectors.agg(
    {
        "Revenue": "sum",
        "Profits": "sum",
        "Employees": "mean",
    }
)

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,357940,28742,48402.85
Apparel,95968,8236,23093.133333
Business Services,272195,28227,26687.254902
Chemicals,243897,22628,15455.033333
Energy,1517809,-73447,9745.303279
Engineering & Construction,153983,5304,15642.615385
Financials,2217159,260209,24172.28777
Food and Drug Stores,483769,16759,93026.533333
"Food, Beverages & Tobacco",555967,51417,28177.488372
Health Care,1614707,106114,35710.52


## Iterating through Groups

In [56]:
# Reset to a fresh copy of the loaded data and rebuild the per-sector grouping.
fortune = fortune_df.copy()
sectors = fortune.groupby(by="Sector", observed=True)

fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


In [63]:
# For every sector, keep the two companies with the most employees.
# include_groups=False leaves the grouping column ("Sector") out of the
# DataFrame handed to the callback.
sectors.apply(
    lambda sector_rows: sector_rows.nlargest(n=2, columns="Employees"),
    include_groups=False,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Company,Industry,Revenue,Profits,Employees
Sector,Rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,45,United Technologies,Aerospace and Defense,61047,7608,197200
Aerospace & Defense,24,Boeing,Aerospace and Defense,96114,5176,161400
Apparel,448,Hanesbrands,Apparel,5732,429,65300
Apparel,231,VF,Apparel,12377,1232,64000
Business Services,199,Aramark,Diversified Outsourcing Services,14329,236,216500
Business Services,744,Convergys,Diversified Outsourcing Services,2951,169,130000
Chemicals,101,DuPont,Chemicals,27940,1953,52000
Chemicals,56,Dow Chemical,Chemicals,48778,7685,49495
Energy,2,Exxon Mobil,Petroleum Refining,246204,16150,75600
Energy,117,Halliburton,"Oil and Gas Equipment, Services",23633,-671,65000
