# groupby()

#### Group by is a process involving 3 steps:
##### 1) Split the data into groups based on criteria
##### 2) Apply a function to each group independently
##### 3) Cpmbine the results into a data structure

## Imports and data

In [9]:
import pandas as pd

In [12]:
air_quality = pd.read_pickle('air_quality.pkl')

In [14]:
air_quality.head()

Unnamed: 0,date_time,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,...,hour,quarter,day_of_week_num,day_of_week_name,time_until_2022,time_until_2022_days,time_until_2022_weeks,prior_2016_ind,PM2.5_category,TEMP_category
0,2013-03-01 00:00:00,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,...,0,1,4,Friday,3228 days 00:00:00,3228.0,461.142857,True,Good,Very Cold
1,2013-03-01 01:00:00,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,...,1,1,4,Friday,3227 days 23:00:00,3227.958333,461.136905,True,Good,Very Cold
2,2013-03-01 05:00:00,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,...,5,1,4,Friday,3227 days 19:00:00,3227.791667,461.113095,True,Good,Very Cold
3,2013-03-01 06:00:00,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,...,6,1,4,Friday,3227 days 18:00:00,3227.75,461.107143,True,Good,Very Cold
4,2013-03-01 07:00:00,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,...,7,1,4,Friday,3227 days 17:00:00,3227.708333,461.10119,True,Good,Very Cold


## groupby() Basics

In [19]:
## In this example we are grouping the air_quality dataframe by PM2.5_category
## BUT does not display anything outside of the notification that they are grouped.
## See next cell
air_quality.groupby(by='PM2.5_category').

  air_quality.groupby(by='PM2.5_category')


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AAD3D169D0>

In [23]:
## using .group we can see a dictonary whos keys are the categories and the values are the rows axis lables they apply to
air_quality.groupby(by='PM2.5_category').groups

  air_quality.groupby(by='PM2.5_category').groups


{'Good': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 72, 73, 74, 75, 76, 78, 81, 82, 83, 168, 169, 201, 202, 203, 205, 206, 208, 238, 275, 276, 277, 278, 282, 283, 286, 287, 370, 371, 411, 412, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 462, 463, 464, 465, 466, 467, 468, 495, 507, 513, 514, 515, 520, 521, 522, 523, 524, 525, 527, 528, 529, 534, 535, 536, 624, 636, 637, 638, ...], 'Moderate': [16, 18, 19, 28, 33, 34, 35, 36, 61, 71, 77, 79, 80, 84, 85, 86, 87, 196, 197, 198, 199, 200, 204, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 279, 280, 281, 284, 285, 288, 289, 290, 292, 410, 446, 447, 461, 469, 470, 512, 516, 517, 518, 519, 526, 530, 531, 532, 533, 537, 538, 623, 625, 626, 629, 630, 631, 632, 633, 634, 635, 719, 720, 721, 722, 723, 727, 736, 737, 738, 739, 747, 748, 749, 750, 751, 752, 779, 780, 783, 784, 785, 786, 787, 788, 789, 791, 792, 793, 794, 795, ...], 'Unhealthy for sensitive group

In [27]:
## We can prove the above by using the .head() method
## Notice rows 0-15 are "good" but 16, 18 and 19 are "moderate" this is backed up above by looking at the axis's listed
air_quality['PM2.5_category'].head(20)

0         Good
1         Good
2         Good
3         Good
4         Good
5         Good
6         Good
7         Good
8         Good
9         Good
10        Good
11        Good
12        Good
13        Good
14        Good
15        Good
16    Moderate
17        Good
18    Moderate
19    Moderate
Name: PM2.5_category, dtype: category
Categories (6, object): ['Good' < 'Moderate' < 'Unhealthy for sensitive groups' < 'Unhealthy' < 'Very unhealthy' < 'Hazardous']

In [34]:
## We can view all the group names by using the .keys() method
air_quality.groupby(by='PM2.5_category').groups.keys()

  air_quality.groupby(by='PM2.5_category').groups.keys()


dict_keys(['Good', 'Moderate', 'Unhealthy for sensitive groups', 'Unhealthy', 'Very unhealthy', 'Hazardous'])

In [36]:
## We can view specifiec groups by using .get_group() method
air_quality.groupby(by='PM2.5_category').get_group('Good')

  air_quality.groupby(by='PM2.5_category').get_group('Good')


Unnamed: 0,date_time,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,...,hour,quarter,day_of_week_num,day_of_week_name,time_until_2022,time_until_2022_days,time_until_2022_weeks,prior_2016_ind,PM2.5_category,TEMP_category
0,2013-03-01 00:00:00,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,...,0,1,4,Friday,3228 days 00:00:00,3228.000000,461.142857,True,Good,Very Cold
1,2013-03-01 01:00:00,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,...,1,1,4,Friday,3227 days 23:00:00,3227.958333,461.136905,True,Good,Very Cold
2,2013-03-01 05:00:00,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,...,5,1,4,Friday,3227 days 19:00:00,3227.791667,461.113095,True,Good,Very Cold
3,2013-03-01 06:00:00,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,...,6,1,4,Friday,3227 days 18:00:00,3227.750000,461.107143,True,Good,Very Cold
4,2013-03-01 07:00:00,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,...,7,1,4,Friday,3227 days 17:00:00,3227.708333,461.101190,True,Good,Very Cold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95676,2017-02-28 11:00:00,7.0,18.0,3.0,23.0,400.0,82.0,13.3,1016.0,-12.8,...,11,1,1,Tuesday,1767 days 13:00:00,1767.541667,252.505952,False,Good,Warm
95677,2017-02-28 12:00:00,8.0,17.0,2.0,19.0,300.0,87.0,12.9,1015.4,-14.9,...,12,1,1,Tuesday,1767 days 12:00:00,1767.500000,252.500000,False,Good,Warm
95678,2017-02-28 13:00:00,6.0,8.0,2.0,18.0,300.0,89.0,14.0,1014.5,-15.0,...,13,1,1,Tuesday,1767 days 11:00:00,1767.458333,252.494048,False,Good,Warm
95680,2017-02-28 15:00:00,9.0,9.0,2.0,22.0,300.0,91.0,15.4,1013.0,-15.0,...,15,1,1,Tuesday,1767 days 09:00:00,1767.375000,252.482143,False,Good,Warm


## Stats Methods and groupby()

In [44]:
## We can apply groupby() after sorting data
air_quality.sort_values('date_time').groupby(by = 'year')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AAD1B780D0>

In [46]:
## We can then use the first() to get the first records in each group. Basically the earliest records in each year since their sorted
air_quality.sort_values('date_time').groupby(by = 'year').first()

Unnamed: 0_level_0,date_time,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,...,hour,quarter,day_of_week_num,day_of_week_name,time_until_2022,time_until_2022_days,time_until_2022_weeks,prior_2016_ind,PM2.5_category,TEMP_category
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013,2013-03-01 00:00:00,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,...,0,1,4,Friday,3228 days 00:00:00,3228.0,461.142857,True,Good,Very Cold
2014,2014-01-01 01:00:00,66.0,154.0,40.0,81.0,1500.0,7.0,-1.1,1008.8,-13.0,...,1,1,2,Wednesday,2921 days 23:00:00,2921.958333,417.422619,True,Unhealthy,Very Cold
2015,2015-01-01 00:00:00,3.0,14.0,15.0,29.0,600.0,33.0,-7.0,1026.0,-23.8,...,0,1,3,Thursday,2557 days 00:00:00,2557.0,365.285714,True,Good,Very Cold
2016,2016-01-01 00:00:00,200.0,209.0,31.0,98.0,3400.0,2.0,-2.5,1024.5,-8.2,...,0,1,4,Friday,2192 days 00:00:00,2192.0,313.142857,False,Very unhealthy,Very Cold
2017,2017-01-01 00:00:00,423.0,509.0,5.0,102.0,5900.0,2.0,-4.7,1022.1,-6.1,...,0,1,6,Sunday,1826 days 00:00:00,1826.0,260.857143,False,Hazardous,Very Cold
