### Categoricals and groupby

In [1]:
import pandas as pd

In [2]:
# Small demo table: four rows keyed by weekday and city, with numeric
# 'bread' and 'butter' columns. Built row by row with explicit column names.
sales = pd.DataFrame(
    [
        ('Sun', 'Austin', 139, 20),
        ('Sun', 'Dallas', 237, 45),
        ('Mon', 'Austin', 326, 70),
        ('Mon', 'Dallas', 456, 98),
    ],
    columns=['weekday', 'city', 'bread', 'butter'],
)
sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


In [3]:
# Boolean filter and count
sales.loc[sales['weekday'] == 'Sun'].count()

weekday    2
city       2
bread      2
butter     2
dtype: int64

In [4]:
# Groupby and count
sales.groupby('weekday').count()

Unnamed: 0_level_0,city,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [5]:
# Groupby and sum
sales.groupby('weekday')['bread'].sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

In [6]:
# Groupby and sum: multiple columns
sales.groupby('weekday')[['bread','butter']].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,376,65


In [7]:
# Groupby and mean: multi-level index
sales.groupby(['city','weekday']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


In [8]:
customers = pd.Series(['Dave','Alice','Bob','Alice'])
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

In [9]:
sales.groupby(customers)['bread'].sum()

Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

In [10]:
# Categorical data
sales['weekday'].unique()

array(['Sun', 'Mon'], dtype=object)

In [11]:
sales['weekday'] = sales['weekday'].astype('category')
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

In [12]:
# Grouping by multiple columns
titanic = pd.read_csv('titanic.csv')

by_class = titanic.groupby('pclass')
count_by_class = by_class['survived'].count()
count_by_class

pclass
1    323
2    277
3    709
Name: survived, dtype: int64

In [13]:
by_mult = titanic.groupby(['embarked','pclass'])
count_mult = by_mult['survived'].count()
count_mult

embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64

In [14]:
#Grouping by another series
life = pd.read_csv('life_fname.csv', index_col='Country')
life.head(3)

Unnamed: 0_level_0,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,33.639,34.152,34.662,35.17,35.674,36.172,36.663,37.143,37.614,38.075,...,56.583,57.071,57.582,58.102,58.618,59.124,59.612,60.079,60.524,60.947
Albania,65.475,65.863,66.122,66.316,66.5,66.702,66.948,67.251,67.595,67.966,...,75.725,75.949,76.124,76.278,76.433,76.598,76.78,76.979,77.185,77.392
Algeria,47.953,48.389,48.806,49.205,49.592,49.976,50.366,50.767,51.195,51.67,...,69.682,69.854,70.02,70.18,70.332,70.477,70.615,70.747,70.874,71.0


In [15]:
regions = pd.read_csv('regions_fname.csv', index_col='Country')
regions.head(3)

Unnamed: 0_level_0,region
Country,Unnamed: 1_level_1
Afghanistan,South Asia
Albania,Europe & Central Asia
Algeria,Middle East & North Africa


In [16]:
life_by_region = life.groupby(regions['region'])
life_by_region['2010'].mean()

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64

### Groupby and aggregation 

In [17]:
sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


In [18]:
# Review: groupby
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


In [19]:
# Multiple aggregations 
sales.groupby('city')[['bread','butter']].agg(['max', 'sum'])

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,465,70,90
Dallas,456,693,98,143


In [20]:
# Custom aggregation 
def data_range(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

sales.groupby('weekday')[['bread','butter']].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,98,25


In [21]:
# Custom aggregation; dictionaries
sales.groupby(customers)[['bread','butter']].agg({'bread':'sum', 'butter':data_range})

Unnamed: 0,bread,butter
Alice,693,53
Bob,326,0
Dave,139,0


In [22]:
# Computing multiple aggregations of multiple columns
by_class = titanic.groupby('pclass')
by_class_sub = by_class[['age','fare']]
aggregated = by_class_sub.agg(['max','median'])

In [23]:
aggregated.loc[:, ('age','max')]

pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64

In [24]:
aggregated.loc[:, ('fare','median')]

pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64

In [25]:
gapminder = pd.read_csv('gapminder_tidy.csv', index_col=['Year', 'region', 'Country']).sort_index()
#gapminder

In [26]:
by_year_region = gapminder.groupby(level=['Year', 'region'])
def spread(series):
    """Difference between the largest and the smallest value in ``series``."""
    lo, hi = series.min(), series.max()
    return hi - lo
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}
aggregated = by_year_region.agg(aggregator)
aggregated.tail(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,child_mortality,gdp
Year,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,America,962908700.0,17.745833,49634.0
2013,East Asia & Pacific,2244209000.0,22.285714,134744.0
2013,Europe & Central Asia,896878800.0,9.831875,86418.0
2013,Middle East & North Africa,403050400.0,20.2215,128676.0
2013,South Asia,1701241000.0,46.2875,11469.0
2013,Sub-Saharan Africa,920599600.0,76.94449,32035.0


In [27]:
# Load the February 2015 sales log, parsing the 'Date' column into a
# DatetimeIndex. The path uses a forward slash: the previous 'sales\sales-...'
# spelling contained the invalid escape sequence '\s' and only resolved on
# Windows, whereas '/' is accepted as a separator on every platform.
# Note: this rebinds `sales`, replacing the small demo table used above.
sales = pd.read_csv('sales/sales-feb-2015.csv', index_col='Date', parse_dates=True)
sales

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14
2015-02-05 02:00:00,Acme Coporation,Software,19
2015-02-05 22:00:00,Hooli,Service,10
2015-02-07 23:00:00,Acme Coporation,Hardware,1
2015-02-09 09:00:00,Streeplex,Service,19
2015-02-09 13:00:00,Mediacore,Software,7


In [28]:
by_day = sales.groupby(sales.index.strftime('%a'))
by_day

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000000008359C18>

In [29]:
units_sum = by_day['Units'].sum()
units_sum

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64

### Groupby and transformation 

In [30]:
# the z-score
def zscore(series):
    """Standardize ``series``: subtract its mean, then divide by its ``.std()``."""
    centered = series - series.mean()
    return centered / series.std()

In [31]:
auto = pd.read_csv('auto-mpg.csv')
auto.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,US,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,US,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,US,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,US,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,US,ford torino


In [32]:
# MPG zscore
zscore(auto['mpg']).head()

0   -0.697747
1   -1.082115
2   -0.697747
3   -0.953992
4   -0.825870
Name: mpg, dtype: float64

In [33]:
# MPG zscore by year
auto.groupby('yr')['mpg'].transform(zscore).head()

0    0.058125
1   -0.503753
2    0.058125
3   -0.316460
4   -0.129168
Name: mpg, dtype: float64

In [34]:
# Apply transformation and aggregation 
def zscore_with_year_and_name(group):
    """Build a frame with the within-group z-score of 'mpg' plus year and name.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; the columns 'mpg', 'yr' and 'name' are read.

    Returns
    -------
    pandas.DataFrame
        Columns 'mpg' (standardized against this group's own mean and
        ``.std()``), 'year' (copied from 'yr') and 'name', aligned on the
        group's original index.
    """
    mpg = group['mpg']
    # Computed inline rather than through the notebook-level `zscore` name:
    # that name is rebound by a later `from scipy.stats import zscore` cell,
    # so looking it up at call time would make the result depend on which
    # cells have already been executed.
    mpg_z = (mpg - mpg.mean()) / mpg.std()
    return pd.DataFrame({'mpg': mpg_z, 'year': group['yr'], 'name': group['name']})

In [35]:
# group_keys=False keeps the original row index on the combined result instead
# of prepending the 'yr' group key as an extra index level, so the frame lines
# up row-for-row with `auto`.
auto.groupby('yr', group_keys=False).apply(zscore_with_year_and_name).head()

Unnamed: 0,mpg,year,name
0,0.058125,70,chevrolet chevelle malibu
1,-0.503753,70,buick skylark 320
2,0.058125,70,plymouth satellite
3,-0.31646,70,amc rebel sst
4,-0.129168,70,ford torino


In [36]:
#Detecting outliers with Z-score
# NOTE(review): this import rebinds the notebook-level name `zscore`, replacing
# the helper defined in an earlier cell; every later use of `zscore` (including
# re-running earlier cells) resolves to the SciPy function from here on.
# Presumably SciPy's default normalisation differs from the earlier helper's
# `Series.std()` — confirm the difference is intended.
from scipy.stats import zscore
# Reload the tidy gapminder table, this time indexed by country only.
gapminder = pd.read_csv('gapminder_tidy.csv', index_col='Country')
# Keep the rows where Year == 2010, then drop the now-constant 'Year' column.
gapminder_2010 = gapminder[gapminder['Year'] == 2010]
gapminder_2010 = gapminder_2010.drop('Year', axis=1)
gapminder_2010.tail()

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
West Bank and Gaza,4.216,72.64,4039192.0,23.8,4163.0,Middle East & North Africa
Western Sahara,2.471,66.844,530500.0,50.71,,Middle East & North Africa
"Yemen, Rep.",4.498,62.536,24052514.0,,,Middle East & North Africa
Zambia,5.813,54.549,13088570.0,84.8,3451.0,Sub-Saharan Africa
Zimbabwe,3.721,53.684,12571454.0,95.1,1484.0,Sub-Saharan Africa


In [37]:
standardized = gapminder_2010.groupby('region')[['life','fertility']].transform(zscore)
standardized.head()

Unnamed: 0_level_0,life,fertility
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,-1.743601,2.504732
Albania,0.226367,0.010964
Algeria,-0.440196,-0.003972
Angola,-0.882537,1.095653
Antigua and Barbuda,0.240607,-0.363761


In [38]:
outliers = ((standardized['life'] < -3) | (standardized['fertility'] > 3))
outliers.head()

Country
Afghanistan            False
Albania                False
Algeria                False
Angola                 False
Antigua and Barbuda    False
dtype: bool

In [39]:
gm_outliers = gapminder_2010.loc[outliers]
gm_outliers

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Guatemala,3.974,71.1,14388929.0,34.5,6849.0,America
Haiti,3.35,45.0,9993247.0,208.8,1518.0,America
Tajikistan,3.78,66.83,6878637.0,52.6,2110.0,Europe & Central Asia
Timor-Leste,6.237,65.952,1124355.0,63.8,1777.0,East Asia & Pacific


In [40]:
# Filling missing data (imputation) by group
titanic.tail(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1299,3,0,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,C,C,,
1300,3,1,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.0,1,0,2659,14.4542,,C,,,
1301,3,0,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C,,312.0,
1302,3,0,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C,,,
1303,3,0,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C,,,
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,
1308,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,


In [41]:
by_sex_class = titanic.groupby(['sex', 'pclass'])
by_sex_class

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000000000977DFD0>

In [42]:
def impute_median(series):
    """Return ``series`` with missing entries replaced by the series' own median."""
    fill_value = series.median()
    return series.fillna(fill_value)

In [43]:
# Fill each missing age using impute_median applied within the passenger's
# (sex, pclass) group, writing the result back to the 'age' column.
titanic['age'] = by_sex_class['age'].transform(impute_median)
titanic.tail(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1299,3,0,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,C,C,,
1300,3,1,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.0,1,0,2659,14.4542,,C,,,
1301,3,0,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C,,312.0,
1302,3,0,"Yousif, Mr. Wazli",male,25.0,0,0,2647,7.225,,C,,,
1303,3,0,"Yousseff, Mr. Gerious",male,25.0,0,0,2627,14.4583,,C,,,
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,22.0,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,
1308,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,


In [44]:
# Other transformations with .apply
gapminder_2010.tail(5)

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
West Bank and Gaza,4.216,72.64,4039192.0,23.8,4163.0,Middle East & North Africa
Western Sahara,2.471,66.844,530500.0,50.71,,Middle East & North Africa
"Yemen, Rep.",4.498,62.536,24052514.0,,,Middle East & North Africa
Zambia,5.813,54.549,13088570.0,84.8,3451.0,Sub-Saharan Africa
Zimbabwe,3.721,53.684,12571454.0,95.1,1484.0,Sub-Saharan Africa


In [45]:
def disparity(gr):
    """Summarise how each row's GDP sits within its group.

    Returns a DataFrame on the index of ``gr['gdp']`` with two columns:
    'z(gdp)', the GDP standardized by the group's mean and ``.std()``, and
    'regional spread(gdp)', the group's max-minus-min GDP repeated on every row.
    """
    gdp = gr['gdp']
    gdp_spread = gdp.max() - gdp.min()
    gdp_z = (gdp - gdp.mean()) / gdp.std()
    return pd.DataFrame({'z(gdp)': gdp_z, 'regional spread(gdp)': gdp_spread})

In [46]:
# group_keys=False keeps the Country index on the result of .apply(); if the
# 'region' group key were prepended as an outer index level, the country-label
# lookup below would no longer find these rows.
regional = gapminder_2010.groupby('region', group_keys=False)
reg_disp = regional.apply(disparity)
reg_disp.loc[['United States', 'United Kingdom', 'China']]

Unnamed: 0_level_0,z(gdp),regional spread(gdp)
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,3.013374,47855.0
United Kingdom,0.572873,89037.0
China,-0.432756,96993.0


### Groupby and filtering

In [47]:
auto = pd.read_csv('auto-mpg.csv')
auto.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,US,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,US,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,US,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,US,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,US,ford torino


In [48]:
# Mean MPG by year
auto.groupby('yr')['mpg'].mean()

yr
70    17.689655
71    21.111111
72    18.714286
73    17.100000
74    22.769231
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.803704
81    30.185714
82    32.000000
Name: mpg, dtype: float64

In [49]:
splitting = auto.groupby('yr')
type(splitting)

pandas.core.groupby.groupby.DataFrameGroupBy

In [50]:
type(splitting.groups)

dict

In [51]:
splitting.groups.keys()

dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

In [52]:
# groupby object: iteration 
for group_name, group in splitting:
    avg = group['mpg'].mean()
    print(group_name, avg)

70 17.689655172413794
71 21.11111111111111
72 18.714285714285715
73 17.1
74 22.76923076923077
75 20.266666666666666
76 21.573529411764707
77 23.375
78 24.061111111111114
79 25.09310344827585
80 33.803703703703704
81 30.185714285714287
82 32.0


In [53]:
# groupby object: iteration and filtering
for group_name, group in splitting:
    avg = group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    print(group_name, avg)

70 15.666666666666666
71 20.25
72 15.333333333333334
73 14.833333333333334
74 18.666666666666668
75 17.666666666666668
76 23.25
77 20.25
78 23.233333333333334
79 21.666666666666668
80 30.05
81 23.5
82 29.0


In [54]:
# groupby object: comprehension
chevy_means = {year:group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean() 
              for year,group in splitting}
pd.Series(chevy_means)

70    15.666667
71    20.250000
72    15.333333
73    14.833333
74    18.666667
75    17.666667
76    23.250000
77    20.250000
78    23.233333
79    21.666667
80    30.050000
81    23.500000
82    29.000000
dtype: float64

In [55]:
# Boolean groupby
chevy = auto['name'].str.contains('chevrolet')
auto.groupby(['yr', chevy])['mpg'].mean()

yr  name 
70  False    17.923077
    True     15.666667
71  False    21.260870
    True     20.250000
72  False    19.120000
    True     15.333333
73  False    17.500000
    True     14.833333
74  False    23.304348
    True     18.666667
75  False    20.555556
    True     17.666667
76  False    21.350000
    True     23.250000
77  False    23.895833
    True     20.250000
78  False    24.136364
    True     23.233333
79  False    25.488462
    True     21.666667
80  False    34.104000
    True     30.050000
81  False    30.433333
    True     23.500000
82  False    32.461538
    True     29.000000
Name: mpg, dtype: float64

In [56]:
# Grouping and filtering with .apply()
def c_deck_survival(gr):
    """Mean of 'survived' over the rows whose 'cabin' value starts with 'C'.

    Parameters
    ----------
    gr : pandas.DataFrame
        Rows of one group; the columns 'cabin' and 'survived' are read.

    Returns
    -------
    float
        Mean of 'survived' for the selected rows (NaN when no row matches).
    """
    # na=False makes rows with a missing cabin count as "not C" directly,
    # yielding a plain boolean mask without a separate fillna() pass over an
    # object-dtype result.
    c_passengers = gr['cabin'].str.startswith('C', na=False)
    return gr.loc[c_passengers, 'survived'].mean()

In [57]:
by_sex = titanic.groupby('sex')
by_sex

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000000000979ABE0>

In [58]:
c_surv_by_sex = by_sex.apply(c_deck_survival)
c_surv_by_sex

sex
female    0.913043
male      0.312500
dtype: float64

In [59]:
# Grouping and filtering with .filter()
sales

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14
2015-02-05 02:00:00,Acme Coporation,Software,19
2015-02-05 22:00:00,Hooli,Service,10
2015-02-07 23:00:00,Acme Coporation,Hardware,1
2015-02-09 09:00:00,Streeplex,Service,19
2015-02-09 13:00:00,Mediacore,Software,7


In [60]:
by_company = sales.groupby('Company')

In [61]:
by_com_sum = by_company['Units'].sum()
by_com_sum

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64

In [62]:
by_com_filt = by_company.filter(lambda g : g['Units'].sum() > 35)
by_com_filt

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-09 09:00:00,Streeplex,Service,19
2015-02-09 13:00:00,Mediacore,Software,7
2015-02-19 11:00:00,Mediacore,Hardware,16
2015-02-19 16:00:00,Mediacore,Service,10
2015-02-21 05:00:00,Mediacore,Software,3
2015-02-26 09:00:00,Streeplex,Service,4


In [64]:
under10 = (titanic['age'] < 10 ).map({True:'under 10', False:'over 10'})
under10.head()

0     over 10
1    under 10
2    under 10
3     over 10
4     over 10
Name: age, dtype: object

In [66]:
survived_mean_1 = titanic.groupby(under10)['survived'].mean()
survived_mean_1

age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64

In [67]:
survived_mean_2 = titanic.groupby([under10, 'pclass'])['survived'].mean()
survived_mean_2

age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64