<h1 style="color:cadetblue; font-size:2em;">Categoricals and groupby</h1>

In [2]:
import pandas as pd

In [11]:
# Sales data: bread and butter figures for each weekday/city pair
sales_columns = {
    'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
    'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
    'bread': [139, 237, 326, 456],
    'butter': [20, 45, 70, 98],
}

# Build the frame and order its columns alphabetically in one expression
sales = pd.DataFrame(sales_columns).sort_index(axis=1)
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,237,45,Dallas,Sun
2,326,70,Austin,Mon
3,456,98,Dallas,Mon


In [12]:
# Boolean filter and count: non-null entries per column, Sunday rows only
is_sunday = sales['weekday'] == 'Sun'
sales.loc[is_sunday].count()

bread      2
butter     2
city       2
weekday    2
dtype: int64

In [13]:
# Groupby and count: the same per-column count, for every weekday at once
by_weekday = sales.groupby('weekday')
by_weekday.count()

Unnamed: 0_level_0,bread,butter,city
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [14]:
# Groupby and sum: select one column from the grouped data, then total it
bread_by_weekday = sales.groupby('weekday')['bread']
bread_by_weekday.sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

In [15]:
# Groupby and sum: a list of column names selects several columns at once
product_columns = ['bread', 'butter']
sales.groupby('weekday')[product_columns].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,376,65


In [16]:
# Groupby and mean: two grouping keys give a (city, weekday) multi-level index
grouping_columns = ['city', 'weekday']
sales.groupby(grouping_columns).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


In [17]:
# Customers: four names on the default 0..3 index
customer_names = ['Dave', 'Alice', 'Bob', 'Alice']
customers = pd.Series(customer_names)
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

In [18]:
# Groupby and sum: by series — the grouping key is a separate Series
# rather than a column of `sales`
bread_by_customer = sales.groupby(customers)['bread']
bread_by_customer.sum()

Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

In [19]:
# Categorical data: list the distinct labels in the weekday column
weekday_column = sales['weekday']
weekday_column.unique()

array(['Sun', 'Mon'], dtype=object)

In [20]:
# Replace the weekday column with its 'category'-dtype equivalent
weekday_as_category = sales['weekday'].astype('category')
sales['weekday'] = weekday_as_category
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

<div style="border: 2px dashed darkcyan; padding: 10px;">
    <strong>Important Note:</strong>
    <p>Categorical data has two important advantages:</p>
    <ol>
        <li>Uses less memory</li>
        <li>Speeds up operations like groupby()</li>
    </ol>
</div>

<h1 style="color:cadetblue; font-size:2em;">Groupby and aggregation</h1>

In [21]:
import pandas as pd
# Sales data: bread and butter figures for each weekday/city pair
sales_columns = {
    'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
    'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
    'bread': [139, 237, 326, 456],
    'butter': [20, 45, 70, 98],
}

# Build the frame and order its columns alphabetically in one expression
sales = pd.DataFrame(sales_columns).sort_index(axis=1)
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,237,45,Dallas,Sun
2,326,70,Austin,Mon
3,456,98,Dallas,Mon


In [22]:
# Review: groupby — largest bread and butter value for each city
by_city = sales.groupby('city')
by_city[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


In [23]:
# Multiple aggregations: a list of function names is applied to every
# selected column, adding a second level of column labels
aggregations = ['max', 'sum']
sales.groupby('city')[['bread', 'butter']].agg(aggregations)

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,465,70,90
Dallas,456,693,98,143


<div style="border: 2px dashed darkcyan; padding: 10px;">
    <strong>Aggregation functions:</strong>
    <p>string names:</p>
    <ul>
        <li>'sum'</li>
        <li>'mean'</li>
        <li>'count'</li>
    </ul>
</div>

In [25]:
# Custom aggregation
def data_range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise; only ``max()`` and ``min()`` are called on it.

    Returns
    -------
    scalar
        ``series.max() - series.min()``.
    """
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Hand the function object itself to agg() instead of a string name
range_columns = ['bread', 'butter']
sales.groupby('weekday')[range_columns].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,98,25


In [27]:
# Custom aggregation: dictionaries — each column name maps to the
# aggregation applied to that column only
column_aggregations = {'bread': 'sum', 'butter': data_range}
sales.groupby(customers)[['bread', 'butter']].agg(column_aggregations)

Unnamed: 0,bread,butter
Alice,693,53
Bob,326,0
Dave,139,0


<h1 style="color:cadetblue; font-size:2em;">Groupby and transformation</h1>

In [28]:
# The z-score
def zscore(series):
    """Standardise ``series`` by centring on its mean and scaling by its std.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; ``mean()``, ``std()`` and element-wise arithmetic
        are the only operations used.

    Returns
    -------
    Same type as ``series - series.mean()``
        ``(series - mean) / std`` element by element. For a pandas Series,
        ``std()`` defaults to the sample standard deviation (ddof=1).
    """
    deviations = series - series.mean()
    return deviations / series.std()


In [30]:
# The automobile dataset, read from a path relative to the working directory
auto_csv_path = 'datasets/auto-mpg.csv'
auto = pd.read_csv(auto_csv_path)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [31]:
# MPG z-score, standardised against the whole dataset
mpg_z_overall = zscore(auto['mpg'])
mpg_z_overall.head()

0   -0.705551
1   -1.089379
2   -0.705551
3   -0.961437
4   -0.833494
Name: mpg, dtype: float64

In [33]:
# MPG z-score by year: transform() standardises each model year separately
# and returns values on the original row index
mpg_grouped_by_year = auto.groupby('model year')['mpg']
mpg_grouped_by_year.transform(zscore).head()

0    0.058125
1   -0.503753
2    0.058125
3   -0.316460
4   -0.129168
Name: mpg, dtype: float64

In [36]:
# Apply transformation and aggregation
def zscore_with_year_and_name(group):
    """Build a frame of standardised MPG plus the year and car name.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must provide the 'mpg', 'model year' and
        'car name' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'mpg' (``zscore`` of ``group['mpg']``), 'year' and 'name',
        in that order.
    """
    columns = {
        'mpg': zscore(group['mpg']),
        'year': group['model year'],
        'name': group['car name'],
    }
    return pd.DataFrame(columns)

# apply() calls the function once per model-year group and combines the results
# NOTE(review): the output shown below comes from an older pandas; newer
# releases changed how apply() treats the grouping column — confirm on re-run
cars_by_year = auto.groupby('model year')
cars_by_year.apply(zscore_with_year_and_name).head()

Unnamed: 0,mpg,year,name
0,0.058125,70,chevrolet chevelle malibu
1,-0.503753,70,buick skylark 320
2,0.058125,70,plymouth satellite
3,-0.31646,70,amc rebel sst
4,-0.129168,70,ford torino


<h1 style="color:cadetblue; font-size:2em;">Groupby and filtering</h1>

In [37]:
# The automobile dataset, read from a path relative to the working directory
auto_csv_path = 'datasets/auto-mpg.csv'
auto = pd.read_csv(auto_csv_path)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [38]:
# Mean MPG by year
mpg_grouped_by_year = auto.groupby('model year')['mpg']
mpg_grouped_by_year.mean()

model year
70    17.689655
71    21.250000
72    18.714286
73    17.100000
74    22.703704
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.696552
81    30.334483
82    31.709677
Name: mpg, dtype: float64

In [39]:
# groupby object: keep the grouped frame in a variable so it can be inspected
splitting = auto.groupby(by='model year')
type(splitting)

pandas.core.groupby.groupby.DataFrameGroupBy

In [40]:
# The groups attribute is a mapping keyed by group label
group_mapping = splitting.groups
type(group_mapping)

dict

In [41]:
# Its keys are the distinct model years
group_labels = splitting.groups.keys()
print(group_labels)

dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])


In [42]:
# groupby object: iteration yields one (key, sub-frame) pair per model year
for model_year, year_rows in splitting:
    print(model_year, year_rows['mpg'].mean())

70 17.689655172413794
71 21.25
72 18.714285714285715
73 17.1
74 22.703703703703702
75 20.266666666666666
76 21.573529411764707
77 23.375
78 24.061111111111114
79 25.09310344827585
80 33.696551724137926
81 30.33448275862069
82 31.70967741935484


In [44]:
# groupby object: iteration and filtering — within each model year, average
# the MPG of the rows whose car name contains 'chevrolet'
for model_year, year_rows in splitting:
    is_chevrolet = year_rows['car name'].str.contains('chevrolet')
    print(model_year, year_rows.loc[is_chevrolet, 'mpg'].mean())

70 15.666666666666666
71 20.25
72 15.333333333333334
73 14.833333333333334
74 18.666666666666668
75 17.666666666666668
76 23.25
77 20.25
78 23.233333333333334
79 21.666666666666668
80 30.05
81 23.5
82 29.0


In [45]:
# groupby object: comprehension — map each model year to the mean MPG of
# its rows whose car name contains 'chevrolet'
chevy_means = {
    year: group.loc[group['car name'].str.contains('chevrolet'), 'mpg'].mean()
    for year, group in splitting
}

In [46]:
# The dict keys (model years) become the index of the Series
chevy_mpg_by_year = pd.Series(chevy_means)
chevy_mpg_by_year

70    15.666667
71    20.250000
72    15.333333
73    14.833333
74    18.666667
75    17.666667
76    23.250000
77    20.250000
78    23.233333
79    21.666667
80    30.050000
81    23.500000
82    29.000000
dtype: float64

In [47]:
# Boolean groupby: a True/False Series is used as a second grouping key
# next to the 'model year' column
chevy = auto['car name'].str.contains('chevrolet')

grouping_keys = ['model year', chevy]
auto.groupby(grouping_keys)['mpg'].mean()

model year  car name
70          False       17.923077
            True        15.666667
71          False       21.416667
            True        20.250000
72          False       19.120000
            True        15.333333
73          False       17.500000
            True        14.833333
74          False       23.208333
            True        18.666667
75          False       20.555556
            True        17.666667
76          False       21.350000
            True        23.250000
77          False       23.895833
            True        20.250000
78          False       24.136364
            True        23.233333
79          False       25.488462
            True        21.666667
80          False       33.966667
            True        30.050000
81          False       30.578571
            True        23.500000
82          False       32.111111
            True        29.000000
Name: mpg, dtype: float64