In [4]:
#Video 1 - Categoricals and groupby
import pandas as pd
sales = pd.DataFrame(
        {
        'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
        'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
        'bread': [139, 237, 326, 456],
        'butter': [20, 45, 70, 98]
        }
)
sales

Unnamed: 0,weekday,city,bread,butter
0,Sun,Austin,139,20
1,Sun,Dallas,237,45
2,Mon,Austin,326,70
3,Mon,Dallas,456,98


In [6]:
#could use boolean filtering to find all sales made on Sunday and count them
sales.loc[sales['weekday'] == 'Sun'].count()
#requires for us to know the disticnt entries of the columns

weekday    2
city       2
bread      2
butter     2
dtype: int64

In [8]:
#alternatively we can groupby the weekday column and count entries for each distict value found
sales.groupby('weekday').count()
#more convenient - don't need to know the distinct entires of the columns
df.groupby('IndexGroupedByThisCol')['subselectThisCol'].aggregationMethod()

Unnamed: 0_level_0,city,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


In [None]:
'''
A closer look:
sales.groupby('weekday').count()
1. split by weekday
2. apply count() function to each group of rows
3. combine results in a new DF with distinct values of weekday in the index

Count method is an aggregation or reduction because it reduces many values into a single value
Aggregation/ Reduction in groupy:
1. mean()
2. std()
3. sum()
4. first(), last()
5. max(), min()

'''

In [9]:
#Groupby and sum
sales.groupby('weekday')['bread'].sum()

weekday
Mon    782
Sun    376
Name: bread, dtype: int64

In [10]:
#Groupby and sum: multiple columns
#can use a list of columns
sales.groupby('weekday')[['bread', 'butter']].sum()


Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,376,65


In [11]:
#can groupby multilevel by passing in  multiple column names in a list
sales.groupby(['city', 'weekday']).mean()
#the result is the avg amnt of bread and butter bought at each of the two locations on Sun and Mon
#this returns a SORTED multi-level index

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


In [12]:
#Groupby can use any Pandas Series with the same index values
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

In [13]:
# Groupby and sum: by Series
#customers df has an identical index to sales df
#df.groupby(SerieswithSameIndex)[subselectColumn].aggFunc()
sales.groupby(customers)['bread'].sum()
#results in a new Series with a customer name in the index

Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

In [14]:
#Categorical Data
#using .unique() on a Series returns an array of the distinct entries
sales['weekday'].unique()

array(['Sun', 'Mon'], dtype=object)

In [15]:
#to find out how many times each individual value occurs in that Series use .value_counts()
sales['weekday'].value_counts()

Sun    2
Mon    2
Name: weekday, dtype: int64

In [16]:
#transform weekday column into a categorical type
sales['weekday'] = sales['weekday'].astype('category')
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

In [None]:
'''
What are the advantages of categoricals over strings?
    1. Uses less memory
    2. speeds up operations like groupby()

Series entries are stored using small integers and a seperate look-up table
'''

In [29]:
#Ex1 - grouping by multiple columns
csv = "titanic.csv"
titanic = pd.read_csv(csv)
titanic.head()

# Group titanic by 'pclass'
by_class = titanic.groupby('pclass') #this is a df generator object

# Aggregate 'survived' column of by_class by count
count_by_class = by_class['survived'].count()

# Print count_by_class
print(count_by_class)

# Group titanic by 'embarked' and 'pclass'
by_mult = titanic.groupby(['embarked', 'pclass'])

# Aggregate 'survived' column of by_mult by count
count_mult = by_mult['survived'].count()

# Print count_mult
print(count_mult)

pclass
1    323
2    277
3    709
Name: survived, dtype: int64
embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64


In [None]:
import requests
url1 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/life_expectancy.csv' #has to be a string
r = requests.get(url1)

with open('life_fname.csv', 'wb') as f:
    f.write(r.content)
life_fname = 'life_fname.csv'

url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/regions.csv'
r2 = requests.get(url2)
with open('regions_fname.csv', 'wb') as f:
    f.write(r2.content)
regions_fname = 'regions_fname.csv'

In [40]:
#Ex2 = Groupying by another series


# Read life_fname into a DataFrame: life
life = pd.read_csv(life_fname, index_col='Country')

# Read regions_fname into a DataFrame: regions
regions = pd.read_csv(regions_fname, index_col='Country')
#print(type(regions))

# Group life by regions['region']: life_by_region
life_by_region = life.groupby(regions['region']) #regions is a df, not a Series
#that's why use regions['region'] rather than .groupby(regions)['region']

# Print the mean over the '2010' column of life_by_region
print(life_by_region['2010'].mean())

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64


In [42]:
#Video 2- Groupby and aggregation
sales
#Review: groupby
#groupby a column, select one or more columns on which to perform aggregation:
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


In [45]:
#multiple aggregations
#using .agg() we can do several
sales.groupby('city')[['bread', 'butter']].agg(['max', 'min', 'sum'])
#three aggregations are computed for each column (bread and butter) for each group (Austin, Dallas)
#the result is a MultiLevel index for the columns

Unnamed: 0_level_0,bread,bread,bread,butter,butter,butter
Unnamed: 0_level_1,max,min,sum,max,min,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Austin,326,139,465,70,20,90
Dallas,456,237,693,98,45,143


In [47]:
#Aggregation functions
#can be used in many different ways:
#1. pass a list of strings repr common agg methods: sum, mean, count
#2. accepts user-defined functions or library functions
#3. Accepts a dict as input

def data_range(series):
    return series.max() - series.min()
#data_range IS an aggregation because it RECEIVES a series and RETURNS a single number

sales.groupby('weekday')[['bread', 'butter']].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,98,25


In [48]:
sales.groupby(customers)[['bread', 'butter']].agg({'bread':'sum', 'butter':data_range})
#dict keys are column names
#dict vals are agg funcs to apply to EACH column
#note because data_range is a user-defined function - no need for ''

Unnamed: 0,bread,butter
Alice,693,53
Bob,326,0
Dave,139,0


In [52]:
#Ex3 - Computing multiple aggregates of multiple columns
#Reminder: When applying multiple aggregations on multiple columns, the aggregated DataFrame has a 
# multi-level column index.

# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')

# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]

# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max', 'median'])

# Print the maximum age in each class
print('Max age in each class\n', aggregated.loc[:, ('age','max')]) #interesting, [all rows, (from age column, choose max value)]

# Print the median fare in each class
print('Median fare in each class\n', aggregated.loc[:, ('fare', 'median')])

Max age in each class
 pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64
Median fare in each class
 pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64


In [55]:
csv = 'gapminder_tidy.csv'
gapminder = pd.read_csv(csv, index_col=['Year', 'region', 'Country'])
gapminder.describe

<bound method NDFrame.describe of                                         fertility    life  population  \
Year region                Country                                      
1964 South Asia            Afghanistan      7.671  33.639  10474903.0   
1965 South Asia            Afghanistan      7.671  34.152  10697983.0   
1966 South Asia            Afghanistan      7.671  34.662  10927724.0   
1967 South Asia            Afghanistan      7.671  35.170  11163656.0   
1968 South Asia            Afghanistan      7.671  35.674  11411022.0   
...                                           ...     ...         ...   
2002 Europe & Central Asia Åland              NaN  81.800     26257.0   
2003 Europe & Central Asia Åland              NaN  80.630     26347.0   
2004 Europe & Central Asia Åland              NaN  79.880     26530.0   
2005 Europe & Central Asia Åland              NaN  80.000     26766.0   
2006 Europe & Central Asia Åland              NaN  80.100     26923.0   

                

In [58]:
#Ex4 - Aggregating on index levels/fields
#in a multi-level row indexed DF, individual levels can be use dto perform groupby
#this allows ADVANCED aggregation techniques to be applied along one or more levels in the index
# accross one or more columns
#uses gapminder_tidy.csv which was imported above

# Read the CSV file into a DataFrame and sort the index: gapminder
gapminder = pd.read_csv(csv, index_col=['Year', 'region', 'Country'])
gapminder.sort_index(inplace=True)

# Group gapminder by 'Year' and 'region': by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region']) #level is usually an int. Can I use a list of strings?
#yes, level will take list
# Define the function to compute spread: spread
def spread(series):
    return series.max() - series.min()

# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread} #spread is a user-defined func

# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

# Print the last 6 entries of aggregated 
print(aggregated.tail(6))

                                   population  child_mortality       gdp
Year region                                                             
2013 America                     9.629087e+08        17.745833   49634.0
     East Asia & Pacific         2.244209e+09        22.285714  134744.0
     Europe & Central Asia       8.968788e+08         9.831875   86418.0
     Middle East & North Africa  4.030504e+08        20.221500  128676.0
     South Asia                  1.701241e+09        46.287500   11469.0
     Sub-Saharan Africa          9.205996e+08        76.944490   32035.0


In [70]:
#Ex5 - Grouping on a function of the index
#groupby can alost be performed on transformation of the index values
#incase of a DateTimeIndex, we can extract portions of the datetime over which to group


# Read file: sales
sales = pd.read_csv('sales-feb-2015.csv', index_col ='Date', parse_dates=True)
#csv name different than DataCamp submission

# Create a groupby object: by_day
by_day = sales.groupby(sales.index.strftime('%a')) 
#print(by_day, '\n', type(by_day))
# Create sum: units_sum
units_sum = by_day['Units'].sum()

# Print units_sum
print(units_sum)



Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64


In [None]:
#Video 3 - Groupby and transformations
#Often want to group data, and apply distinct transformations to distinct groups
#instead of aggregating AFTER grouping, we can apply transform method instead
#Changes DF entries to a spec'd func w/o changing the index

def zscore(series):
    return (series - series.mean()) / series.std()

zscore(df['column']).head() #can apply to a df column

#alternative: can normalize column data by index rather than over whole population
df.groupby('index item')['column'].transform(func).head()

#We know:
    #1. .agg() applies reduction
    #2. .transform() applies a func elt wise to groups
    
    #In some cases, split-apply-combine operations do not neatly fall into agg or transformations
    #in those cases, we use apply
    
def zscore_with_year_and_name(group):
    df = pd.DataFrame(
         {
             'mpg':zscore(group['mpg']), #transform mpg column
             'year':group['yr'], #take original values from yr column
             'name':group['name'] #take original values from name column
         }
    )
    return df

#this function is too complicate for .transform(), so we use .apply() instead
auto.groupby('yr').apply(zscore_with_year_and_name).head()

    

In [132]:
#had gapminder already, but there were three elts in index - had to reset index
gapminder.describe
gapminder.index
print('Gapminder index: \n', gapminder.index)
try:
    g2 = gapminder.reset_index()
except ValurError as e:
    print(e)

print('g2 index: \n', g2.index)
#g2 no longer has a multiIndex of Year, Country, and region. It's all reset as a Range(0, 10111, 1)

Gapminder index: 
 MultiIndex([(1964,            'America', 'Antigua and Barbuda'),
            (1964,            'America',           'Argentina'),
            (1964,            'America',               'Aruba'),
            (1964,            'America',             'Bahamas'),
            (1964,            'America',            'Barbados'),
            (1964,            'America',              'Belize'),
            (1964,            'America',             'Bolivia'),
            (1964,            'America',              'Brazil'),
            (1964,            'America',              'Canada'),
            (1964,            'America',               'Chile'),
            ...
            (2013, 'Sub-Saharan Africa',             'Somalia'),
            (2013, 'Sub-Saharan Africa',        'South Africa'),
            (2013, 'Sub-Saharan Africa',         'South Sudan'),
            (2013, 'Sub-Saharan Africa',               'Sudan'),
            (2013, 'Sub-Saharan Africa',           'Swa

In [148]:
#set index to only "Year" and "Country"
test = g2.set_index(['Year', 'Country'])

#sliced it by the year 2010, and all countries then sorted 
gapminder_2010 = test.loc[2010,:].sort_index()
gapminder_2010.describe
gapminder_2010.columns
gapminder_2010.index
#ok this seems to match DataCamp now, continue with Exercise

Index(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria',
       ...
       'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam',
       'Virgin Islands (U.S.)', 'West Bank and Gaza', 'Western Sahara',
       'Yemen, Rep.', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Country', length=202)

In [149]:
#Ex6
#can apply a .transform() after grouping to apply a function to groups of data independently
from scipy.stats import zscore

# Group gapminder_2010: standardized
standardized = gapminder_2010.groupby('region')[['life', 'fertility']].transform(zscore) 

#method groupby is not subscriptable
#use () when calling 'region' column not [] because we aren't calling an index
#i forgot the semantics here
#df.groupby(groupIndexByThis'column name')[[select these and do]].transform(using this func)


# Construct a Boolean Series to identify outliers: outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)

# Filter gapminder_2010 by the outliers: gm_outliers
gm_outliers = gapminder_2010.loc[outliers]

# Print gm_outliers
print(gm_outliers)

                            region  fertility    life  population  \
Country                                                             
Guatemala                  America      3.974  71.100  14388929.0   
Haiti                      America      3.350  45.000   9993247.0   
Tajikistan   Europe & Central Asia      3.780  66.830   6878637.0   
Timor-Leste    East Asia & Pacific      6.237  65.952   1124355.0   

             child_mortality     gdp  
Country                               
Guatemala               34.5  6849.0  
Haiti                  208.8  1518.0  
Tajikistan              52.6  2110.0  
Timor-Leste             63.8  1777.0  


In [173]:
titanic = pd.read_csv('titanic.csv') #had to reread the file for this assignment - the ages weren't correct
titanic.age.head()

0    29.00
1     0.92
2     2.00
3    30.00
4    25.00
Name: age, dtype: float64

In [185]:
#Ex7 - Filling missing data (imputation) by group
#ML and stats packages cannot determine best action when missing data entries are encountered
#instead of using .drop_na(), can fill missing data with .groupby() and .transform()

# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(['sex', 'pclass'])

# Write a function that imputes median
def impute_median(series):
    return series.fillna(series.median())
#this func fills NaN values with median

# Impute age and assign to titanic['age']
#titanic.age = by_sex_class.age.transform(impute_median) #age is a Series, can be chained or called in []
#age is fed into .transform(inpute_median(age)) but not literally...
#semantics: df.argGoingIntoFunc.transform(func)
titanic.age = by_sex_class['age'].transform(impute_median)

# Print the output of titanic.tail(10)
print(titanic.tail(10))


      pclass  survived                                     name     sex   age  \
1299       3         0                      Yasbeck, Mr. Antoni    male  27.0   
1300       3         1  Yasbeck, Mrs. Antoni (Selini Alexander)  female  15.0   
1301       3         0                     Youseff, Mr. Gerious    male  45.5   
1302       3         0                        Yousif, Mr. Wazli    male  25.0   
1303       3         0                    Yousseff, Mr. Gerious    male  25.0   
1304       3         0                     Zabour, Miss. Hileni  female  14.5   
1305       3         0                    Zabour, Miss. Thamine  female  22.0   
1306       3         0                Zakarian, Mr. Mapriededer    male  26.5   
1307       3         0                      Zakarian, Mr. Ortin    male  27.0   
1308       3         0                       Zimmerman, Mr. Leo    male  29.0   

      sibsp  parch  ticket     fare cabin embarked boat   body home.dest  
1299      1      0    2659  14.45

In [184]:
test = by_sex_class.apply(impute_median) #this doesn't work because apply worked on the groups
#sex and class, not age
print(test.age.tail(10)) #note how the age medians are different than above
print(test.sex.tail(10))

sex   pclass      
male  3       1296    27.0
              1297    25.0
              1298    36.0
              1299    27.0
              1301    45.5
              1302    25.0
              1303    25.0
              1306    26.5
              1307    27.0
              1308    29.0
Name: age, dtype: float64
sex   pclass      
male  3       1296    male
              1297    male
              1298    male
              1299    male
              1301    male
              1302    male
              1303    male
              1306    male
              1307    male
              1308    male
Name: sex, dtype: object


In [176]:
#Ex8 - Other transformations with .apply()
#The .apply() method when used on a groupby object performs an arbitrary function on each of the 
#groups. These functions can be aggregations, transformations or more complex workflows. 
#The .apply() method will then combine the results in an intelligent way.

def disparity(gr):
    # Compute the spread of gr['gdp']: s
    s = gr['gdp'].max() - gr['gdp'].min()
    # Compute the z-score of gr['gdp'] as (gr['gdp']-gr['gdp'].mean())/gr['gdp'].std(): z
    z = (gr['gdp'] - gr['gdp'].mean())/gr['gdp'].std()
    # Return a DataFrame with the inputs {'z(gdp)':z, 'regional spread(gdp)':s}
    return pd.DataFrame({'z(gdp)':z , 'regional spread(gdp)':s})

# Group gapminder_2010 by 'region': regional
regional = gapminder_2010.groupby('region')

# Apply the disparity function on regional: reg_disp
reg_disp = regional.apply(disparity)

# Print the disparity of 'United States', 'United Kingdom', and 'China'
print(reg_disp.loc[['United States', 'United Kingdom', 'China']])

                  z(gdp)  regional spread(gdp)
Country                                       
United States   3.013374               47855.0
United Kingdom  0.572873               89037.0
China          -0.432756               96993.0


In [None]:
Video 4 - Groupby and filtering

#What if we want to  filter a groupby?
#first start with splitting before aggregating

#groupby objects
#save the output of groupby as "splitting" before aggregating
splitting = auto.groupby('yr')
type(splitting) would return pandas.core.groupby.DataFrameGroupBy
#splitting has an attribute .groups which is a dictionary
type(splitting.groups) would return dict
splitting.groups keys are the 'yrs' and values are the corresponding rows of the original DF

#can ITERATE and carry out computations using loops
for k,v in splitting:
    do something with k,v
    
#can rewrite the loop as a dict comprehension
chevy_means = {year:group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean() for year,group in splitting}
#in chevy_means the keys are the years and the values are the filtered mpg averages for Chevrolet

#can construct a panda Series with chevy_means dict
pd.Series(chevy_means)

#Boolean groupby - can use a boolean Series with same index in .groupby
#to perform a one-to-all comparison
chevy = auto['name'].str.contains('chevrolet')
type(chevy) should return Boolean Series
auto.groupby(['yr', chevy])['mpg'].mean()



In [188]:
#reload titanic
titanic =  pd.read_csv('titanic.csv')
titanic.index
titanic.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [189]:
#Ex9
#by using .apply() you can write functions that filter rows within groups. the .apply() method
# will handle the iteration over individual groups and then re-combine them back into a Series or DF

#Goal: analyze survival rates from the 'C' deck

#provided function:
def c_deck_survival(gr):

    c_passengers = gr['cabin'].str.startswith('C').fillna(False) #filter for "C" cabin

    return gr.loc[c_passengers, 'survived'].mean()

# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('sex')

# Call by_sex.apply with the function c_deck_survival
c_surv_by_sex = by_sex.apply(c_deck_survival)

# Print the survival rates
print(c_surv_by_sex)

sex
female    0.913043
male      0.312500
dtype: float64


In [191]:
#reload sales_2015.csv for Ex10
sales = pd.read_csv('sales-feb-2015.csv', index_col = 'Date', parse_dates=True)
sales.describe
sales.columns

Index(['Company', 'Product', 'Units'], dtype='object')

In [196]:
#Ex10 - Grouping and filtering with .filter()
#Can use groupby with the .filter() method to remove whole groups of rows from a DF based
#on a boolean condition

#Goal: take February sales data and remove entries from companies that purchased less than or equal
# to 35 units in the whole month

# Read the CSV file into a DataFrame: sales
#sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Group sales by 'Company': by_company
by_company = sales.groupby('Company')

# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].apply(sum)
print(by_com_sum)

# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g: g['Units'].sum() > 35)
print(by_com_filt)

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64
                       Company   Product  Units
Date                                           
2015-02-02 21:00:00  Mediacore  Hardware      9
2015-02-04 15:30:00  Streeplex  Software     13
2015-02-09 09:00:00  Streeplex   Service     19
2015-02-09 13:00:00  Mediacore  Software      7
2015-02-19 11:00:00  Mediacore  Hardware     16
2015-02-19 16:00:00  Mediacore   Service     10
2015-02-21 05:00:00  Mediacore  Software      3
2015-02-26 09:00:00  Streeplex   Service      4


In [207]:
#Ex11 - Filtering and grouping with .map()
#Sometimes you want to group by a function/transformation of a column
#the key is the Series is indexed the same as the DF. 
#Can also mix match column grouping with Series grouping

#Goal: investigate survival rates of passengers by 'age' and 'pclass'
    #Find out what fraction of children under 10 srurived in each 'pclass'
    
#titanic.describe
#titanic.columns
#len(titanic.columns) 
#confirmed my titanic matches with DataCamp's

# Create the Boolean Series: under10
under10 = titanic['age'] < 10
under10 = under10.map({True:'under 10', False:'over 10'})

# Group by under10 and compute the survival rate
survived_mean_1 = titanic.groupby(under10).survived.mean()
print(survived_mean_1)

# Group by under10 and pclass and compute the survival rate
survived_mean_2 = titanic.groupby([under10, 'pclass']).survived.mean()
#under10 is a Boolean Series, does not need ''
print(survived_mean_2)

age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64
age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64
