# In general, a group-by operation follows the "split-apply-combine" process for a given DataFrame

- Splitting the data into groups
- Applying a function to each group independently
- Combining the results into a data structure

Source: https://github.com/BindiChen/machine-learning/blob/master/data-analysis/032-pandas-groupby/pandas-groupby.ipynb

### Note: 
- This Jupyter notebook is inspired by the above source
- I have added various other details to this notebook which are not available in the above notebook

In [None]:
import pandas as pd
import numpy as np

In [None]:
remote_url = "https://raw.githubusercontent.com/prodramp/publiccode/master/datasets/titanic.csv"

In [None]:
df = pd.read_csv(remote_url)

In [None]:
df

In [None]:
df_groupby_sex = df.groupby('Sex')

In [None]:
df_groupby_sex

In [None]:
type(df_groupby_sex)

In [None]:
df_groupby_sex.ngroups

In [None]:
# Mapping of each group key to the index labels of the rows in that group
df_groupby_sex.groups

In [None]:
# Number of rows in each group
df_groupby_sex.size()

In [None]:
df_groupby_sex.groups

In [None]:
# First non-null value of every column within each group
df_groupby_sex.first()

In [None]:
# Last non-null value of every column within each group
df_groupby_sex.last()

In [None]:
# To retrieve one of the created groups
df_sex_female = df_groupby_sex.get_group('female')

In [None]:
df_sex_female

## Groups as keys

In [None]:
df_groupby_sex.groups.keys()

In [None]:
group_by_keys = df_groupby_sex.groups.keys()

In [None]:
group_by_keys

In [None]:
len(group_by_keys)

In [None]:
type(group_by_keys)

In [None]:
for each_key in group_by_keys:
    print(each_key)

In [None]:
# Materialise the dict-keys view as a list so it can be indexed by position
keys = list(group_by_keys)

In [None]:
keys

In [None]:
keys[0]

In [None]:
keys[1]

In [None]:
# Materialise the groupby as a {key: sub-frame} dict and look up the first key
dict(list(df_groupby_sex))[keys[0]]

In [None]:
for each_key in group_by_keys:
    x = df_groupby_sex.get_group(each_key)
    print(x)

In [None]:
# Build the {key: sub-frame} mapping once; rebuilding it inside the loop
# would re-split the whole DataFrame on every iteration.
frames_by_key = dict(list(df_groupby_sex))
for each_key in group_by_keys:
    print(frames_by_key[each_key])

## Raw data processing

In [None]:
list(df_groupby_sex)

In [None]:
dict(list(df_groupby_sex))

In [None]:
dict(list(df_groupby_sex))['male']

In [None]:
dict(list(df_groupby_sex))['female']

## Processing Another Group

In [None]:
df_group_pclass = df.groupby('Pclass')

In [None]:
df_group_pclass

In [None]:
df_group_pclass.ngroups

In [None]:
# ngroup is a method, not an attribute: call it to get, for every row,
# the number of the group that row belongs to.
df_group_pclass.ngroup()

In [None]:
df_group_pclass.size()

In [None]:
df_group_pclass.groups

In [None]:
df_group_pclass.first()

In [None]:
df_group_pclass.last()

# Listing all the methods available on a groupby object

In [None]:
# Import the submodule explicitly so IPython.utils.text is guaranteed to be loaded.
import IPython.utils.text

# Collect the public callables exposed by the groupby object.
# Use boolean `not`/`and` rather than bitwise `~`/`&`: `~` on a bool is
# deprecated since Python 3.12, and `and` short-circuits so private names
# are skipped before getattr() is ever evaluated on them.
methods = [
    method_name
    for method_name in dir(df_groupby_sex)
    if not method_name.startswith('_')
    and callable(getattr(df_groupby_sex, method_name))
]

print(IPython.utils.text.columnize(methods))

In [None]:
# Import the submodule explicitly so IPython.utils.text is guaranteed to be loaded.
import IPython.utils.text

# Collect the public callables exposed by the Pclass groupby object.
# Use boolean `not`/`and` rather than bitwise `~`/`&`: `~` on a bool is
# deprecated since Python 3.12, and `and` short-circuits so private names
# are skipped before getattr() is ever evaluated on them.
methods = [
    method_name
    for method_name in dir(df_group_pclass)
    if not method_name.startswith('_')
    and callable(getattr(df_group_pclass, method_name))
]

print(IPython.utils.text.columnize(methods))

## Processing 

In [None]:
df.groupby('Sex').Age.max()

In [None]:
df_groupby_sex.Age.max()

In [None]:
df_groupby_sex.get_group('male')['Age'].max()

In [None]:
df_groupby_sex.get_group('female')['Age'].max()

# Aggregating Values based on Group

In [None]:
df.groupby('Sex').Age.agg(['max', 'min', 'count', 'median', 'mean'])

In [None]:
# Named aggregation: each keyword becomes the name of an output column
df.groupby('Sex')['Age'].agg(sex_max='max', sex_min='min')


In [None]:
# Custom aggregation function
def categorize(x):
    """Return True when the mean of ``x`` is greater than 29, otherwise False."""
    return bool(x.mean() > 29)

# Mix built-in aggregation names with the custom function defined above
df.groupby('Sex').Age.agg(['max', 'mean', categorize])

In [None]:
# Same aggregation, with the "mean above 29" rule written inline as a lambda
df.groupby('Sex')['Age'].agg(['max', 'mean', lambda ages: bool(ages.mean() > 29)])

# Aggregations apply to the full DataFrame when no column is selected

In [None]:
# Mean of every numeric column per group.
# numeric_only=True is required on pandas >= 2.0, where the default changed
# and text columns in the full DataFrame would otherwise raise a TypeError.
df.groupby('Sex').mean(numeric_only=True)

In [None]:
# Restrict to numeric columns before aggregating: 'mean' and 'median' are not
# defined for text columns, and pandas >= 2.0 raises a TypeError instead of
# silently dropping them.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
df.groupby('Sex')[numeric_cols].agg(['mean', 'median'])

# Transforming the Data with groupby

In [None]:
# Standardization (z-score) as a named function; PEP 8 discourages binding a
# lambda to a name because the function then has no useful __name__.
def standardization(x):
    """Return ``x`` centred on its mean and scaled by its standard deviation."""
    return (x - x.mean()) / x.std()

In [None]:
# transform applies the function per group and returns a result aligned with the original rows
df.groupby('Sex').Age.transform(standardization)

In [None]:
# apply runs the same function once per group; compare its output with the transform cell
df.groupby('Sex').Age.apply(standardization)

# Filtering Data

In [None]:
# Let's group by Cabin and take a quick look at the size for each group
df.groupby('Cabin').size()

In [None]:
# Keep only the passengers whose cabin is shared by at least 4 people
# (the lambda receives each group's sub-frame and returns True to keep it).
df.groupby('Cabin').filter(lambda x: len(x) >= 4)

# Group by multiple categories

In [None]:
# Creating a subset
subset = df.loc[:, ['Sex', 'Pclass', 'Age', 'Fare']]

In [None]:
subset

In [None]:
# Grouping by multiple categories
subset.groupby(['Sex', 'Pclass']).mean()

In [None]:
# Creating a subset
subset_5 = df.loc[:, ['Sex', 'Pclass', 'Survived', 'Age', 'Fare']]

In [None]:
subset_5.shape

In [None]:
subset_5

In [None]:
# Grouping by multiple categories
subset_5.groupby(['Sex', 'Pclass', 'Survived']).mean()

In [None]:
subset_5.groupby(by=['Sex', 'Pclass', 'Survived']).mean()

In [None]:
df_subset_5 = subset_5.groupby(['Sex', 'Pclass', 'Survived']).mean()

In [None]:
subset_5.shape

In [None]:
df_subset_5.reset_index()

In [None]:
# A better way is to set as_index=False
subset_5.groupby(by=['Sex', 'Pclass', 'Survived'], as_index=False).mean()

# Handling missing values

In [None]:
subset_5.head()

In [None]:
subset_5.isna().sum()

In [None]:
subset_5.shape

In [None]:
# Create some missing values in the Sex column:
# column position 0 of subset_5 is 'Sex', and positional rows 100-199 are set to NaN.
subset_5.iloc[100:200, 0] = np.nan

In [None]:
subset_5.isna().sum()

In [None]:
# The groupby function ignores the missing values by default.
subset_5.groupby(['Sex', 'Pclass', 'Survived']).mean()

In [None]:
# dropna=False asks groupby to keep rows whose key is NaN as their own group
subset_5.groupby(by=['Sex', 'Pclass', 'Survived'],  dropna=False).mean()

# A known pandas issue, possibly related to the `dropna=False` result above:
# https://github.com/pandas-dev/pandas/issues/37323

# Creating Graph

In [None]:
%matplotlib inline

In [None]:
subset_5.groupby('Sex').mean().plot(kind='bar')

In [None]:
# get_group returns the full sub-frame, including the text 'Sex' column, so
# restrict the mean to numeric columns (pandas >= 2.0 raises a TypeError otherwise).
subset_5.groupby('Sex').get_group('male').mean(numeric_only=True).plot(kind='bar')

In [None]:
# get_group returns the full sub-frame, including the text 'Sex' column, so
# restrict the mean to numeric columns (pandas >= 2.0 raises a TypeError otherwise).
subset_5.groupby('Sex').get_group('female').mean(numeric_only=True).plot(kind='bar')