In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# GroupBy


In [None]:
"""By “group by” we are referring to a process involving one or more of the following steps:
1) Splitting the data into groups based on some criteria.
2) Applying a function to each group independently.
3) Combining the results into a data structure."""

In [None]:
# A list or array of values that is the same length as the axis being grouped
# A value indicating a column name in a DataFrame
# A dict or Series giving a correspondence between the values on the axis being grouped and the group names
# A function to be invoked on the axis index or the individual labels in the index

In [None]:
# Toy frame: two label columns (key1, key2) plus two columns of random draws.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a', 'b'],
        'key2': ['one', 'two', 'one', 'two', 'one', 'one'],
        'data1': np.random.randn(6),
        'data2': np.random.randn(6),
    }
)

In [None]:
df

In [None]:
# to compute the mean of the data1 column using the labels from key1.

In [None]:
grouped = df['data1'].groupby(df['key1'])

In [None]:
grouped.mean()

In [None]:
"""data (a Series) has been aggregated according to the group key, producing
a new Series that is now indexed by the unique values in the key1 column. The
result index has the name'key1' because the DataFrame column df['key1'] """ 

In [None]:
df.groupby('key1')['data1']

In [None]:
df['data1'].groupby(df['key1'])

In [None]:
# column names as the group keys

In [None]:
# key2 holds strings, which cannot be averaged. numeric_only=True restricts the
# mean to data1/data2; pandas >= 2.0 raises TypeError instead of silently
# dropping such columns.
df.groupby('key1').mean(numeric_only=True)

In [None]:
# with multiple keys

In [None]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [None]:
means

In [None]:
df.groupby(['key1', 'key2'])[['data2']].mean()

In [None]:
"""Here we grouped the data using two keys, and the resulting Series
now has a hierarchical index consisting of the unique pairs of keys"""

In [None]:
# multiple column names as the group keys

In [None]:
df.groupby(['key1', 'key2']).mean()

In [None]:
# groupby with series as array with same length

In [None]:
city = np.array(['Hyd', 'Kol', 'Hyd', 'Pune', 'Pune','Kol'])

In [None]:
years = np.array([2016, 2016, 2017, 2016, 2017,2017])

In [None]:
df['data1'].groupby([city, years]).mean()

In [None]:
# Size
#useful GroupBy method , which returns a Series containing group sizes
# any missing values in a group key will be excluded from the result

In [None]:
df.groupby(['key1', 'key2']).size()

# Iterating Over Groups


In [None]:
# The GroupBy object supports iteration, generating a sequence of 2-tuples
# containing the group name along with the chunk of data

In [None]:
# Each step yields the group label and the sub-frame of rows carrying it;
# both are printed on their own line.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

In [None]:
# with multiple keys

In [None]:
# In the case of multiple keys, the first element in the tuple will
# be a tuple of key values

In [None]:
# With two grouping keys the label is a (key1, key2) tuple, unpacked here.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

In [None]:
# Build a dict of the data pieces (group name -> sub-frame) as a one-liner

In [None]:
pieces = dict(list(df.groupby('key1')))

In [None]:
pieces['b']

In [None]:
# groupby on axis1

In [None]:
# Split the columns by dtype. DataFrame.groupby(..., axis=1) is deprecated
# (pandas 2.1), so group the dtypes Series itself and select each group's
# columns. The result is still an iterable of (dtype, sub-frame) pairs.
grouped = [(dtype, df[cols.index]) for dtype, cols in df.dtypes.groupby(df.dtypes)]

In [None]:
for dtype, group in grouped:
    print(dtype)
    print(group)

# Grouping with Dicts and Series

In [None]:
# 5x5 frame of random draws: one row per student name, columns 'a'..'e'.
students = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Ram', 'Shyam', 'Mohan', 'Rohan', 'Sohan'],
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [None]:
students.iloc[2:3, [1, 2]] = np.nan

In [None]:
students

In [None]:
diction = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [None]:
# Group the columns of `students` (axis=1) by the colour each column label maps
# to in `diction`; the 'f' entry has no matching column in `students`.
# NOTE(review): groupby(axis=1) is deprecated as of pandas 2.1 - the release
# notes suggest grouping the transposed frame instead; confirm against the
# pandas version this notebook is pinned to.
df_dict_grp_by = students.groupby(diction, axis=1)

In [None]:
df_dict_grp_by.sum()

In [None]:
# with series

In [None]:
w_series = pd.Series(diction)

In [None]:
w_series

In [None]:
# Count the non-null values per colour for every student. Grouping the
# transposed frame and transposing back replaces the deprecated
# groupby(axis=1) form and keeps students as rows, colours as columns.
students.T.groupby(w_series).count().T

# Grouping with Functions


In [None]:
# Any function passed as a group key
# will be called once per index value, with the return values being
# used as the group names

In [None]:
"""Suppose you wanted to group by the length of the names; while
you could compute an array of string lengths, it’s simpler to just pass
the len function """

In [None]:
students.groupby(len).sum()

In [None]:
"""Mixing functions with arrays, dicts, or Series is not a problem as
everything gets converted to arrays internally"""

In [None]:
_list = ['one', 'one', 'one', 'two', 'two']

In [None]:
students.groupby([len, _list]).min()

# Grouping by Index Levels


In [None]:
col = pd.MultiIndex.from_arrays([['UP', 'UP', 'UP', 'HP', 'HP'],[1, 3, 5, 1, 3]],
                                names=['city', 'centre'])

In [None]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=col)

In [None]:
hier_df

In [None]:
#To group by level, pass the level number or name using the level keyword

In [None]:
# Count values per 'city' column level. The column MultiIndex becomes the row
# index after transposing, so groupby(level=...) works without the deprecated
# axis=1 argument; the final transpose restores the original orientation.
hier_df.T.groupby(level='city').count().T

# Data Aggregation

In [None]:
#Aggregations refer to any data transformation that produces scalar values from arrays

In [None]:
"""Aggregation: compute a summary statistic (or statistics) for each group. Some examples:
Compute group sums or means.
Compute group sizes / counts."""

In [None]:
df

In [None]:
grouped = df.groupby('key1')

In [None]:
grouped['data1'].quantile(0.9)

In [None]:
# User Defined function

In [None]:
def peak_to_peak(arr):
    """Return the spread of `arr`: its largest value minus its smallest."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [None]:
# peak_to_peak subtracts min from max, which is undefined for the string
# column key2, so aggregate only the numeric data columns.
grouped[['data1', 'data2']].agg(peak_to_peak)

In [None]:
# methods like describe also work, even though they are not aggregations

In [None]:
grouped.describe()

In [None]:
#Inside GroupBy, when you invoke a method like describe, it is actually just a shortcut for the function below

In [None]:
def f(x):
    """Return the describe() summary of the object passed in."""
    return x.describe()


# apply calls f once per group and combines the per-group summaries.
grouped.apply(f)

# Column-Wise and Multiple Function Application

In [None]:
tips = sns.load_dataset('tips')

In [None]:
tips.head(2)

In [None]:
# Add tip percentage of total bill

In [None]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [None]:
tips[:5]

In [None]:
grouped = tips.groupby(['day', 'smoker'])

In [None]:
grouped_pct = grouped['tip_pct']

In [None]:
grouped_pct.agg('mean')

In [None]:
# If you pass a list of functions or function names instead, you get back a DataFrame with column names taken from the functions

In [None]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

In [None]:
#if you pass a list of (name,function) tuples, the first element of each tuple will be used as the DataFrame column names

In [None]:
# Each (name, function) pair names one output column. The string 'std' is used
# instead of np.std: pandas already dispatches np.std to its own grouped std,
# and newer releases warn that this substitution will stop.
grouped_pct.agg([('ele1', 'mean'), ('ele2', 'std')])

In [None]:
#With a DataFrame you can specify a list of functions to apply to all of the columns or different functions per column

In [None]:
func = ['count', 'mean', 'max']

In [None]:
# Select several columns with a list (double brackets). Indexing a GroupBy
# with a bare tuple of keys was removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(func)

In [None]:
result                    # hierarchical index in o/p

In [None]:
result['tip_pct']

In [None]:
# list of tuples with custom names

In [None]:
# (output column name, aggregation) pairs. 'var' is given by name rather than
# as np.var, which pandas maps to its own grouped variance and now warns about.
fun_tup = [('Doremon', 'mean'), ('Picachoo', 'var')]

In [None]:
# Apply the named aggregations to both columns. The columns are selected with
# a list because tuple-of-keys indexing on a GroupBy was removed in pandas 2.0.
grouped[['tip_pct', 'total_bill']].agg(fun_tup)

In [None]:
# Apply potentially different functions to one or more of the columns

we need to pass a dict to agg that contains a mapping of column names to any of the function specified

In [None]:
# Per-column aggregation: largest tip and total party size for each group.
# 'max' is passed by name; handing np.max to agg triggers a pandas
# FutureWarning because pandas substitutes its own grouped max for it.
grouped.agg({'tip': 'max', 'size': 'sum'})

A DataFrame will have hierarchical columns only if multiple functions are applied to at least one column

In [None]:
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})

# Returning Aggregated Data Without Row Indexes

the aggregated data comes back with an index, potentially hierarchical, composed from the unique group key combination

In [None]:
#disable this behavior in most cases by passing as_index=False to groupby

In [None]:
# as_index=False keeps day/smoker as ordinary columns in the result.
# numeric_only=True skips the remaining non-numeric columns, for which a mean
# is undefined (pandas >= 2.0 raises instead of dropping them).
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

# Apply: General split-apply-combine

In [None]:
# we want the top 5 values of tip_pct in each group

In [None]:
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Rows come back in ascending order of `column` (largest last).

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : label of the column to rank by (default 'tip_pct').
    """
    # tail(n) instead of [-n:]: with n=0 the slice [-0:] is [0:] and would
    # return every row rather than none.
    return df.sort_values(by=column).tail(n)

In [None]:
top(tips, n=6)

In [None]:
# group by smoker, and call apply with the above function


The top function will be called on each row group from the DataFrame, and then the results are glued together using pandas.concat, labeling the pieces with the group names. The result therefore has a hierarchical index whose inner level contains index values from the original DataFrame.

In [None]:
tips.groupby('smoker').apply(top)

If you pass a function to apply that takes other arguments or keywords, you can pass these after the function

In [None]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

# Suppressing the Group Keys



The resulting object has a hierarchical index formed from the group keys along with the indexes of each piece of the original object. You can disable this by passing group_keys=False to groupby.

In [None]:
tips.groupby('smoker', group_keys=False).apply(top)

# Quantile and Bucket Analysis


pandas has some tools, in particular cut and qcut, for slicing data up into buckets with bins of your choosing or by sample quantiles. Combining these functions with groupby makes it convenient to perform bucket or quantile analysis on a dataset

In [None]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})

In [None]:
quartiles = pd.cut(frame.data1, 4)

In [None]:
quartiles[:10]

In [None]:
# The Categorical object
# returned by cut can be
# passed directly to groupby. So we
# could compute a set of statistics for the data2 column like

In [None]:
def get_stats(group):
    """Summarise `group` as a dict keyed by 'min', 'max', 'count' and 'mean'."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each key doubles as the name of the method that computes its value.
    return {stat: getattr(group, stat)() for stat in stat_names}

In [None]:
grouped = frame.data2.groupby(quartiles)

In [None]:
grouped.apply(get_stats).unstack()

In [None]:
# Return quantile numbers, pass labels=False

In [None]:
grouping = pd.qcut(frame.data1, 10, labels=False)

In [None]:
grouped = frame.data2.groupby(grouping)

In [None]:
grouped.apply(get_stats).unstack()

# Pivot table



A pivot table is a data summarization tool frequently found in spreadsheet programs and other data analysis software. It aggregates a table of data by one or more keys, arranging the data in a rectangle with some of the group keys along the rows and some along the columns

In [None]:
# Similar to a groupby mean over day/smoker. `values` is limited to the numeric
# columns because the default aggregation (mean) is undefined for the
# non-numeric columns of tips and raises on recent pandas versions.
tips.pivot_table(index=['day', 'smoker'],
                 values=['size', 'tip', 'tip_pct', 'total_bill'])

In [None]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker')

In [None]:
# Passing margins=True adds "All" row and column labels,
# whose values are the group statistics for all the data within a single tier.
# With the default aggregation, every "All" value is a mean.

In [None]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True)

In [None]:
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                 aggfunc=len, margins=True)

In [None]:
# Replacing Null Value

In [None]:
tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],
                 columns='day', aggfunc='mean', fill_value=0)

# Cross-Tabulations: Crosstab

A cross-tabulation (or crosstab for short) is a special case of a pivot table that computes group frequencies.

In [None]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

In [None]:
# Ten rows pairing a city with a handedness label (input for the crosstab below).
# The comma after the 'City' list was missing, which made this cell a SyntaxError.
data = pd.DataFrame({
    'City': ['Hyd', 'Blore', 'Hyd', 'Blore', 'Blore', 'Blore',
             'Hyd', 'Hyd', 'Blore', 'Hyd'],
    'Handedness': ['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed',
                   'Left-handed', 'Right-handed', 'Right-handed',
                   'Left-handed', 'Right-handed', 'Right-handed'],
})

In [None]:
pd.crosstab(data.City, data.Handedness, margins=True)