In [None]:
import numpy as np
import pandas as pd
import datetime as dt

from pprint import pprint
from utils import record_factory

In [None]:
# Create sample sales data: n records, one DataFrame row per record.
n=100

# record_factory comes from the local utils module. It is consumed with next(),
# so it returns an iterator (presumably one dict-like record per call — verify in utils).
record = record_factory()
sales = pd.DataFrame(data=[next(record) for _ in range(n)])
sales.head(5)

## Creating columns

In [None]:
# Ugly, assigns column in place: this mutates `sales` itself,
# so every later cell sees the new avg_order_amount column.
sales["avg_order_amount"] = sales["order_amount"] / sales["orders"]
sales.head(5)

In [None]:
# Using assign method instead.
# Supply <column name>=<column logic> as arguments; the logic can also be
# a function that receives the DataFrame and returns the new column.
sales.assign(
    avg_order_amount=lambda df: df["order_amount"] / df["orders"],
    age=lambda df: (dt.date.today() - df["birthdate"]).dt.days,
)

In [None]:
# No in-place modifications; the age column is not in the original DataFrame,
# because .assign() returns a new DataFrame and that result was not stored.
sales.columns

In [None]:
# Can also provide arguments using a dict.
# Sometimes useful when automatically generating columns in a loop.
# Keys are the new column names, values the column contents.
assignments = {
    "avg_order_amount": sales["order_amount"] / sales["orders"],
    "age": (dt.date.today() - sales["birthdate"]).dt.days,   
}

# Use **{...} to unroll the dict into arguments.
# The result is stored back into `sales`, so both columns exist from here on.
sales = sales.assign(**assignments)
sales.head()

### Conditional Assignment

In [None]:
# Ugly in-place assignment...
# Start with "medium" for every row, then overwrite the rows matching each
# condition using .loc[<row mask>, <column>].
sales["spender_type"] = "medium"
sales.loc[sales["order_amount"] > 20, "spender_type"] = "big"
sales.loc[sales["order_amount"] < 10, "spender_type"] = "small"

sales.head()

In [None]:
# Uh-oh, the dreaded "SettingWithCopyWarning" (because we didn't use .loc[]).
# This is chained indexing: sales[<mask>] is evaluated first and the assignment
# targets that intermediate result. Use sales.loc[<mask>, "spender_type"] = ... instead.
sales[sales["order_amount"] > 20]["spender_type"] = "big"

In [None]:
# Let's get rid of the column.
# errors="ignore" keeps this cell idempotent: re-running it when the column
# is already gone is a no-op instead of a KeyError.
sales = sales.drop(columns="spender_type", errors="ignore")

In [None]:
# Better solution using .assign() and np.where(<cond>, <true statement>, <false statement>).
# A second np.where in the <false statement> slot adds the "medium" tier, so the
# result matches the three-way small / medium / big classification
# (< 10, 10-20, > 20) instead of labelling everything up to 20 as "small".
(
    sales
    .assign(
        spender_type=np.where(
            sales["order_amount"] > 20,
            "big",
            np.where(sales["order_amount"] < 10, "small", "medium"),
        )
    )
)

In [None]:
# Or simply use a function for conditional assignment.
# Also allows you to document your classification in the docstring!
def classify_spenders(amount, small_below=10, big_above=20):
    """Classify a single order amount as "small", "medium" or "big".

    Parameters
    ----------
    amount : number
        Order amount to classify.
    small_below : number, default 10
        Amounts strictly below this value are "small".
    big_above : number, default 20
        Amounts strictly above this value are "big".

    Returns
    -------
    str
        "small", "big", or "medium" for everything in between. Both
        thresholds themselves count as "medium", and so does any value
        that compares False against both thresholds (e.g. NaN).
    """
    if amount < small_below:
        return "small"
    if amount > big_above:
        return "big"
    return "medium"

In [None]:
# .map() calls the classification function once per value of the column.
# Note: if your function needs multiple columns; look into the .apply() method.
(
    sales
    .assign(
        spender_type=lambda df: df["order_amount"].map(classify_spenders)
    )
)

## Sorting

In [None]:
# Sort rows: top 10 customers.
# Note: due to the brackets () we can "chain" methods; one line for each operation.
(
    sales
    .sort_values("order_amount", ascending=False)
    .loc[:, ["first_name", "last_name", "city", "order_amount"]]
    .head(10)  # .head() defaults to 5 rows; ask for the 10 we promised
)

In [None]:
# Sort on multiple columns using a list of column names.
# Use a list to specify the sort order for each column:
# here city ascending, and within each city order_amount descending.
(
    sales
    .sort_values(["city", "order_amount"], ascending=[True, False])
    .loc[:, ["first_name", "last_name", "city", "order_amount"]]
)

In [None]:
# Sort by index instead of columns; ascending=False puts the highest index label first.
sales.sort_index(ascending=False)

## Grouping

In [None]:
# The .groupby() method creates a DataFrameGroupBy object.
# Nothing is aggregated yet; that happens once a method such as .mean() or .agg() is called on it.
sales.groupby(["city", "gender"])

In [None]:
# .groups is basically a mapping from each group key (here a (city, gender) tuple)
# to the index labels of the rows that belong to that group.
pprint(
    sales.groupby(["city", "gender"]).groups
)

In [None]:
# Can apply aggregation methods directly to the DataFrameGroupBy object.
# numeric_only=True restricts the mean to numeric columns. Since pandas 2.0 the
# default is False, and the text columns (first_name, last_name, ...) would
# raise a TypeError.
sales_agg = sales.groupby(["city", "gender"]).mean(numeric_only=True)
sales_agg

In [None]:
# By default, .groupby() creates a (multi-) index...
# This can be tricky to use...
# A one-element tuple selects on the first index level (city) only,
# returning every gender row for "Amsterdam".
sales_agg.loc[("Amsterdam", ), :]

In [None]:
# Ugh... selecting on the second index level only:
# slice(None) means "every city", "male" picks the value on the gender level.
sales_agg.loc[(slice(None), "male"), :]

In [None]:
# Use as_index=False to avoid automatic index creation!
# city and gender stay regular columns in the result.
# numeric_only=True restricts the mean to numeric columns. Since pandas 2.0 the
# default is False, and the text columns would raise a TypeError.
(
    sales
    .groupby(["city", "gender"], as_index=False)
    .mean(numeric_only=True)
)

### Using agg / aggregate

In [None]:
# Use .agg() to specify column + aggregation method.
# Dict keys are column names, values the aggregation to apply to that column;
# columns that are not listed are left out of the result.
(
    sales
    .groupby(["city", "gender"], as_index=False)
    .agg({
        "age": "mean",
        "order_amount": "mean",
    })
)

In [None]:
# Use a list to specify multiple aggregations for a single column.
# Note: this replaces the earlier sales_agg with a new result.
sales_agg = (
    sales
    .groupby(["city", "gender"], as_index=False)
    .agg({
        "age": "mean",
        "order_amount": ["mean", "min", "max"],
    })
)
sales_agg

In [None]:
# Yikes... another MultiIndex, but now on the columns...
# Each column label is now a tuple of (column name, aggregation).
sales_agg.columns

In [None]:
# Getting rid of it using set_axis, which replaces all column labels at once.
# Note: not a very flexible solution... the hard-coded names must match the
# number and order of the columns in sales_agg exactly.
(
    sales_agg
    .set_axis(
        ["city", "gender", "age_mean", "order_amount_mean", "order_amount_min", "order_amount_max"],
        axis=1
    )
)

In [None]:
# Can use a helper function to make it more flexible...
def collapse_levels(df, sep="_"):
    """Collapse a column MultiIndex into flat, single-level column names.

    Each column label (a tuple with one entry per level) is turned into a
    single string by joining its non-empty levels with ``sep``, e.g.
    ``("order_amount", "mean") -> "order_amount_mean"`` and
    ``("city", "") -> "city"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be flattened. It is not modified.
    sep : str, default "_"
        Separator placed between the levels.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with flat column names. A frame whose columns are
        not a MultiIndex is returned as an unchanged copy.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        # Joining a plain string label would split it into characters
        # ("city" -> "c_i_t_y"), so flat columns are left untouched.
        return df.copy()

    colnames = [
        # str() lets non-string levels (e.g. integers) take part in the join;
        # empty levels are skipped rather than stripped afterwards, so
        # underscores that belong to a real name survive.
        sep.join(str(level) for level in levels if str(level) != "")
        for levels in df.columns
    ]
    return df.set_axis(colnames, axis=1)


# Using .pipe() to apply the function to the entire DataFrame.
# .pipe(f) calls f(<the DataFrame>), so a helper can be used inside a method chain.
(
    sales_agg
    .pipe(collapse_levels)
)

In [None]:
# New style syntax for aggregations, "named aggregation" (pandas >= 0.25).
# Note: more similar to the assign syntax:
# <output column name>=(<input column>, <aggregation>).
# The result has flat column names, so no column MultiIndex to clean up.
(
    sales
    .groupby(["city", "gender"], as_index=False)
    .agg(
        age_mean=("age", "mean"),
        order_amount_mean=("order_amount", "mean"),
        order_amount_min=("order_amount", "min"),
        order_amount_max=("order_amount", "max"),
    )
)

### Custom aggregation functions

In [None]:
def big_spenders(amount, threshold=20):
    """
    Number of entries in `amount` that are strictly greater than `threshold`.
    Note: expects a Series as input.
    """
    above_threshold = amount > threshold
    return len(amount[above_threshold])

In [None]:
# Apply custom aggregation to the grouped DataFrame.
# Named aggregation accepts a function as well as a string: big_spenders is
# called with the order_amount values of each (city, gender) group.
(
    sales
    .groupby(["city", "gender"])
    .agg(
        big_spenders=("order_amount", big_spenders)
    )
)

In [None]:
# Apply custom aggregation with parameters using a lambda function.
# The lambda passes threshold=10 instead of the default of 20.
(
    sales
    .groupby(["city", "gender"])
    .agg(
        big_spenders=("order_amount", lambda s: big_spenders(s, 10))
    )
)

### Using transform

The transform method performs an aggregation "in the background" and maps the result onto the original index values. It thus creates a Series or DataFrame with the same index (and number of rows) as the one it was created from. This makes it easy to merge it with the original DataFrame. Note that it will (most likely) introduce duplicated values.

In [None]:
# Create a Series with sales totals per city.
# Note the index matches the sales DataFrame (one value per original row).
# Also note the duplicate values for records of the same city.
(
    sales
    .groupby("city")
    ["order_amount"]
    .transform("sum")
)

In [None]:
# Using .assign() we can easily incorporate the city totals in the sales DataFrame.
# Group by "city" only, so that city_total really is the total per city
# (grouping by city and gender would give a per-city-per-gender total instead).
(
    sales
    .assign(
        city_total=sales.groupby("city")["order_amount"].transform("sum")
    )
    .sort_values(["city", "gender"])
    .head()
)

In [None]:
# Using the city total to calculate which percentage a customer contributed.
# Group by "city" only, so the percentages of all customers in a city add up to 100.
# The lambdas receive the DataFrame as built so far, which is why
# city_percentage can use the city_total column created one line earlier.
(
    sales
    .assign(
        city_total=lambda df: df.groupby("city")["order_amount"].transform("sum"),
        city_percentage=lambda df: 100 * df["order_amount"] / df["city_total"],
    )
    .head()
)

### Using apply

In [None]:
# Function to compute percentage contributed in a different way.
# Note that this function will receive a DataFrame for each group.

def group_percentage(df):
    """Add each row's percentage share of the group's total order_amount.

    `df` holds the rows of a single group. The returned copy carries an extra
    city_percentage column; the input frame is left unchanged.
    """
    group_total = df["order_amount"].sum()
    share = 100 * df["order_amount"] / group_total
    return df.assign(city_percentage=share)

In [None]:
# Using .apply() to apply the function to each group.
# Here the groups are (city, gender) combinations, so the percentage is
# relative to the total of that city/gender group.
# Note: we have a MultiIndex again... can drop it as the columns are also in the data.
# NOTE(review): recent pandas versions change how .apply() treats the grouping
# columns — confirm this cell against the pandas version in use.
(
    sales
    .groupby(["city", "gender"])
    .apply(group_percentage)
    .sort_values("id")
    .head()
)

## Join, Merge, and Concat

In [None]:
# Toy data sets: the indices overlap on 1 and 2 only
# (3 exists only in a, 4 only in b).
a = pd.DataFrame({"label_a": list("ABC")}, index=[1, 2, 3])
b = pd.DataFrame({"label_b": list("ABD")}, index=[1, 2, 4])

In [None]:
# Join uses the indices to join two DataFrames.
# Note: defaults to LEFT join, so every row of a is kept and
# label_b is NaN where b has no matching index (3).
a.join(b)

In [None]:
# Available joins: "left", "right", "inner", "outer"
# "outer" keeps the union of both indices (1, 2, 3 and 4).
a.join(b, how="outer")

In [None]:
# Avoid duplicate column names when using .join()!
a["x"] = [1, 2, 3]
b["x"] = [1, 2, 3]

# Both frames now have a column x, so a plain join raises a ValueError.
# Catch and show it, so the demonstration does not stop a "Run All".
try:
    a.join(b)
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Or use the lsuffix / rsuffix arguments to resolve duplicate names.
# The suffixes are appended to the overlapping column: x_left and x_right.
a.join(b, lsuffix="_left", rsuffix="_right")

### Merge

In [None]:
# Merge resembles join, but uses columns instead of indices. It is also a bit more user-friendly!

In [None]:
# Toy data sets for merge: the key is now a regular column x instead of the index.
# x overlaps on 1 and 2 only (3 exists only in a, 4 only in b).
a = pd.DataFrame({"x": [1, 2, 3], "label_a": list("ABC")})
b = pd.DataFrame({"x": [1, 2, 4], "label_b": list("ABD")})

In [None]:
# Simple merge, automatically finds and uses shared columns (here: x).
# Note: defaults to INNER join (cf. LEFT join from the join method),
# so only the rows with x in both frames (1 and 2) remain.
a.merge(b)

In [None]:
# Specify join column with the on argument and join type with the how argument.
# "outer" keeps every x from both frames and fills the missing labels with NaN.
a.merge(b, on="x", how="outer")

In [None]:
# Merge on different columns in the left and right DataFrame.
# Note: merge conveniently uses automatic suffixes! Column x exists in both
# frames but is not a join key here, so it comes back as x_x (left) and x_y (right).
a.merge(b, left_on="label_a", right_on="label_b", how="outer")

### Concat

In [None]:
# Toy data sets for concat: different column names, both with the default index 0..2.
a = pd.DataFrame({"label_a": list("ABC")})
b = pd.DataFrame({"label_b": list("ABD")})

In [None]:
# Concat resembles the SQL UNION ALL statement; it stacks the rows of the given
# DataFrames without removing duplicates.
# Columns are matched by name: a and b share none, so each row is NaN in the
# other frame's column. The original index labels are kept as well.
pd.concat([a, b])

In [None]:
# Use axis=1 to concatenate along column axis:
# the frames are placed side by side, with rows matched on the index.
pd.concat([a, b], axis=1)

In [None]:
# Concat respects indices, note the introduced NaN values.
# Index 3 exists only in a and index 4 only in b, so the other frame's
# column is NaN on those rows.
a = pd.DataFrame({"label_a": list("ABC")}, index=[1, 2, 3])
b = pd.DataFrame({"label_b": list("ABD")}, index=[1, 2, 4])

pd.concat([a, b], axis=1)