# DataFrame: Manipulation

In [1]:
import pandas as pd

In [None]:
def generate_data():
    """Build a small example DataFrame of ids and their scores."""
    ids = ["a", "b", "c", "d"]
    points = [4, 6, 9, 8]
    # Column order follows the dict insertion order: "id" first, then "score".
    return pd.DataFrame({"id": ids, "score": points})

# Build the example data and display it as the cell output.
scores = generate_data()
scores

## Creating New Columns

In [None]:
# Assign in-place using brackets [...].
# Note: Changes the original DataFrame!
# The comparison yields a boolean column: True where score is above 5.5.
scores["passed"] = scores["score"] > 5.5
scores

In [None]:
# Using assign() returns a new DataFrame with the extra column;
# `scores` itself is left untouched because the result is not stored.
scores.assign(
    # Translate the boolean "passed" values into readable labels.
    label=scores["passed"].replace({True: "Passed", False: "Failed"})
)

In [None]:
# Original data has not changed!
# There is no "label" column here: assign() returned a new DataFrame
# and that result was never stored back into `scores`.
scores

In [7]:
# Reset the data: rebuild from scratch so the "passed" column
# added with bracket assignment is gone again.
scores = generate_data()

In [None]:
# Performing multiple assignments in a single assign() call.
# The keyword arguments are assigned in the order they are given.
(
    scores
    .assign(

        # Assign to existing or new column.
        # These values are computed from the original `scores` before
        # assign() is even called.
        id=scores["id"].str.upper(),
        passed=scores["score"] > 5.5,

        # Compute label using "passed" column.
        # Note: Column "passed" is NOT in the original DataFrame!
        # A callable is invoked with the DataFrame being built, so it can
        # see the "passed" column assigned just above.
        label=lambda df: df["passed"].replace({True: "Passed", False: "Failed"}),

    )
)

## More assign

In [None]:
# Small numeric DataFrame with two columns, used by the examples that follow.
df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
df

In [9]:
# Plain dict with string keys, so it can be unpacked into keyword arguments.
values = {"A": 1, "B": 2, "C": 3}

In [None]:
def show_me(A, B, C):
    """Print the three arguments on one line, separated by spaces."""
    print(A, B, C)

# Use ** to unpack the dict into argument=value pairs.
# Equivalent to calling show_me(A=1, B=2, C=3).
show_me(**values)

In [None]:
# Each dict key becomes a new column name; each scalar value is
# repeated for every row of the returned DataFrame.
df.assign(**values)

In [None]:
# Create a dict with assignment statements:
# new column name -> computed Series, e.g. "x_2" -> df["x"] ** 2.
polynomials = {}
for column in df.columns:
    for power in (2, 3, 4):
        # The Series is computed right here, inside the loop.
        polynomials[f"{column}_{power}"] = df[column] ** power

# Assign all at once.
df.assign(**polynomials)

In [None]:
# Create a dict with assignment statements.
# This cell intentionally demonstrates a pitfall; see the comments below.
polynomials = {}
for column in df.columns:
    for power in (2, 3, 4):
        # Using a lambda function...
        # Pitfall: `column` and `power` are looked up when the lambda is
        # *called*, not when it is created.
        polynomials[f"{column}_{power}"] = lambda df: df[column] ** power

# Uh-oh, that does not look good...
# The lambdas only run inside assign(), after both loops have finished, so
# every new column is computed from the final values of `column` and `power`.
df.assign(**polynomials)

In [None]:
# Create a dict with assignment statements.
polynomials = {}
for column in df.columns:
    for power in (2, 3, 4):
        # Provide defaults for the additional arguments.
        # Default values are evaluated when the lambda is created, so each
        # lambda keeps its own `c` and `p` from the current iteration.
        polynomials[f"{column}_{power}"] = lambda df, c=column, p=power: df[c] ** p

# Much better!
df.assign(**polynomials)

## Applying Functions

In [None]:
# Example names with inconsistent casing; replaces the numeric `df` above.
df = pd.DataFrame({
    "name": ["john", "JANE", "Jack"],
    "lastname": ["doe", "DOE", "DOE"],
})
df

In [None]:
# Can use map to apply a function to all DataFrame cells.
# Note: DataFrame.map requires pandas 2.1 or newer; older versions
# provide the same element-wise behaviour as DataFrame.applymap.
df[["name", "lastname"]].map(lambda v: v.upper())

In [None]:
# A single column is a Series, so Series.map applies the function
# to each value of that column.
df["name"].map(str.capitalize)

In [None]:
# Use apply() to apply a function to a row.
# Note: Use axis=1 to get rows.
# print() returns None, so the result only holds None values; it is
# assigned to `_` to keep it from being shown as the cell output.
_ = df.apply(
    lambda row: print(" ".join(row)),
    axis=1
)

In [56]:
def make_fullname(person, case):
    """Generate full name for a person.

    Args:
        person: Mapping-like object (for example a DataFrame row) holding
            string values under the keys "name" and "lastname".
        case: "capitalize" or "lower" to change the casing of the result;
            any other value leaves the full name unchanged.

    Returns:
        The name and lastname joined by a single space, with the requested
        casing applied.
    """
    fullname = person["name"] + " " + person["lastname"]

    if case == "capitalize":
        return fullname.capitalize()
    if case == "lower":
        return fullname.lower()
    # Unrecognised case: return the string as-is. (It must not be called;
    # a str is not callable and doing so raises TypeError.)
    return fullname


In [None]:
# Function arguments can be provided through apply().
df.apply(
    make_fullname,
    axis=1,

    # Arguments for make_fullname
    case="capitalize",
)

### Method Chaining

In [None]:
# Show the current (unmodified) names DataFrame for reference.
df

In [None]:
# Method chaining can produce cleaner code!
# Each step returns a new DataFrame that feeds the next call; `df` itself
# is not modified.
(
    df
    .assign(
        # Normalise the casing of both name columns.
        name=df["name"].str.capitalize(),
        lastname=df["lastname"].str.capitalize(),
        # The lambda receives the DataFrame with the columns above already
        # replaced, so the full name is built from the capitalized values.
        full_name=lambda df: df["name"] + " " + df["lastname"],
    )
    .sort_values(["lastname", "name"])
    # Drop the old index so rows are numbered in their sorted order.
    .reset_index(drop=True)
)