# DataFrame: Manipulation

In [None]:
import datetime as dt

import numpy as np
import pandas as pd

## Creating Columns

In [None]:
def generate_data():
    """Build a small example DataFrame with an ``id`` and a ``score`` column."""
    ids = ["a", "b", "c", "d"]
    marks = [4, 6, 9, 8]
    return pd.DataFrame({"id": ids, "score": marks})

# Build the example DataFrame and display it.
scores = generate_data()
scores

In [None]:
# Bracket assignment writes the new column onto `scores` itself.
# Note: This modifies the DataFrame in place!
scores["passed"] = scores["score"].gt(5.5)
scores

In [None]:
def number_passed(df):
    """Count the rows whose score is above 5.5.

    Caution: a boolean ``passed`` column is written onto ``df`` itself,
    so the caller's DataFrame is modified as a side effect.
    """
    passed = df["score"] > 5.5
    df["passed"] = passed
    return passed.sum()


# Generate data
scores = generate_data()

# User just wants to know how many people passed.
# The count is interpolated into the f-string so the message has a single
# space after the colon.
print(f"Number passed: {number_passed(scores)}")

# But DataFrame has changed (unexpectedly): number_passed() also wrote a
# `passed` column onto `scores`.
scores

In [None]:
# Start again from fresh example data.
scores = generate_data()

# assign() hands back a new DataFrame that includes the extra column.
scores.assign(passed=scores["score"].gt(5.5))

In [None]:
# Original DataFrame is left unchanged: assign() returned a new DataFrame
# instead of adding a column to `scores`.
scores

In [None]:
# Several columns can be created in a single assign() call.
# Note: Use a lambda when a column was created in a previous step,
# because that column does not exist on `scores` itself.
scores.assign(
    # `score` already exists on `scores`, so it can be referenced directly.
    passed=scores["score"] > 5.5,
    # `passed` only exists on the intermediate frame handed to the lambda.
    passed_label=lambda frame: frame["passed"].replace(
        {True: "Passed", False: "Failed"}
    ),
)

## Applying Functions

In [None]:
# Example people data with inconsistent casing and one missing age (None).
df = pd.DataFrame(
    data={
        "name": ["john", "JANE", "Jack"],
        "lastname": ["doe", "DOE", "DOE"],
        "age": [45, 26, None],
    },
)
df

In [None]:
# Using functions on columns is identical to Series:
# df["name"] is a Series, and map() calls str.capitalize on each value.
df["name"].map(str.capitalize)

In [None]:
def print_info(row):
    """Print the type, index and values of ``row``, followed by a divider."""
    describers = (
        ("Type:   ", type),
        ("Index:  ", lambda obj: obj.index),
        ("Values: ", lambda obj: obj.values),
    )
    # Each field is looked up right before it is printed.
    for label, describe in describers:
        print(label, describe(row))
    print("-" * 60)

In [None]:
# Use apply() to apply a function to a row.
# Note: Use axis=1 to get rows; print_info is then called once per row.
df.apply(print_info, axis=1)

In [None]:
def fullname(person, case="capitalize"):
    """Generate full name for a person.

    Parameters
    ----------
    person : mapping or pandas.Series
        Record with string values under the keys ``"name"`` and
        ``"lastname"``.
    case : str, default "capitalize"
        Name of the ``str`` method applied to both names, e.g. ``"upper"``,
        ``"lower"`` or ``"title"``. Names that are not ``str`` attributes
        fall back to ``str.capitalize``.

    Returns
    -------
    str
        The transformed name and last name joined by a single space.
    """
    # The fallback must be the method itself, not its name: a plain string
    # is not callable and would fail on the calls below.
    transform = getattr(str, case, str.capitalize)

    name = transform(person["name"])
    lastname = transform(person["lastname"])

    return f"{name} {lastname}"

In [None]:
# Extra keyword arguments given to apply() are passed on to the function.
df.apply(fullname, axis="columns", case="upper")

In [None]:
# Use DataFrame.map() to apply a function to all cells in a DataFrame.
# Note: map() is the pandas >= 2.1 name for the deprecated applymap().
df.map(
    lambda v: v.capitalize() if hasattr(v, "capitalize") else v
)

### Method Chaining

In [None]:
# Using method chaining produces clean code
(
    df
    .fillna({"age": -1})
    # DataFrame.map() is the pandas >= 2.1 name for the deprecated applymap().
    .map(
        lambda v: v.capitalize() if hasattr(v, "capitalize") else v
    )
    # The lambda receives the frame produced by the previous steps, so the
    # chain never reaches back to the original `df`.
    .assign(fullname=lambda people: people.apply(fullname, axis=1))
    .sort_values("age")
)