In [1]:
import pandas as pd

# Load the semicolon-delimited training data into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv", delimiter=";")

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Add a list-valued column built from 'duration':
# each row holds [duration, duration / 2].
durations = df["duration"].tolist()
df["annual_duration"] = [[value, value / 2] for value in durations]

In [5]:
# Add a tuple-valued column built from 'campaign' (the number of contacts
# performed during this campaign to the client):
# each row holds (campaign, campaign squared).
campaign_counts = df["campaign"].tolist()
df["campaign_limit"] = [(count, count**2) for count in campaign_counts]

In [None]:
# Group by 'marital' and build one summary row per group.
def _summarise_marital_group(group):
    """Build the summary Series for a single 'marital' group."""
    # Flatten the per-row [duration, duration / 2] lists into one flat list.
    flattened_durations = []
    for pair in group["annual_duration"].tolist():
        flattened_durations.extend(pair)

    # Concatenate the string form of every (campaign, campaign**2) tuple.
    joined_limits = "".join(map(str, group["campaign_limit"].tolist()))

    return pd.Series(
        {
            "balance_max": group["balance"].max(),
            "age_mean": group["age"].mean(),
            "annual_duration_flat": flattened_durations,
            "campaign_limit_concat": joined_limits,
        }
    )


result = df.groupby("marital").apply(
    _summarise_marital_group,
    include_groups=False,
)

### Using a function

In [12]:
from pandas import DataFrame, Series

In [13]:
def max_customer_account_balance(df: DataFrame) -> DataFrame:
    """Return a copy of ``df`` with a ``balance_max`` column added.

    Every row of the new column holds the maximum of the ``balance`` column.
    The input frame is left untouched, so the function can be used in a
    ``DataFrame.pipe`` chain and re-run without side effects on the caller.

    Args:
        df: Frame containing a ``balance`` column.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``balance_max``.

    Raises:
        KeyError: If ``df`` has no ``balance`` column.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(balance_max=df["balance"].max())

In [14]:
def customers_mean_age(df: DataFrame) -> DataFrame:
    """Return a copy of ``df`` with an ``age_mean`` column added.

    Every row of the new column holds the mean of the ``age`` column.
    The input frame is left untouched, so the function can be used in a
    ``DataFrame.pipe`` chain and re-run without side effects on the caller.

    Args:
        df: Frame containing an ``age`` column.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``age_mean``.

    Raises:
        KeyError: If ``df`` has no ``age`` column.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(age_mean=df["age"].mean())

In [None]:
# Chain both enrichment steps: mean age first, then maximum balance.
(
    df
    .pipe(customers_mean_age)
    .pipe(max_customer_account_balance)
)

### Unit testing

In [24]:
# Build a small sample frame and look at what .iloc[0] returns for it.
sample_balances = DataFrame(data={"balance": [100, 200, 300, 400, 500]})
sample_balances.iloc[0]

balance    100
Name: 0, dtype: int64

In [20]:
def test_max_customer_account_balance():
    """Test the max_customer_account_balance function.

    Builds a five-row frame of known balances and checks that the function
    adds a ``balance_max`` column whose every row equals the largest balance.
    """
    test_df = DataFrame(data={"balance": [100, 200, 300, 400, 500]})
    test_df = max_customer_account_balance(test_df)
    expected = 500
    # Check the column exists before reading it, so a missing column fails
    # as an assertion instead of a KeyError.
    assert "balance_max" in test_df.columns
    # The maximum is repeated on every row, so check all rows, not just the first.
    assert (test_df["balance_max"] == expected).all()

In [21]:
# Run the test: it finishes silently when every assertion holds.
test_max_customer_account_balance()

We didn't get any error, so the test passed. If we put any other number in `expected`, we will get an `AssertionError`, meaning the test failed.

In [32]:
def test_customers_mean_age():
    """Test the customers_mean_age function.

    Builds a five-row frame of known ages and checks that the function adds
    an ``age_mean`` column whose every row equals the mean of those ages.
    """
    test_df = DataFrame(data={"age": [30, 40, 50, 60, 70]})
    test_df = customers_mean_age(test_df)
    expected = 50
    # Check the column exists before reading it, so a missing column fails
    # as an assertion instead of a KeyError.
    assert "age_mean" in test_df.columns
    # The mean is repeated on every row, so check all rows, not just the first.
    assert (test_df["age_mean"] == expected).all()

In [33]:
# Run the test: it finishes silently when every assertion holds.
test_customers_mean_age()

In [35]:
def gb_customers_mean_age(df: DataFrame) -> float:
    """Return the mean of the ``age`` column as a single value.

    Meant to be called on each group frame inside ``groupby(...).apply``.
    It returns a scalar and does not modify the frame it receives.

    Args:
        df: Frame (typically one group) containing an ``age`` column.

    Returns:
        The mean of ``df["age"]``; NaN when the frame has no rows.

    Raises:
        KeyError: If ``df`` has no ``age`` column.
    """
    # Compute the aggregate directly; no helper column is written to the group.
    return df["age"].mean()

In [36]:
def gb_max_customer_account_balance(df: DataFrame) -> float:
    """Return the maximum of the ``balance`` column as a single value.

    Meant to be called on each group frame inside ``groupby(...).apply``.
    It returns a scalar and does not modify the frame it receives.

    Args:
        df: Frame (typically one group) containing a ``balance`` column.

    Returns:
        The maximum of ``df["balance"]``; NaN when the frame has no rows.

    Raises:
        KeyError: If ``df`` has no ``balance`` column.
    """
    # Compute the aggregate directly; no helper column is written to the group.
    return df["balance"].max()

In [37]:
# Per 'marital' group, compute the aggregates with the helper functions.
# Selecting only the columns the helpers read keeps the grouping column out
# of each group frame, which avoids the pandas >= 2.2 DeprecationWarning
# about apply operating on the grouping columns.
df.groupby("marital")[["balance", "age"]].apply(
    lambda group: pd.Series(
        {
            "balance_max": gb_max_customer_account_balance(group),
            "age_mean": gb_customers_mean_age(group),
        }
    ),
)

Unnamed: 0_level_0,balance_max,age_mean
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,66721.0,45.782984
married,98417.0,43.408099
single,102127.0,33.70344
