## Modularized Feature Creation:

This means we have to create logic that uses the existing columns of the table to derive entirely new features (columns). This can be made modular by wrapping the logic for each feature in its own function.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType
from datetime import date

# Obtain the Spark session explicitly instead of relying on an implicit `spark`
# global. getOrCreate() returns the already-running session when the platform
# provides one, and otherwise starts a new one, so this cell also works on a
# freshly restarted plain kernel.
spark = SparkSession.builder.getOrCreate()

# Sample data: one tuple per row, in schema order (birth_year, income, last_login).
sample_data = [
    (1990, 25000, date(2026, 2, 10)),
    (1985, 45000, date(2026, 1, 15)),
    (2000, 80000, date(2026, 2, 20)),
    (1975, 32000, date(2025, 12, 25)),
    (1995, 70000, date(2026, 2, 1)),
]

# Explicit schema so column types do not depend on inference; every column is nullable.
schema = StructType([
    StructField("birth_year", IntegerType(), True),
    StructField("income", IntegerType(), True),
    StructField("last_login", DateType(), True),
])

raw_df = spark.createDataFrame(sample_data, schema)
display(raw_df)

birth_year,income,last_login
1990,25000,2026-02-10
1985,45000,2026-01-15
2000,80000,2026-02-20
1975,32000,2025-12-25
1995,70000,2026-02-01


In [0]:
# Feature creation functions

def create_age_feature(df, reference_year=None):
    """Add an ``age`` column computed as ``reference_year - birth_year``.

    Only the year of birth is used, so the result is a year difference rather
    than an exact age (it ignores whether the birthday has already passed).

    Args:
        df: DataFrame exposing a ``birth_year`` column via ``df["birth_year"]``.
        reference_year: Year to measure age against. When ``None`` (default),
            the current calendar year of the Python process is used, so the
            feature does not go stale the way a hard-coded year would.

    Returns:
        The result of ``df.withColumn("age", ...)``, i.e. ``df`` with the
        ``age`` column added.
    """
    if reference_year is None:
        # Local import keeps this cell runnable on its own, matching the
        # function-scope imports used by the other feature builders.
        from datetime import date

        reference_year = date.today().year
    return df.withColumn("age", reference_year - df["birth_year"])

In [0]:

def create_income_bucket_feature(df, low_threshold=30000, high_threshold=70000):
    """Add an ``income_bucket`` column labelling each row Low / Medium / High.

    Buckets (with the default thresholds):
        * ``income < 30000``            -> "Low"
        * ``30000 <= income < 70000``   -> "Medium"
        * ``income >= 70000``           -> "High"

    A null ``income`` matches none of the conditions and therefore yields a
    null bucket. Previously the catch-all ``otherwise("High")`` labelled
    missing incomes as "High".

    Args:
        df: DataFrame with a numeric ``income`` column.
        low_threshold: Exclusive upper bound of the "Low" bucket.
        high_threshold: Inclusive lower bound of the "High" bucket.

    Returns:
        ``df`` with the ``income_bucket`` column added.

    Raises:
        ValueError: If ``low_threshold`` is greater than ``high_threshold``.
    """
    from pyspark.sql.functions import when

    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not exceed high_threshold")

    income = df["income"]
    # Branches are evaluated in order, so the "Medium" branch only sees rows
    # that already failed `income < low_threshold`. No `.otherwise()` is used
    # on purpose: unmatched rows (null income) stay null.
    bucket = (
        when(income < low_threshold, "Low")
        .when(income < high_threshold, "Medium")
        .when(income >= high_threshold, "High")
    )
    return df.withColumn("income_bucket", bucket)

In [0]:

def create_is_active_feature(df, active_window_days=30, as_of=None):
    """Add an integer ``is_active`` flag based on how recent ``last_login`` is.

    The flag is 1 when the number of days from ``last_login`` to the reference
    date is strictly less than ``active_window_days``, else 0. A ``last_login``
    after the reference date gives a negative day difference and is therefore
    also flagged as active.

    Args:
        df: DataFrame with a ``last_login`` date column.
        active_window_days: Size of the activity window in days (default 30).
        as_of: Optional ``datetime.date`` to use as the reference date. When
            ``None`` (default) Spark's ``current_date()`` is used, which makes
            the result depend on the day the notebook is run; pass a fixed
            date for reproducible output.

    Returns:
        ``df`` with the ``is_active`` column added.
    """
    from pyspark.sql.functions import current_date, datediff, lit

    reference_date = current_date() if as_of is None else lit(as_of)
    days_since_login = datediff(reference_date, df["last_login"])
    # The boolean comparison is cast to int so the feature is stored as 0/1.
    return df.withColumn(
        "is_active",
        (days_since_login < active_window_days).cast("int"),
    )


In [0]:
def create_features(df):
    """Run every feature builder over ``df`` and return the enriched result.

    The builders are applied in a fixed order (age, income bucket, activity
    flag), each one receiving the output of the previous one.
    """
    feature_builders = (
        create_age_feature,
        create_income_bucket_feature,
        create_is_active_feature,
    )
    enriched = df
    for add_feature in feature_builders:
        enriched = add_feature(enriched)
    return enriched

In [0]:
# Build the full feature set from the raw sample frame.
features_df = create_features(raw_df)

# Cheap sanity check on column names only, so a missing or renamed feature
# fails here instead of surfacing later downstream.
expected_features = {"age", "income_bucket", "is_active"}
missing_features = expected_features - set(features_df.columns)
assert not missing_features, f"create_features did not add: {sorted(missing_features)}"

display(features_df)

birth_year,income,last_login,age,income_bucket,is_active
1990,25000,2026-02-10,36,Low,1
1985,45000,2026-01-15,41,Medium,0
2000,80000,2026-02-20,26,High,1
1975,32000,2025-12-25,51,Medium,0
1995,70000,2026-02-01,31,High,1
