In [10]:
from pyspark.sql import SparkSession
import os
# Set on the driver before pyspark.pandas is imported below.
# NOTE(review): presumably this silences the pyspark.pandas warning about
# PYARROW_IGNORE_TIMEZONE being unset with recent pyarrow; confirm against
# the installed PySpark version.
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
import pyspark.pandas as ps

# Create (or reuse) the Spark session for this demo. ANSI SQL mode is switched
# off and PYARROW_IGNORE_TIMEZONE is forwarded to the executors' environment.
spark = (
    SparkSession.builder
    .appName("Transform and Apply in Pandas API on Spark")
    .config("spark.sql.ansi.enabled", "false")
    .config("spark.executorEnv.PYARROW_IGNORE_TIMEZONE", "1")
    .getOrCreate()
)

# Allow operations that combine pandas-on-Spark objects from different frames.
ps.set_option("compute.ops_on_diff_frames", True)

# Five-row sample data set used by every step below.
ps_df = ps.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "name": ["Alice", "Bob", "Charlie", "David", "Emma"],
        "age": [25, 30, 35, 40, 45],
        "salary": [50000, 60000, 75000, 80000, 120000],
    }
)

print("Original Pandas-on-Spark DataFrame:")
print(ps_df)

# Age 10 years from now, derived from the "age" column via Series.transform.
ps_df["age_in_10_years"] = ps_df["age"].transform(lambda age: age + 10)

# Salary categories
def categorize_salary(salary) -> str:  # return type hint
    """Return the salary band for a single salary value.

    Salaries below 60000 are "Low", salaries below 100000 are "Medium",
    and everything else is "High".
    """
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((60000, "Low"), (100000, "Medium")):
        if salary < upper_bound:
            return label
    return "High"

# Apply categorize_salary to every value of the "salary" column and store the
# resulting labels in a new column.
ps_df["salary_category"] = ps_df["salary"].apply(categorize_salary)

# Function that combines name and age
def format_row(row) -> str:
    """Return "<name> (<age> years old)" built from one row.

    ``row`` only needs to support lookup by the keys 'name' and 'age'.
    """
    name = row['name']
    age = row['age']
    return f"{name} ({age} years old)"

# axis=1 makes apply call format_row once per row; the returned strings are
# stored in a new column.
ps_df["name_with_age"] = ps_df.apply(format_row, axis=1)

# Show the frame with the derived columns added.
print("\nDataFrame final:")
print(ps_df)

# Stop the Spark session now that the script is done with it.
spark.stop()

Original Pandas-on-Spark DataFrame:
   id     name  age  salary
0   1    Alice   25   50000
1   2      Bob   30   60000
2   3  Charlie   35   75000
3   4    David   40   80000
4   5     Emma   45  120000

DataFrame final:
   id     name  age  salary  age_in_10_years salary_category           name_with_age
0   1    Alice   25   50000               35             Low    Alice (25 years old)
1   2      Bob   30   60000               40          Medium      Bob (30 years old)
2   3  Charlie   35   75000               45          Medium  Charlie (35 years old)
3   4    David   40   80000               50          Medium    David (40 years old)
4   5     Emma   45  120000               55            High     Emma (45 years old)
