In [3]:
from pyspark.sql import SparkSession
import os
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
import pyspark.pandas as ps

# Build (or reuse) the Spark session used by the pandas API on Spark.
# ANSI SQL mode is explicitly disabled, and PYARROW_IGNORE_TIMEZONE is also
# set in the executor environment (the driver-side value is set before
# pyspark.pandas is imported).
spark = (
    SparkSession.builder
    .appName("Pandas API on Spark")
    .config("spark.sql.ansi.enabled", "false")
    .config("spark.executorEnv.PYARROW_IGNORE_TIMEZONE", "1")
    .getOrCreate()
)

try:
    # 1. Create a pandas-on-Spark DataFrame
    ps_df = ps.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "name": ["Alice", "Bob", "Charlie", "David", "Emma"],
        "age": [25, 30, 35, 40, 45],
        "salary": [50000, 60000, 75000, 80000, 120000],
    })

    print("Pandas-on-Spark DataFrame:")
    print(ps_df)

    # 2. Perform pandas-style operations on Spark
    print("\nAverage Age:", ps_df["age"].mean())

    print("\nSummary Statistics:")
    print(ps_df.describe())

    # 3. Derive a new column: salary after a 10% increment
    ps_df["salary_after_increment"] = ps_df["salary"] * 1.1
    print("\nDataFrame after Salary Increment:")
    print(ps_df)

    # 4. Filtering with a boolean mask (same syntax as pandas)
    filtered_ps_df = ps_df[ps_df["age"] > 30]
    print("\nFiltered DataFrame (age > 30):")
    print(filtered_ps_df)

    # 5. Convert the pandas-on-Spark DataFrame to a Spark DataFrame.
    # Option A: preserve the index by exporting it as a column named "index".
    spark_df = ps_df.to_spark(index_col="index")
    print("\nConverted Spark DataFrame (with index):")
    spark_df.show()

    # 6. Convert the Spark DataFrame back to a pandas-on-Spark DataFrame.
    # Passing the same index_col restores the exported "index" column as the
    # index; without it the column would stay a regular data column and a new
    # default index would be attached.
    ps_df_from_spark = spark_df.pandas_api(index_col="index")
    print("\nReconverted Pandas-on-Spark DataFrame:")
    print(ps_df_from_spark)
finally:
    # Always release the Spark session, even if one of the steps above raised.
    spark.stop()

Pandas-on-Spark DataFrame:
   id     name  age  salary
0   1    Alice   25   50000
1   2      Bob   30   60000
2   3  Charlie   35   75000
3   4    David   40   80000
4   5     Emma   45  120000

Average Age: 35.0

Summary Statistics:
             id        age        salary
count  5.000000   5.000000       5.00000
mean   3.000000  35.000000   77000.00000
std    1.581139   7.905694   26832.81573
min    1.000000  25.000000   50000.00000
25%    2.000000  30.000000   60000.00000
50%    3.000000  35.000000   75000.00000
75%    4.000000  40.000000   80000.00000
max    5.000000  45.000000  120000.00000

DataFrame after Salary Increment:
   id     name  age  salary  salary_after_increment
0   1    Alice   25   50000                 55000.0
1   2      Bob   30   60000                 66000.0
2   3  Charlie   35   75000                 82500.0
3   4    David   40   80000                 88000.0
4   5     Emma   45  120000                132000.0

Filtered DataFrame (age > 30):
   id     name  a