In [4]:
from pyspark.sql import SparkSession
import os
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
import pandas as pd
import pyspark.pandas as ps

# Get (or reuse) the Spark session for this demo. ANSI SQL mode is turned
# off, and PYARROW_IGNORE_TIMEZONE=1 is also set through the
# spark.executorEnv.* configuration key.
spark = (
    SparkSession.builder
    .appName("Pandas Integration with PySpark")
    .config("spark.sql.ansi.enabled", "false")
    .config("spark.executorEnv.PYARROW_IGNORE_TIMEZONE", "1")
    .getOrCreate()
)

# 1. Create a Pandas DataFrame (one tuple per row, in column order)
pandas_df = pd.DataFrame(
    [
        (1, "Alice", 25),
        (2, "Bob", 30),
        (3, "Charlie", 35),
        (4, "David", 40),
        (5, "Emma", 45),
    ],
    columns=["id", "name", "age"],
)

# Header and frame are emitted on separate lines by a single print call.
print("Pandas DataFrame:", pandas_df, sep="\n")

# 2. Convert Pandas DataFrame to Spark DataFrame
# No explicit schema is passed here, so the column types of the Spark
# DataFrame come from the pandas data itself.
spark_df = spark.createDataFrame(pandas_df)

# Print the resulting schema first, then the rows.
print("\nSchema of Spark DataFrame:")
spark_df.printSchema()

print("\nSpark DataFrame:")
spark_df.show()

# 3. Transform the Spark DataFrame: keep only rows with age strictly above 30
filtered_spark_df = spark_df.where(spark_df["age"] > 30)
print("\nFiltered Spark DataFrame (age > 30):")
filtered_spark_df.show()

# 4. Bring the filtered Spark DataFrame back as a Pandas DataFrame
converted_pandas_df = filtered_spark_df.toPandas()
# Header and frame are emitted on separate lines by a single print call.
print("\nConverted Pandas DataFrame:", converted_pandas_df, sep="\n")

# 5. Wrap the pandas data in a pandas-on-Spark DataFrame so that
# pandas-style operations can be used on it
ps_df = ps.from_pandas(pandas_df)

# Pandas-like column arithmetic: add one to every value in "age"
print("\nUsing pandas-on-Spark (incrementing age by 1):")
ps_df["age"] = ps_df.age + 1
print(ps_df)

# 6. Convert pandas-on-Spark DataFrame to Spark DataFrame
# Preserve the index as a column (named "index") in the resulting Spark DataFrame
converted_spark_df = ps_df.to_spark(index_col="index")
print("\nConverted Spark DataFrame from pandas-on-Spark:")
converted_spark_df.show()

# Stop the Spark session now that the demo is done.
spark.stop()

Pandas DataFrame:
   id     name  age
0   1    Alice   25
1   2      Bob   30
2   3  Charlie   35
3   4    David   40
4   5     Emma   45

Schema of Spark DataFrame:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)


Spark DataFrame:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|  David| 40|
|  5|   Emma| 45|
+---+-------+---+


Filtered Spark DataFrame (age > 30):
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  4|  David| 40|
|  5|   Emma| 45|
+---+-------+---+


Converted Pandas DataFrame:
   id     name  age
0   3  Charlie   35
1   4    David   40
2   5     Emma   45

Using pandas-on-Spark (incrementing age by 1):
   id     name  age
0   1    Alice   26
1   2      Bob   31
2   3  Charlie   36
3   4    David   41
4   5     Emma   46

Converted Spark DataFrame from pandas-on-Spark:
+-----+---+-------+---+
|index| id|   name|age|
+-----+