Skip to content

Commit

Permalink
Fixes missing SparkContext on Spark Connect clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffbrennan committed Feb 9, 2024
1 parent 423ea4a commit aa0dced
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions quinn/dataframe_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,25 @@ def column_to_list(df: DataFrame, col_name: str) -> list[Any]:
:return: List of values
:rtype: List[Any]
"""
pyarrow_kv = ("spark.sql.execution.arrow.pyspark.enabled", "true")

if "pyspark" not in sys.modules:
raise ImportError

# sparksession from df is not available in older versions of pyspark
if sys.modules["pyspark"].__version__ < "3.3.0":
return [row[0] for row in df.select(col_name).collect()]

spark_config = df.sparkSession.sparkContext.getConf().getAll()
spark_session = df.sparkSession.getActiveSession()
if spark_session is None:
return [row[0] for row in df.select(col_name).collect()]

pyarrow_enabled = (
spark_session.conf.get(
"spark.sql.execution.arrow.pyspark.enabled",
)
== "true"
)

pyarrow_enabled: bool = pyarrow_kv in spark_config
pyarrow_valid = pyarrow_enabled and sys.modules["pyarrow"] >= "0.17.0"
pyarrow_valid = pyarrow_enabled and sys.modules["pyarrow"].__version__ >= "0.17.0"

pandas_exists = "pandas" in sys.modules
pandas_valid = pandas_exists and sys.modules["pandas"].__version__ >= "0.24.2"
Expand Down

0 comments on commit aa0dced

Please sign in to comment.