In [5]:
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import col, count, rand, collect_list, explode, struct, lit
# `udf` is required by the plain-Python UDF cells (`cdf`, `substract_mean`).
from pyspark.sql.functions import pandas_udf, PandasUDFType, udf

import pandas as pd
import pyarrow as pa
import statsmodels.api as sm
from scipy import stats

In [13]:
from pyspark.sql import SparkSession

# Create (or reuse) the session that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("best_one")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    # Spark 3.0 renamed the Arrow switch; "spark.sql.execution.arrow.enabled"
    # is the deprecated spelling of this key.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Сделаем свободный DF для примера

In [14]:
# Demo frame: `id` is the row index divided by 10000 and cast to integer,
# `v` is a random value produced by `rand()`.
# NOTE(review): the range only holds 10 * 1000 rows, so every `id` evaluates
# to 0 and the later group-bys operate on a single group -- confirm intended.
df = (
    spark.range(0, 10 * 1000)
    .withColumn('id', (col('id') / 10000).cast('integer'))
    .withColumn('v', rand())
)
df.cache()
df.count()  # run an action so the cached data is actually computed

df.show()

+---+--------------------+
| id|                   v|
+---+--------------------+
|  0| 0.49840163843008656|
|  0|   0.069767152987945|
|  0|  0.7190715962156708|
|  0| 0.25640487040576976|
|  0|  0.6416189629727991|
|  0|  0.8528413886856091|
|  0|  0.6129146060898196|
|  0|  0.6164571002939396|
|  0|  0.6396182646364857|
|  0|  0.5752525667561637|
|  0|  0.8052223979650148|
|  0|  0.0405975490205146|
|  0|0.028272580413083004|
|  0|   0.594569306086507|
|  0|  0.9881616666059028|
|  0|  0.9770040363019834|
|  0|  0.9076460273114173|
|  0|  0.7452265863240174|
|  0| 0.12209934414804924|
|  0|  0.5801865543903545|
+---+--------------------+
only showing top 20 rows



# PANDAS UDF

In [15]:
# First pandas UDF example.
@pandas_udf('double')  # the decorator turns the function into a pandas UDF returning doubles
def plus_one(v):
    """Return the input with 1 added to every element.

    The body is ordinary Python arithmetic applied to whatever batch `v`
    the UDF machinery passes in.
    """
    incremented = v + 1
    return incremented

# Replace `v` with plus_one(v), count the resulting values and print the count.
(
    df
    .withColumn('v', plus_one(df.v))
    .agg(count(col('v')))
    .show()
)

PythonException: 
  An exception was thrown from Python worker in the executor. The below is the Python worker stacktrace.
Traceback (most recent call last):
  File "/usr/bin/spark-3.0.0-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/usr/bin/spark-3.0.0-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
  File "/usr/bin/spark-3.0.0-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 269, in read_single_udf
    return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
  File "/usr/bin/spark-3.0.0-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 94, in wrap_scalar_pandas_udf
    arrow_return_type = to_arrow_type(return_type)
  File "/usr/bin/spark-3.0.0-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/pandas/types.py", line 29, in to_arrow_type
    import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'


In [4]:
@pandas_udf("double", PandasUDFType.SCALAR)  # decorator + explicit UDF type
def pandas_plus_one(v):
    """Return the input with 1 added to every element (explicitly SCALAR pandas UDF)."""
    shifted = v + 1
    return shifted

# Time the pandas UDF: apply it to column `v`, count the results and print the count.
%timeit df.withColumn('v', pandas_plus_one(df.v)).agg(count(col('v'))).show()

In [5]:
# Integrating another library (scipy) through a plain Python UDF.

@udf('double')
def cdf(v):
    """Return the standard normal CDF evaluated at `v` as a Python float."""
    probability = stats.norm.cdf(v)
    # Convert scipy's result to a built-in float for the 'double' return type.
    return float(probability)

# Time the plain Python UDF: add a `cumulative_probability` column, count it and print the count.
%timeit df.withColumn('cumulative_probability', cdf(df.v)).agg(count(col('cumulative_probability'))).show()

In [6]:
@pandas_udf('double', PandasUDFType.SCALAR)
def pandas_cdf(v):
    """Return the standard normal CDF of every element of `v` as a pandas Series."""
    probabilities = stats.norm.cdf(v)
    # Wrap scipy's result in a Series before handing it back.
    return pd.Series(probabilities)

# Time the pandas UDF version of the same CDF computation for comparison with `cdf`.
%timeit df.withColumn('cumulative_probability', pandas_cdf(df.v)).agg(count(col('cumulative_probability'))).show()

In [7]:
# Passing a more complex object (a list of rows) and returning a new object.
@udf(ArrayType(df.schema))
def substract_mean(rows):
    """Subtract the mean of `v` from the `v` of every row in `rows`.

    `rows` is a sequence of row-like items exposing `.v` and `['id']`.
    Returns a list of `Row(id, v)` in the same order, with `v` replaced by
    its deviation from the mean as a Python float.
    """
    values = pd.Series([item.v for item in rows])
    centered = values - values.mean()
    # `centered` keeps the positional order of `rows`, so pair them up directly.
    return [Row(id=item['id'], v=float(delta)) for item, delta in zip(rows, centered)]
  
# Time the row-list approach: collect each group's (id, v) structs into a list, run the UDF,
# explode the returned list back into rows, restore the `id`/`v` columns and count them.
%timeit df.groupby('id').agg(collect_list(struct(df['id'], df['v'])).alias('rows')).withColumn('new_rows', substract_mean(col('rows'))).withColumn('new_row', explode(col('new_rows'))).withColumn('id', col('new_row.id')).withColumn('v', col('new_row.v')).agg(count(col('v'))).show()

In [8]:
# Passing a more complex object and returning a new DataFrame (with a replaced column).

@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
def pandas_subtract_mean(pdf):
    """Return `pdf` with column `v` replaced by its deviation from the mean of `v`.

    Both the input and the output are pandas DataFrames.
    """
    centered_v = pdf.v - pdf.v.mean()
    return pdf.assign(v=centered_v)

# Time the grouped-map pandas UDF: apply it per `id` group, count `v` and print the count.
%timeit df.groupby('id').apply(pandas_subtract_mean).agg(count(col('v'))).show()

In [9]:
# Regression demo frame: keep `id` and add three random columns (`y`, `x1`, `x2`).
df2 = (
    df
    .withColumn('y', rand())
    .withColumn('x1', rand())
    .withColumn('x2', rand())
    .select('id', 'y', 'x1', 'x2')
)
df2.show()

## UDF и агрегация данных

In [10]:
# Column roles for the per-group regression.
group_column = 'id'       # grouping key
y_column = 'y'            # dependent variable
x_columns = ['x1', 'x2']  # explanatory variables
# Keep the output schema separately: the group key followed by the x columns.
schema = df2.select([group_column] + x_columns).schema

# Pass the output schema and the UDF type (grouped map: pandas DataFrame in, pandas DataFrame out).
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def ols(pdf):
    """Fit an OLS regression of `y_column` on `x_columns` for one group.

    A constant term is added to the regressors before fitting. The result is
    a one-row pandas DataFrame holding the group key (taken from the first
    row of `pdf`) followed by the fitted coefficient of each x column; the
    constant's coefficient is not included.
    """
    group_key = pdf[group_column].iloc[0]
    response = pdf[y_column]
    design = sm.add_constant(pdf[x_columns])

    # Fit with the statsmodels estimator.
    fitted = sm.OLS(response, design).fit()

    # Assemble the single output row: key first, then one coefficient per x column.
    coefficients = [fitted.params[name] for name in x_columns]
    return pd.DataFrame(
        [[group_key] + coefficients],
        columns=[group_column] + x_columns,
    )

# Run the regression once per group and display the resulting coefficients.
beta = (
    df2
    .groupby(group_column)
    .apply(ols)
)
beta.show()