In [4]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [15]:
#!pip install statsmodels
#!pip install PyArrow 

Collecting PyArrow
  Downloading pyarrow-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (20.7 MB)
[K     |████████████████████████████████| 20.7 MB 18.7 MB/s eta 0:00:01
Installing collected packages: PyArrow
Successfully installed PyArrow-3.0.0


In [6]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "best_one".
spark = (
    SparkSession.builder
    .appName("best_one")
    .getOrCreate()
)

In [12]:
# A small toy DataFrame: four groups, a three-valued `sex` label and two
# integer columns (`x`, `y`) that are regressed against each other below.
pdf = pd.DataFrame({'group_id':[1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4],
                    'sex':['M','N','M','N','F','F','M','F','N','M','M','N','F','M','F','F','N','M','F','M','F','N','M','F','M','F'],
                    'x':[0,1,2,0,1,5,2,3,4,5,6,2,3,4,1,2,6,7,8,5,3,4,1,7,6,5],
                    'y':[2,1,0,0,0,5,2,5,3,4,5,6,1,2,5,6,7,8,9,4,2,5,8,10,5,6]})
df = spark.createDataFrame(pdf)

# Preview the first rows. `DataFrame.show()` prints the table itself and
# returns None, so it is called directly: wrapping it in `display()` only
# adds a stray "None" to the cell output.
df.show(5)

+--------+---+---+---+
|group_id|sex|  x|  y|
+--------+---+---+---+
|       1|  M|  0|  2|
|       1|  N|  1|  1|
|       1|  M|  2|  0|
|       1|  N|  0|  0|
|       1|  F|  1|  0|
+--------+---+---+---+
only showing top 5 rows



None

In [13]:
# Schema of the rows produced by the grouped UDF: the two grouping keys plus
# a double column named 'x'.
# NOTE: group_id is declared as a double although the toy data holds integers,
# which is why the result table renders it as 1.0, 2.0, ...
result_schema = (
    StructType()
    .add('group_id', DoubleType())
    .add('sex', StringType())
    .add('x', DoubleType())
)

In [18]:
# A bit of magic: run ordinary pandas / statsmodels code inside Spark by
# wrapping it in a grouped-map pandas UDF.

@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def ols(df):
    """Fit ``y ~ const + x`` by OLS on one group and return the slope of ``x``.

    The function receives the rows of a single group as a pandas DataFrame
    with (at least) the columns ``group_id``, ``sex``, ``x`` and ``y``.

    Returns:
        A one-row pandas DataFrame with columns ``group_id``, ``sex`` and
        ``x``, where ``x`` holds the fitted slope. The slope is ``None``
        (null in Spark) when it cannot be identified, i.e. when the group
        has fewer than two distinct ``x`` values (this includes single-row
        groups).

    NOTE(review): Spark 3 documents ``PandasUDFType.GROUPED_MAP`` together
    with ``groupby().apply()`` as deprecated in favour of
    ``groupby().applyInPandas()``; consider migrating once the target Spark
    version is confirmed.
    """
    # Every row of the group shares the same keys, so the first row suffices.
    group_id = df['group_id'].iloc[0]
    sex = df['sex'].iloc[0]

    slope = None
    # With a constant regressor the slope is undefined. Besides, add_constant
    # would silently skip the intercept for a non-zero constant column, so the
    # fitted parameters would not contain the expected two entries.
    if df['x'].nunique() > 1:
        y = df['y'].astype(int)
        X = sm.add_constant(df['x'].astype(int))
        model = sm.OLS(y, X).fit()
        # Look the coefficient up by label: integer keys on a label-indexed
        # Series rely on a deprecated positional fallback.
        slope = model.params['x']

    return pd.DataFrame([[group_id, sex, slope]], columns=['group_id', 'sex', 'x'])


# Apply it like an ordinary function (and this is still Spark!):
# one call of `ols` per (group_id, sex) group.
(
    df
    .groupby('group_id', 'sex')
    .apply(ols)
    .show()
)

+--------+---+-------------------+
|group_id|sex|                  x|
+--------+---+-------------------+
|     4.0|  N|               null|
|     2.0|  M| 0.7307692307692304|
|     2.0|  N|               -1.5|
|     3.0|  F| 0.6379310344827586|
|     4.0|  F| 2.0000000000000004|
|     3.0|  N|               null|
|     4.0|  M|-0.5999999999999998|
|     1.0|  M|-0.9999999999999998|
|     1.0|  F| 1.2500000000000002|
|     1.0|  N| 1.0000000000000004|
|     2.0|  F|               null|
|     3.0|  M| 2.0000000000000013|
+--------+---+-------------------+



# На примере RF

In [20]:
import shutil
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import pyspark.sql
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.sql.types import DoubleType
import pyarrow

In [21]:
# Data source, regression target and the feature columns used below.
TITANIC_URL = "https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
TARGET = "fare"

NUMERICAL_FEATURES = ["sibsp", "parch", "age"]
CATEGORICAL_FEATURES = ["sex", "cabin"]

# Numerical features first, then the categorical ones.
ALL_FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

In [22]:
# Download the Titanic data, keep only the feature and target columns, drop
# every row with a missing value in any of them, and store the numerical
# features as floats.
df = (
    pd.read_csv(TITANIC_URL)[ALL_FEATURES + [TARGET]]
    .dropna()
    .astype({feature: float for feature in NUMERICAL_FEATURES})
)

In [23]:
# Peek at the first rows of the pandas frame after dropping incomplete rows.
df.head()

Unnamed: 0,sibsp,parch,age,sex,cabin,fare
0,0.0,0.0,29.0,female,B5,211.3375
1,1.0,2.0,0.9167,male,C22 C26,151.55
2,1.0,2.0,2.0,female,C22 C26,151.55
3,1.0,2.0,30.0,male,C22 C26,151.55
4,1.0,2.0,25.0,female,C22 C26,151.55


In [24]:
# Turn the pandas frame into a Spark DataFrame and print its first five rows.
ddf = spark.createDataFrame(df)
ddf.show(5)

+-----+-----+------+------+-------+--------+
|sibsp|parch|   age|   sex|  cabin|    fare|
+-----+-----+------+------+-------+--------+
|  0.0|  0.0|  29.0|female|     B5|211.3375|
|  1.0|  2.0|0.9167|  male|C22 C26|  151.55|
|  1.0|  2.0|   2.0|female|C22 C26|  151.55|
|  1.0|  2.0|  30.0|  male|C22 C26|  151.55|
|  1.0|  2.0|  25.0|female|C22 C26|  151.55|
+-----+-----+------+------+-------+--------+
only showing top 5 rows



### Добавим RF из Sklearn

In [25]:
# pandas UDF

def spark_predict(model, cols) -> pyspark.sql.Column:
    """Build a Spark column holding ``model.predict`` over the given columns.

    Args:
        model: a fitted estimator exposing ``predict(X)`` (e.g. a scikit-learn
            model or pipeline). It is captured by the UDF closure.
        cols (list-like): the columns passed to the UDF, in the order the
            model expects its features. The features reach the model
            positionally, one pandas Series per entry of ``cols``.

    Returns:
        A ``pyspark.sql.Column`` of doubles with one prediction per row,
        ready to be used in ``select`` / ``withColumn``.
    """
    @sf.pandas_udf(returnType=DoubleType())
    def predict_pandas_udf(*feature_series):
        # Glue the per-column Series back into one feature matrix, keeping
        # the order in which the columns were passed.
        X = pd.concat(feature_series, axis=1)
        return pd.Series(model.predict(X))

    return predict_pandas_udf(*cols)

In [26]:
# Fit a random forest on the numerical features only. The seed is fixed so
# that re-running the notebook reproduces the same model and predictions.
rf = RandomForestRegressor(random_state=42).fit(df[NUMERICAL_FEATURES], df[TARGET])

In [27]:
# In-sample predictions computed directly in pandas, for comparison with the
# Spark result: first five values only.
rf.predict(df[NUMERICAL_FEATURES])[:5]

array([119.84782992, 141.522249  , 140.452749  , 129.287917  ,
       129.908749  ])

### Используем в Spark

In [28]:
# withColumn(<new column name>, spark_predict(<model>, <feature columns>))
# withColumn already names the resulting column, so no extra .alias() is needed.
(
    ddf
    .select(NUMERICAL_FEATURES + [TARGET])
    .withColumn("prediction", spark_predict(rf, NUMERICAL_FEATURES))
    .show(5)
)

+-----+-----+------+--------+------------------+
|sibsp|parch|   age|    fare|        prediction|
+-----+-----+------+--------+------------------+
|  0.0|  0.0|  29.0|211.3375| 119.8478299242064|
|  1.0|  2.0|0.9167|  151.55|141.52224899999982|
|  1.0|  2.0|   2.0|  151.55| 140.4527489999998|
|  1.0|  2.0|  30.0|  151.55|129.28791699999985|
|  1.0|  2.0|  25.0|  151.55| 129.9087489999999|
+-----+-----+------+--------+------------------+
only showing top 5 rows



### В виде pipeline

In [29]:
# Same idea with a whole scikit-learn pipeline: scale the features, then fit
# a random forest. The seed is fixed so that re-runs give the same numbers.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("predictor", RandomForestRegressor(random_state=42)),
    ]
).fit(df[NUMERICAL_FEATURES], df[TARGET])

# The new column is named by withColumn; the former .alias("prediction")
# contradicted that name and had no effect on the output, so it is dropped.
(
    ddf
    .select(NUMERICAL_FEATURES + [TARGET])
    .withColumn("pipe_predict", spark_predict(pipe, NUMERICAL_FEATURES))
    .show(5)
)

+-----+-----+------+--------+------------------+
|sibsp|parch|   age|    fare|      pipe_predict|
+-----+-----+------+--------+------------------+
|  0.0|  0.0|  29.0|211.3375|110.63214057961764|
|  1.0|  2.0|0.9167|  151.55|144.46233199999978|
|  1.0|  2.0|   2.0|  151.55|141.28503999999978|
|  1.0|  2.0|  30.0|  151.55|135.22670799999986|
|  1.0|  2.0|  25.0|  151.55|124.56416599999991|
+-----+-----+------+--------+------------------+
only showing top 5 rows

