In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Maven coordinates of the extra JVM packages to attach to the session,
# joined into the comma-separated form Spark expects.
packages = ','.join([
    'com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2'
])

# Session configuration:
# - the application name is set once ('Iris Test'); the master is set with
#   setMaster, not a second setAppName call, which would overwrite the name
#   and leave the master unset
# - 'spark.jars.packages' is the key Spark reads for Maven coordinates; a
#   misspelled key is stored but never acted on, so the package would not load
spark_conf = (
    SparkConf()
    .setAppName('Iris Test')
    .setMaster('local[*]')
    .set('spark.jars.packages', packages)
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=spark_conf)
    .getOrCreate()
)

In [2]:
# Explicit schema for the iris CSV: four double-typed measurement columns
# followed by a string `class` column, all declared non-nullable.
schema = (
    StructType()
    .add('sepal_length', DoubleType(), nullable=False)
    .add('sepal_width', DoubleType(), nullable=False)
    .add('petal_length', DoubleType(), nullable=False)
    .add('petal_width', DoubleType(), nullable=False)
    .add('class', StringType(), nullable=False)
)

In [3]:
# Read the CSV using the explicit schema rather than inferring column types.
iris = spark.read.schema(schema).csv('./iris.data')

# Per-column summary statistics, converted to pandas for display.
iris.describe().toPandas()

Unnamed: 0,summary,sepal_length,sepal_width,petal_length,petal_width,class
0,count,150.0,150.0,150.0,150.0,150
1,mean,5.843333333333335,3.0540000000000007,3.758666666666669,1.1986666666666672,
2,stddev,0.8280661279778637,0.4335943113621737,1.764420419952262,0.7631607417008414,
3,min,4.3,2.0,1.0,0.1,Iris-setosa
4,max,7.9,4.4,6.9,2.5,Iris-virginica


In [4]:
# Show the distinct values found in the `class` column.
(
    iris
    .select('class')
    .distinct()
    .toPandas()
)

Unnamed: 0,class
0,Iris-virginica
1,Iris-setosa
2,Iris-versicolor


In [5]:
# Expose the DataFrame to Spark SQL under the name `iris`.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
iris.createOrReplaceTempView('iris')

In [6]:
# Query the `iris` view through Spark SQL and display the first five rows.
spark.sql(
    '''
SELECT *
FROM iris
LIMIT 5
'''
).toPandas()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
# Per-class min / avg / max of each of the four measurements, computed in SQL
# against the `iris` view and shown as a pandas frame.
spark.sql(
    '''
SELECT
    class,
    min(sepal_length), avg(sepal_length), max(sepal_length),
    min(sepal_width), avg(sepal_width), max(sepal_width),
    min(petal_length), avg(petal_length), max(petal_length),
    min(petal_width), avg(petal_width), max(petal_width)
FROM iris
GROUP BY class
'''
).toPandas()

Unnamed: 0,class,min(sepal_length),avg(sepal_length),max(sepal_length),min(sepal_width),avg(sepal_width),max(sepal_width),min(petal_length),avg(petal_length),max(petal_length),min(petal_width),avg(petal_width),max(petal_width)
0,Iris-virginica,4.9,6.588,7.9,2.2,2.974,3.8,4.5,5.552,6.9,1.4,2.026,2.5
1,Iris-setosa,4.3,5.006,5.8,2.3,3.418,4.4,1.0,1.464,1.9,0.1,0.244,0.6
2,Iris-versicolor,4.9,5.936,7.0,2.0,2.77,3.4,3.0,4.26,5.1,1.0,1.326,1.8


## SQLTransformer

- 단 하나의 매개변수(`statement`)만을 가짐
    - 실행될 SQL 문 (입력 DataFrame은 SQL 문 안에서 `__THIS__`로 참조)

In [8]:
from pyspark.ml.feature import SQLTransformer

# SQLTransformer replaces the `__THIS__` placeholder with the DataFrame handed
# to transform(). Selecting FROM __THIS__ (instead of the `iris` temp view)
# makes the transformer operate on its actual input, so it no longer depends
# on a view with that name having been registered beforehand.
statement = '''
SELECT
    class,
    min(sepal_width), avg(sepal_width), max(sepal_width),
    min(sepal_length), avg(sepal_length), max(sepal_length),
    min(petal_width), avg(petal_width), max(petal_width),
    min(petal_length), avg(petal_length), max(petal_length)
FROM __THIS__
GROUP BY class
'''

# The SQL statement is the transformer's only parameter.
sql_transformer = SQLTransformer(statement=statement)

# Apply the statement to `iris` and display the aggregated result.
sql_transformer.transform(iris).toPandas()

Unnamed: 0,class,min(sepal_width),avg(sepal_width),max(sepal_width),min(sepal_length),avg(sepal_length),max(sepal_length),min(petal_width),avg(petal_width),max(petal_width),min(petal_length),avg(petal_length),max(petal_length)
0,Iris-virginica,2.2,2.974,3.8,4.9,6.588,7.9,1.4,2.026,2.5,4.5,5.552,6.9
1,Iris-setosa,2.3,3.418,4.4,4.3,5.006,5.8,0.1,0.244,0.6,1.0,1.464,1.9
2,Iris-versicolor,2.0,2.77,3.4,4.9,5.936,7.0,1.0,1.326,1.8,3.0,4.26,5.1


In [None]:
from pyspark.ml.feature import MinMaxScaler

# NOTE(review): this cell is unfinished. `inputCo` is not defined anywhere in
# the visible notebook, so running the cell as written raises NameError.
# Presumably a truncated keyword argument (e.g. `inputCol=...`) — TODO complete
# the call or remove the cell.
scaler = MinMaxScaler(
    inputCo
)