In [3]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType, StringType
from pyspark.sql.types import StructField, StructType

## loadDataSet

In [15]:
# Two rows, each holding an int, a float and a string value.
data = [
    (1, 1.0, "a"),
    (2, 2.0, "b"),
]

In [16]:
# Three nullable columns, each named after the type it holds.
schema = (
    StructType()
    .add("int", IntegerType(), True)
    .add("float", FloatType(), True)
    .add("string", StringType(), True)
)

In [17]:
# Build the DataFrame from the rows under the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+---+-----+------+
|int|float|string|
+---+-----+------+
|  1|  1.0|     a|
|  2|  2.0|     b|
+---+-----+------+



In [18]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- int: integer (nullable = true)
 |-- float: float (nullable = true)
 |-- string: string (nullable = true)



In [19]:
# Write the fixture as a headered CSV, matching the other fixture writers in
# this notebook: coalesce(1) yields a single part file, and "overwrite" keeps
# the cell re-runnable (the writer's default save mode raises an error when the
# target directory already exists).
(df
 .coalesce(1)
 .write
 .mode("overwrite")
 .csv("../../../test/resources/loadDataSet/", header=True))

## replacementNoneValues

In [17]:
# (target, x, y) rows; the third row has no x or y value.
data = [
    (100, 1.0, 2),
    (100, 2.0, 4),
    (100, None, None),
    (200, 1.0, 1),
    (200, 1.0, 1),
]

In [18]:
# Integer target, float x and integer y; all columns nullable.
schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("x", FloatType(), True)
    .add("y", IntegerType(), True)
)

In [19]:
# Build the DataFrame from the rows under the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+------+----+----+
|target|   x|   y|
+------+----+----+
|   100| 1.0|   2|
|   100| 2.0|   4|
|   100|null|null|
|   200| 1.0|   1|
|   200| 1.0|   1|
+------+----+----+



In [21]:
# Write the frame as a single headered CSV part file, replacing any earlier output.
(df
 .coalesce(1)
 .write
 .mode("overwrite")
 .csv("../../../test/resources/replacementNoneValues", header=True))

In [15]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- target: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: integer (nullable = true)



In [16]:
# Mean of x and y within each target group; the generated "avg(...)" column
# names are mapped back to the plain column names before display.
target_means = df.groupBy("target").avg("x", "y")
target_means = (target_means
                .withColumnRenamed("avg(x)", "x")
                .withColumnRenamed("avg(y)", "y"))
target_means.show()

+------+---+---+
|target|  x|  y|
+------+---+---+
|   100|1.5|3.0|
|   200|1.0|1.0|
+------+---+---+



## replacementYesNoValues

In [1]:
# (target, x, y) rows; the third row carries "yes"/"no" instead of numbers.
data = [
    (100, 1.0, 2),
    (100, 2.0, 4),
    (100, "yes", "no"),
    (200, 1.0, 1),
    (200, 1.0, 1),
]

In [4]:
# Integer target with x and y declared as strings; all columns nullable.
schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("x", StringType(), True)
    .add("y", StringType(), True)
)

In [5]:
# Build the DataFrame under the explicit schema, then preview it. The numeric
# x/y inputs land in string columns, as the displayed output shows.
df = spark.createDataFrame(data, schema)
df.show()

+------+---+---+
|target|  x|  y|
+------+---+---+
|   100|1.0|  2|
|   100|2.0|  4|
|   100|yes| no|
|   200|1.0|  1|
|   200|1.0|  1|
+------+---+---+



In [6]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- target: integer (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)



In [7]:
# Write the frame as a single headered CSV part file, replacing any earlier output.
(df
 .coalesce(1)
 .write
 .mode("overwrite")
 .csv("../../../test/resources/replacementYesNoValues", header=True))

## defineLabelFeatures

In [6]:
# (target, x, y) rows: two rows for target 0 and two for target 1.
data = [
    (0, 1.0, 2.0),
    (0, 2.0, 4.0),
    (1, 1.0, 1.0),
    (1, 1.0, 1.0),
]

In [7]:
# Integer target with float x and y; all columns nullable.
schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("x", FloatType(), True)
    .add("y", FloatType(), True)
)

In [8]:
# Build the DataFrame from the rows under the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+------+---+---+
|target|  x|  y|
+------+---+---+
|     0|1.0|2.0|
|     0|2.0|4.0|
|     1|1.0|1.0|
|     1|1.0|1.0|
+------+---+---+



In [9]:
# Write the frame as a single headered CSV part file, replacing any earlier output.
(df
 .coalesce(1)
 .write
 .mode("overwrite")
 .csv("../../../test/resources/defineLabelFeatures", header=True))