In [1]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType, StringType, DoubleType
from pyspark.sql.types import StructField, StructType
from pyspark.sql.functions import udf, col

## loadDataSet

In [15]:
# Two fixture rows, each an (int, float, string) tuple.
data = [
    (1, 1.0, "a"),
    (2, 2.0, "b"),
]

In [16]:
# Three nullable columns: an integer, a float and a string.
schema = (
    StructType()
    .add("int", IntegerType(), True)
    .add("float", FloatType(), True)
    .add("string", StringType(), True)
)

In [17]:
# Build the DataFrame from the fixture rows and the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+---+-----+------+
|int|float|string|
+---+-----+------+
|  1|  1.0|     a|
|  2|  2.0|     b|
+---+-----+------+



In [18]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- int: integer (nullable = true)
 |-- float: float (nullable = true)
 |-- string: string (nullable = true)



In [19]:
# Persist the fixture as CSV with a header row.
# mode("overwrite") makes the cell re-runnable: Spark's default save mode
# raises an error when the target directory already exists.
# coalesce(1) writes a single part file, matching the other fixture writes
# in this notebook.
(
    df.coalesce(1)
    .write.mode("overwrite")
    .csv("../../../test/resources/loadDataSet/", header=True)
)

## replacementNoneValues

In [17]:
# Five (target, x, y) rows; the third row has no value for x or y.
data = [
    (100, 1.0, 2),
    (100, 2.0, 4),
    (100, None, None),
    (200, 1.0, 1),
    (200, 1.0, 1),
]

In [18]:
# Nullable columns: integer target, float x, integer y.
schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("x", FloatType(), True)
    .add("y", IntegerType(), True)
)

In [19]:
# Build the DataFrame from the fixture rows and the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+------+----+----+
|target|   x|   y|
+------+----+----+
|   100| 1.0|   2|
|   100| 2.0|   4|
|   100|null|null|
|   200| 1.0|   1|
|   200| 1.0|   1|
+------+----+----+



In [21]:
# Save the fixture as a single CSV part file with a header row,
# replacing whatever a previous run left in the directory.
(
    df.coalesce(1)
    .write.mode("overwrite")
    .csv("../../../test/resources/replacementNoneValues", header=True)
)

In [15]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- target: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: integer (nullable = true)



In [16]:
# Mean of x and y per target value. The aggregate columns come back named
# "avg(x)" / "avg(y)", so they are renamed to the plain column names.
target_means = df.groupBy("target").mean("x", "y")
target_means = target_means.withColumnRenamed("avg(x)", "x")
target_means = target_means.withColumnRenamed("avg(y)", "y")
target_means.show()

+------+---+---+
|target|  x|  y|
+------+---+---+
|   100|1.5|3.0|
|   200|1.0|1.0|
+------+---+---+



## replacementYesNoValues

In [1]:
# Five (target, x, y) rows; the third row holds the strings "yes"/"no"
# where the other rows hold numbers.
# NOTE(review): the schema cell that follows declares x and y as StringType,
# so the numeric values are presumably stringified by Spark — confirm.
data = [
    (100, 1.0, 2),
    (100, 2.0, 4),
    (100, "yes", "no"),
    (200, 1.0, 1),
    (200, 1.0, 1),
]

In [4]:
# Nullable columns: integer target, string x, string y.
schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("x", StringType(), True)
    .add("y", StringType(), True)
)

In [5]:
# Build the DataFrame from the fixture rows and the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+------+---+---+
|target|  x|  y|
+------+---+---+
|   100|1.0|  2|
|   100|2.0|  4|
|   100|yes| no|
|   200|1.0|  1|
|   200|1.0|  1|
+------+---+---+



In [6]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- target: integer (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)



In [7]:
# Save the fixture as a single CSV part file with a header row,
# replacing whatever a previous run left in the directory.
(
    df.coalesce(1)
    .write.mode("overwrite")
    .csv("../../../test/resources/replacementYesNoValues", header=True)
)

## defineLabelFeatures

In [2]:
# Four (id, target, x, y) rows: ids "a"/"b" have target 0, ids "c"/"d" have target 1.
data = [
    ("a", 0, 1.0, 2.0),
    ("b", 0, 2.0, 4.0),
    ("c", 1, 1.0, 1.0),
    ("d", 1, 1.0, 1.0),
]

In [3]:
# Nullable columns: string id, integer target, float x, float y.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("target", IntegerType(), True)
    .add("x", FloatType(), True)
    .add("y", FloatType(), True)
)

In [4]:
# Build the DataFrame from the fixture rows and the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+---+------+---+---+
| id|target|  x|  y|
+---+------+---+---+
|  a|     0|1.0|2.0|
|  b|     0|2.0|4.0|
|  c|     1|1.0|1.0|
|  d|     1|1.0|1.0|
+---+------+---+---+



In [5]:
# Save the fixture as a single CSV part file with a header row,
# replacing whatever a previous run left in the directory.
(
    df.coalesce(1)
    .write.mode("overwrite")
    .csv("../../../test/resources/defineLabelFeatures", header=True)
)

## stringIndexer

In [5]:
# One single-column row per label; the labels are numeric strings, not ints.
data = [(label,) for label in ("0", "10", "5", "11")]

In [6]:
# A single nullable string column named "target".
schema = StructType().add("target", StringType(), True)

In [7]:
# Build the DataFrame from the fixture rows and the explicit schema, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

+------+
|target|
+------+
|     0|
|    10|
|     5|
|    11|
+------+



In [8]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- target: string (nullable = true)



In [9]:
# Save the fixture as a single CSV part file with a header row,
# replacing whatever a previous run left in the directory.
(
    df.coalesce(1)
    .write.mode("overwrite")
    .csv("../../../test/resources/stringIndexer", header=True)
)

## adaBoost

In [3]:
# Load the classificationTask Parquet fixture and preview it.
# The path is relative, like every other fixture path in this notebook, so the
# cell does not depend on one particular machine's home directory.
# NOTE(review): assumes ../../../test/resources resolves to the project's
# src/test/resources directory from the notebook's working directory — confirm.
data = spark.read.parquet("../../../test/resources/classificationTask")
data.show()

+---+------+-----------------+
| id|target|         features|
+---+------+-----------------+
|  a|     0|[1.0,2.0,1.0,2.0]|
|  b|     0|[2.0,4.0,2.0,4.0]|
|  c|     1|[1.0,1.0,1.0,1.0]|
|  d|     1|[1.0,1.0,1.0,1.0]|
+---+------+-----------------+



In [4]:
# Hand-written per-id lookup tables, broadcast so the UDFs read them via `.value`.
dic_weight = spark.sparkContext.broadcast(
    {"a": 0.1, "b": 0.2, "c": 0.3, "d": 0.4}
)
dic_prediction = spark.sparkContext.broadcast(
    {"a": 1, "b": 0, "c": 1, "d": 0}
)

# Each UDF maps a row id to its table entry; an id missing from the table
# raises KeyError because the lookup uses plain indexing.
set_weight = udf(
    lambda row_id: dic_weight.value[row_id],
    returnType=DoubleType(),
)
set_prediction = udf(
    lambda row_id: dic_prediction.value[row_id],
    returnType=IntegerType(),
)

In [5]:
# Attach a prediction and a weight to every row, both looked up from the row's id.
data = (
    data
    .withColumn("prediction", set_prediction(col("id")))
    .withColumn("weight", set_weight(col("id")))
)
data.show()

+---+------+-----------------+----------+------+
| id|target|         features|prediction|weight|
+---+------+-----------------+----------+------+
|  a|     0|[1.0,2.0,1.0,2.0]|         1|   0.1|
|  b|     0|[2.0,4.0,2.0,4.0]|         0|   0.2|
|  c|     1|[1.0,1.0,1.0,1.0]|         1|   0.3|
|  d|     1|[1.0,1.0,1.0,1.0]|         0|   0.4|
+---+------+-----------------+----------+------+



In [7]:
# Write id, target (cast to double) and features as the adaBoost Parquet
# fixture, replacing any previous output.
# NOTE(review): only these three columns are selected, so the prediction and
# weight columns present on `data` are not written — confirm this is intended.
(
    data
    .select(
        col("id"),
        col("target").cast("double"),
        col("features"),
    )
    .write.mode("overwrite")
    .parquet("../../../test/resources/adaBoost")
)