# Binarizer - Creating a binary column based on another column

## Importing

In [1]:
# findspark has to run before the first pyspark import: init() locates the
# Spark installation and puts its Python libraries on sys.path, so importing
# pyspark first fails on machines where pyspark is not already importable.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start one named "binarizer".
spark = SparkSession.builder.appName("binarizer").getOrCreate()

In [2]:
from pyspark.ml.feature import PCA, Binarizer

## Loading Data

In [5]:
# Read the iris CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
iris = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../data/iris.csv")
)
# Preview the first two rows only.
iris.show(n=2)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 2 rows



## Using Binarizer

In [9]:
# Values strictly greater than the threshold become 1.0; values equal to or
# below it become 0.0 (so a sepal length of exactly 5.0 maps to 0.0).
binarizer = Binarizer(
    inputCol="sepallength",
    outputCol="sepallengthBin",
    threshold=5.0,
)

In [10]:
# transform() returns a new DataFrame with the "sepallengthBin" column appended;
# the `iris` DataFrame itself is left unchanged.
irisbin = binarizer.transform(dataset=iris)

In [11]:
# Print the first 20 rows (the default row count for show()).
irisbin.show(n=20)

+-----------+----------+-----------+----------+-----------+--------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|sepallengthBin|
+-----------+----------+-----------+----------+-----------+--------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|           1.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|           0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|           0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|           0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|           0.0|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|           1.0|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|           0.0|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|           0.0|
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|           0.0|
|        4.9|       3.1|        1.5|       0.1|Iris-setosa|           0.0|
|        5.4|       3.7| 