# Machine Learning Quick Start

In [1]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Local mode: build a session for the "iris" application, or reuse one that
# is already running.
# NOTE: getOrCreate() hands back an already-active session when there is one
# (for example the session a `pyspark` shell pre-creates), so the app name
# requested here may not be the one reported by spark.sparkContext.appName.
spark = (
    SparkSession.builder
    .appName("iris")
    .getOrCreate()
)

In [None]:
# YARN mode: request a session for the "iris" application on a YARN cluster.
# Run this cell INSTEAD of the local-mode cell, not after it: getOrCreate()
# reuses an already-active session, in which case the master set here does
# not take effect.
# NOTE(review): 99 executors x 4G each is a large request for a 150-row
# dataset; presumably sized for a specific cluster, confirm before reuse.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.executor.instances", "99")
    .config("spark.executor.memory", "4G")
    .appName("iris")
    .getOrCreate()
)

In [4]:
# Show the application name of the SparkContext behind `spark`.
# The builder cells in this notebook request the name "iris"; a different
# value here means `spark` is bound to a session created some other way.
spark.sparkContext.appName

'PySparkShell'

In [5]:
# Switch on the Arrow setting for PySpark on the running session.
# NOTE(review): this setting concerns Spark <-> pandas transfers; no pandas
# conversion is visible in this notebook, so it may be unnecessary - confirm.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [6]:
# Record the runtime versions this notebook was executed with.
# Python interpreter version and build information:
sys.version

'3.8.5 (default, Jul 28 2020, 12:59:40) \n[GCC 9.3.0]'

In [7]:
# Version of the Spark runtime behind the `spark` session
spark.version

'3.0.1'

### Exploring Data

In [8]:
# Read the Iris CSV into a Spark DataFrame: the first line supplies the
# column names and Spark infers each column's type from the data.
# An earlier revision read the same file from 'file:///vagrant/data/iris.csv'.
iris_csv_path = 'data/iris.csv'
df = spark.read.csv(iris_csv_path, header=True, inferSchema=True)

In [9]:
# Preview the first five rows of the Iris DataFrame
df.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [10]:
# Inspect the column names, types and nullability of the loaded DataFrame
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [11]:
# Summary statistics (count, mean, stddev, min, max) for every column
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|       species|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [12]:
# Count the rows per species; truncate=False keeps the full species names
species_counts = df.groupBy('species').count()
species_counts.show(n=10, truncate=False)

+---------------+-----+
|species        |count|
+---------------+-----+
|Iris-virginica |50   |
|Iris-setosa    |50   |
|Iris-versicolor|50   |
+---------------+-----+



### Feature Engineering

In [13]:
# List all column names in order; the feature-assembly step that follows
# uses this list to choose its input columns.
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [14]:
# Assemble the numeric measurement columns into one vector column, 'features'.
# Inputs are chosen by name rather than by position so this cell can be
# re-run: once 'features' has been appended to `df`, `df.columns[:-1]` would
# wrongly include the string 'species' column.
feature_cols = [c for c in df.columns if c not in ('species', 'features')]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# drop() is a no-op when the column is absent, so the first run is unchanged;
# on a re-run it removes the stale vector column before assembling again.
df = assembler.transform(df.drop('features'))

In [15]:
# Encode the string `species` column as a numeric `label` column.
# The fitted indexer is kept in `label_indexer` so the index -> species
# mapping stays available after this cell.
features_and_species = df.select('features', 'species')
label_indexer = StringIndexer(inputCol='species', outputCol='label').fit(features_and_species)
data = label_indexer.transform(features_and_species)

In [16]:
# Keep just the two columns the classifier consumes
data = data.select('features', 'label')

In [17]:
# Ready for machine learning: preview the (features, label) rows
data.show(n=10)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
+-----------------+-----+
only showing top 10 rows



In [18]:
# List the distinct label values produced by the indexer
distinct_labels = data.select('label').distinct()
distinct_labels.show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



### Split Data - Train & Test sets

In [19]:
# Randomly split the rows into a training set (weight 0.70) and a test set
# (weight 0.30); the seed is fixed so the random draw is the same on re-runs.
train, test = data.randomSplit(weights=[0.70, 0.30], seed=42)

### Build Logistic Regression Model

In [20]:
# Regularization strength, passed to LogisticRegression as `regParam`.
# Change it and you will likely get a different accuracy.
reg = 0.01

In [21]:
# Configure the classifier (column names spelled out; they match the
# estimator's defaults) and fit it on the training split.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    regParam=reg,
)
model = lr.fit(train)

In [22]:
# Score the held-out test set; transform() returns the test rows with the
# model's rawPrediction, probability and prediction columns appended.
prediction = model.transform(test)

In [23]:
# Preview the first ten scored test rows
prediction.show(n=10)

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,3.0,1.3,0.2]|  0.0|[5.89414428166008...|[0.97123403594185...|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|[6.00635948782287...|[0.97435566841077...|       0.0|
|[4.6,3.6,1.0,0.2]|  0.0|[7.29684218715271...|[0.99395737624340...|       0.0|
|[4.7,3.2,1.3,0.2]|  0.0|[5.99055639225875...|[0.97165290245960...|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[5.35128065043783...|[0.94374394619750...|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[6.01829070351088...|[0.97524834298394...|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[5.71743873208656...|[0.96858641884940...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[5.54384540061680...|[0.94379632345249...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[5.54384540061680...|[0.94379632345249...|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|[-0.0313576005173...|[0.112

### Evaluate Model

In [24]:
# Set up an accuracy evaluator for the test-set predictions (column names
# spelled out; they match the evaluator's defaults).
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [25]:
# Compute the metric configured on `evaluator` over the test-set predictions
accuracy = evaluator.evaluate(prediction)

In [26]:
# Display the accuracy measured on the test set
accuracy

0.9782608695652174