<a href="https://colab.research.google.com/github/JihunSKKU/PySpark/blob/main/SpqrkML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SparkML - Lec08

## Spark ML library - Clustering

K-means clustering algorithm
1. Initialize centroids
2. Assign each data point to the cluster of its nearest centroid
3. Move each centroid to the mean of the data points assigned to it
4. Repeat steps 2 and 3 until the centroids no longer change

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Obtain the Spark session used by the K-means practice section.
spark = (
    SparkSession.builder
    .appName("KMeansPractice")
    .getOrCreate()
)

# Eight 2-D points: four close to the origin and four close to (9, 9).
data = [
    [0.0, 0.1],
    [1.0, 1.0],
    [0.5, 0.6],
    [0.5, 2.0],
    [9.0, 8.0],
    [8.0, 9.0],
    [9.0, 9.5],
    [10.0, 10.0],
]
columns = ["feature1", "feature2"]

# One row per point, one column per coordinate.
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+--------+
|feature1|feature2|
+--------+--------+
|     0.0|     0.1|
|     1.0|     1.0|
|     0.5|     0.6|
|     0.5|     2.0|
|     9.0|     8.0|
|     8.0|     9.0|
|     9.0|     9.5|
|    10.0|    10.0|
+--------+--------+



In [None]:
# Pack the raw numeric columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=columns, outputCol="features")

# Assemble from the raw input columns only. This cell rebinds `df`, so on a
# second run `df` already carries a "features" column; VectorAssembler refuses
# to write to an output column that already exists, and the cell would raise.
# Selecting `columns` first makes the cell safe to re-run and yields the same
# frame on the first run.
df = assembler.transform(df.select(*columns))
df.show()

+--------+--------+-----------+
|feature1|feature2|   features|
+--------+--------+-----------+
|     0.0|     0.1|  [0.0,0.1]|
|     1.0|     1.0|  [1.0,1.0]|
|     0.5|     0.6|  [0.5,0.6]|
|     0.5|     2.0|  [0.5,2.0]|
|     9.0|     8.0|  [9.0,8.0]|
|     8.0|     9.0|  [8.0,9.0]|
|     9.0|     9.5|  [9.0,9.5]|
|    10.0|    10.0|[10.0,10.0]|
+--------+--------+-----------+



### Train the K-means model
- KMeans(featuresCol, predictionCol, k, maxIter, distanceMeasure)
    - featuresCol: Features column name.
    - predictionCol: Prediction column name.
    - k: The number of clusters to create.
    - maxIter: Maximum number of iterations.
    - distanceMeasure: The distance measure ('euclidean' or 'cosine').


In [None]:
# Configure K-means: two clusters over the "features" vectors, Euclidean
# distance, at most 20 iterations, cluster index written to "prediction".
# Centroid initialisation is randomised, so the seed is pinned to keep the
# fitted centers and cluster numbering repeatable across runs; 2023 matches
# the seed used for the train/test split later in this notebook.
kmeans = KMeans(featuresCol='features',
                predictionCol='prediction',
                k=2,
                maxIter=20,
                seed=2023,
                distanceMeasure='euclidean')

# Fit the clustering model on the assembled feature vectors.
model = kmeans.fit(df)

In [None]:
# Label every point with the cluster the fitted model assigns it to, keeping
# only the feature vector and that label.
predictions = model.transform(df).select(["features", "prediction"])

# Cluster centers learned by the fitted model.
centroids = model.clusterCenters()

## Spark ML library - Classification

### Logistic Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Obtain the Spark session used by the classification section.
spark = (
    SparkSession.builder
    .appName("IrisLogisticRegression")
    .getOrCreate()
)

# Read the Iris CSV with the header and schema-inference options enabled.
iris_data = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('Iris.csv')
)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Integer class index for each species name.
idx_dict = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}


# User-defined function
@udf(returnType=IntegerType())
def label_mapping_udf(label):
    """Return the class index for a species name, or None if it is not in idx_dict."""
    return idx_dict.get(label)


# Overwrite the "Species" column with its integer class index.
iris_data = iris_data.withColumn("Species", label_mapping_udf(iris_data["Species"]))
iris_data.show()

+---+-------------+------------+-------------+------------+-------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species|
+---+-------------+------------+-------------+------------+-------+
|  1|          5.1|         3.5|          1.4|         0.2|      0|
|  2|          4.9|         3.0|          1.4|         0.2|      0|
|  3|          4.7|         3.2|          1.3|         0.2|      0|
|  4|          4.6|         3.1|          1.5|         0.2|      0|
|  5|          5.0|         3.6|          1.4|         0.2|      0|
|  6|          5.4|         3.9|          1.7|         0.4|      0|
|  7|          4.6|         3.4|          1.4|         0.3|      0|
|  8|          5.0|         3.4|          1.5|         0.2|      0|
|  9|          4.4|         2.9|          1.4|         0.2|      0|
| 10|          4.9|         3.1|          1.5|         0.1|      0|
| 11|          5.4|         3.7|          1.5|         0.2|      0|
| 12|          4.8|         3.4|          1.6|  

In [None]:
# Combine the four measurement columns into one vector column called "features".
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features",
)

# Rename the target column to "label", then keep only the two columns used for training.
iris_data = (
    assembler.transform(iris_data)
    .withColumnRenamed("Species", "label")
    .select("features", "label")
)

In [None]:
# Random split with weights 0.8 (train) and 0.2 (test), using a fixed seed.
train_data, test_data = iris_data.randomSplit(weights=[0.8, 0.2], seed=2023)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Configure the classifier through its setters: input vector column, target
# column, output column, and the cap on optimiser iterations.
logistic_regression = (
    LogisticRegression()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setPredictionCol('prediction')
    .setMaxIter(100)
)

# Fit on the training split only.
model = logistic_regression.fit(train_data)

In [None]:
# Display the fitted model's repr (uid, number of classes, number of features).
model

LogisticRegressionModel: uid=LogisticRegression_06b914faa6d6, numClasses=3, numFeatures=4

In [None]:
# Score the held-out split with the fitted logistic regression model.
predictions = model.transform(test_data)

In [None]:
# Print the scored test rows as a text table.
predictions.show()

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.7,3.2,1.3,0.2]|    0|[9438.32169347803...|       [1.0,0.0,0.0]|       0.0|
|[4.9,3.1,1.5,0.1]|    0|[8794.92750919444...|       [1.0,0.0,0.0]|       0.0|
|[5.0,3.6,1.4,0.2]|    0|[9604.88573039206...|       [1.0,0.0,0.0]|       0.0|
|[5.4,3.9,1.7,0.4]|    0|[8318.84741194077...|       [1.0,0.0,0.0]|       0.0|
|[5.5,2.4,3.7,1.0]|    1|[-711.8096549038,...|       [0.0,1.0,0.0]|       1.0|
|[5.5,2.5,4.0,1.3]|    1|[-1965.4704489795...|       [0.0,1.0,0.0]|       1.0|
|[5.6,3.0,4.1,1.3]|    1|[-1128.9673545014...|       [0.0,1.0,0.0]|       1.0|
|[5.6,3.0,4.5,1.5]|    1|[-2440.3856093591...|       [0.0,1.0,0.0]|       1.0|
|[5.7,2.8,4.1,1.3]|    1|[-1826.3269796827...|       [0.0,1.0,0.0]|       1.0|
|[5.8,2.7,4.1,1.0]|    1|[-1258.7122062005...|      

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that scores the "prediction" column against "label" using accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)

# Compute the metric on the scored test rows and report it.
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9583333333333334
