In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession 
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA 

In [2]:
spark=SparkSession.builder.appName('num_of_hacker').getOrCreate()
df=spark.read.csv("D:\\MEPCO\\SEMESTER 5\\Machine Learning Essentials\\iris.csv",header=True,inferSchema=True)
from pyspark.ml.feature import VectorAssembler,PCA
df.columns

['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

In [3]:
assembler = VectorAssembler(inputCols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'] , outputCol = 'features')
df = assembler.transform(df)
df.show(6)

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
+---+-------------+------------+-------------+------------+-----------+-----------------+
only showing top 6 rows



In [4]:
scaler = StandardScaler(
    inputCol = 'features', 
    outputCol = 'scaledFeatures',
    withMean = True,
    withStd = True
).fit(df)

In [5]:
df_scaled = scaler.transform(df)
df_scaled.show(6)

+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|      scaledFeatures|
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[-0.8976738791967...|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[-1.1392004834649...|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[-1.3807270877331...|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[-1.5014903898672...|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|[-1.0184371813308...|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|[-0.5353839727944...|
+

In [6]:
n_components = 2
pca = PCA(
    k = n_components, 
    inputCol = 'scaledFeatures', 
    outputCol = 'pcaFeatures'
).fit(df_scaled)
df_pca = pca.transform(df_scaled)
print('Explained Variance Ratio', pca.explainedVariance.toArray())
df_pca.show(6)

Explained Variance Ratio [0.72770452 0.23030523]
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|      scaledFeatures|         pcaFeatures|
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+--------------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[-0.8976738791967...|[2.25698063306802...|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[-1.1392004834649...|[2.07945911889540...|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[-1.3807270877331...|[2.36004408158420...|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[-1.5014903898672...|[2.29650366000388...|
|  5|          5.0|        

In [7]:
X_pca = df_pca.rdd.map(lambda row: row.pcaFeatures).collect()
X_pca = np.array(X_pca)
X_pca

array([[ 2.25698063e+00, -5.04015404e-01],
       [ 2.07945912e+00,  6.53216394e-01],
       [ 2.36004408e+00,  3.17413945e-01],
       [ 2.29650366e+00,  5.73446613e-01],
       [ 2.38080159e+00, -6.72514411e-01],
       [ 2.06362348e+00, -1.51347827e+00],
       [ 2.43754534e+00, -7.43137171e-02],
       [ 2.22638327e+00, -2.46787172e-01],
       [ 2.33413810e+00,  1.09148977e+00],
       [ 2.18136797e+00,  4.47131117e-01],
       [ 2.15626287e+00, -1.06702096e+00],
       [ 2.31960686e+00, -1.58057946e-01],
       [ 2.21665672e+00,  7.06750478e-01],
       [ 2.63090249e+00,  9.35149145e-01],
       [ 2.18497165e+00, -1.88366805e+00],
       [ 2.24394778e+00, -2.71328133e+00],
       [ 2.19539570e+00, -1.50869601e+00],
       [ 2.18286636e+00, -5.12587094e-01],
       [ 1.88775015e+00, -1.42633236e+00],
       [ 2.33213620e+00, -1.15416686e+00],
       [ 1.90816387e+00, -4.29027880e-01],
       [ 2.19728429e+00, -9.49277150e-01],
       [ 2.76490710e+00, -4.87882574e-01],
       [ 1.