<a href="https://colab.research.google.com/github/ItishaK/AI_Colab_WS/blob/main/Spark_MLlib_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: https://github.com/krishnaik06/Pyspark-With-Python

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=0c7c78c143a0531228a34f5358f64679372b2bd58c757d5834430b6f34941c0b
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession

In [4]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('MLlib_Practice')
    .getOrCreate()
)

In [8]:
# Load the CSV (path is relative to the working directory).
# The first row supplies the column names and column types are inferred.
training = spark.read.csv(
    'Test1_ds.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Print the loaded rows as a table to sanity-check the CSV parse.
training.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Riya| 23|         3| 30000|
|   Sam| 29|         5| 50000|
|Joseph| 30|         4| 40000|
|   Raj| 35|         1| 10000|
| James| 26|         7| 45000|
|  Mary| 31|         6| 60000|
|   Sia| 28|         8| 80000|
+------+---+----------+------+



In [10]:
# Print the inferred schema: column names, types and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
# Column names available for choosing the feature and label columns.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [12]:
# MLlib estimators expect all input features packed into one vector column.
# Combine Age and Experience into a single 'Independent Features' vector;
# Salary stays separate as the label.
from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(
    outputCol='Independent Features',
    inputCols=['Age', 'Experience'],
)

In [13]:
# Apply the assembler: the result keeps the original columns and adds the
# 'Independent Features' vector column built from Age and Experience.
output = feature_assembler.transform(training)

In [14]:
# Inspect the assembled rows; each feature vector is [Age, Experience].
output.show()

+------+---+----------+------+--------------------+
|  Name|Age|Experience|Salary|Independent Features|
+------+---+----------+------+--------------------+
|  Riya| 23|         3| 30000|          [23.0,3.0]|
|   Sam| 29|         5| 50000|          [29.0,5.0]|
|Joseph| 30|         4| 40000|          [30.0,4.0]|
|   Raj| 35|         1| 10000|          [35.0,1.0]|
| James| 26|         7| 45000|          [26.0,7.0]|
|  Mary| 31|         6| 60000|          [31.0,6.0]|
|   Sia| 28|         8| 80000|          [28.0,8.0]|
+------+---+----------+------+--------------------+



In [15]:
# Confirm the new vector column was appended to the original columns.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [16]:
# Keep only what the regression needs: the feature vector and the Salary label.
finalized_data = output.select("Independent Features", "Salary")

In [17]:
# Inspect the (features, label) pairs that will be split into train and test sets.
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,3.0]| 30000|
|          [29.0,5.0]| 50000|
|          [30.0,4.0]| 40000|
|          [35.0,1.0]| 10000|
|          [26.0,7.0]| 45000|
|          [31.0,6.0]| 60000|
|          [28.0,8.0]| 80000|
+--------------------+------+



In [19]:
from pyspark.ml.regression import LinearRegression

# 75/25 train/test split. The seed fixes the random draw so that, for the same
# input data, the split and every metric computed from it are repeatable across
# Restart & Run All. Without a seed each run produces a different split.
# NOTE(review): with so few rows the split sizes are sensitive to the seed;
# confirm the test split is non-empty after changing it.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name rather than overwriting it
# with the fitted model, so one name never refers to two kinds of object.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')

# `regressor` is the fitted model; later cells read its coefficients and
# intercept and call evaluate() on it.
regressor = lr_estimator.fit(train_data)

In [20]:
# Fitted weights, one per entry of the feature vector ([Age, Experience]).
regressor.coefficients

DenseVector([-0.0, 10000.0])

In [21]:
# Fitted intercept (bias) term of the linear model.
regressor.intercept

0.0

In [22]:
# Evaluate the fitted model on the held-out split. The returned summary
# exposes the per-row predictions and the error metrics read in later cells.
pred_results = regressor.evaluate(test_data)

In [23]:
# Show each held-out row's actual Salary next to the model's prediction.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [26.0,7.0]| 45000|70000.00000000001|
|          [29.0,5.0]| 50000|          50000.0|
|          [30.0,4.0]| 40000|39999.99999999999|
+--------------------+------+-----------------+



In [24]:
# Mean absolute error and mean squared error on the held-out split, as a pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(8333.333333333341, 208333333.33333358)