## ML Pipeline in spark 2

Performs ETL and ML on the NASA airfoil noise data.

In [1]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; installed below in place of ``warnings.warn``."""
    return None


# Install a blanket "ignore" filter and replace the module attribute
# warnings.warn with the no-op defined above.
warnings.filterwarnings('ignore')
warnings.warn = warn


import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler

### Part 1: ETL

In [2]:
# Create the Spark session (or reuse one that already exists) under this app name.
spark = (
    SparkSession.builder
    .appName("Airfoil_Noise_project")
    .getOrCreate()
)

24/06/10 15:21:36 WARN Utils: Your hostname, ubuntu-MS-7D15 resolves to a loopback address: 127.0.1.1; using 192.168.1.3 instead (on interface enp5s0)
24/06/10 15:21:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/10 15:21:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
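# One-time download of the raw dataset: uncomment and run this line if
# NASA_airfoil_noise_raw.csv is not already in the working directory.
# !wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-BD0231EN-Coursera/datasets/NASA_airfoil_noise_raw.csv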

In [4]:
# Load the raw CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    "NASA_airfoil_noise_raw.csv",
    header=True,
    inferSchema=True,
)

# Preview the first five rows.
df.show(5)

+---------+-------------+-----------+------------------+-----------------------+----------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevel|
+---------+-------------+-----------+------------------+-----------------------+----------+
|      800|          0.0|     0.3048|              71.3|             0.00266337|   126.201|
|     1000|          0.0|     0.3048|              71.3|             0.00266337|   125.201|
|     1250|          0.0|     0.3048|              71.3|             0.00266337|   125.951|
|     1600|          0.0|     0.3048|              71.3|             0.00266337|   127.591|
|     2000|          0.0|     0.3048|              71.3|             0.00266337|   127.461|
+---------+-------------+-----------+------------------+-----------------------+----------+
only showing top 5 rows



In [5]:
# Rename the label column from "SoundLevel" to "SoundLevelDecibels"; the later
# pipeline and evaluator cells refer to it by the new name.
df = df.withColumnRenamed("SoundLevel","SoundLevelDecibels")

#### Duplicates and missing values

In [6]:
# Row count before any cleaning; reused in the Part 1 summary.
rowcount1 = df.count()
print(rowcount1)

1522


In [7]:
# Remove rows that are exact duplicates of another row (all columns compared).
df = df.dropDuplicates()

In [8]:
# Row count after dropping duplicate rows; reused in the Part 1 summary.
rowcount2 = df.count()
print(rowcount2)

1503


In [9]:
# Discard every row that contains a null in any column.
df = df.dropna()

In [10]:
# Row count after dropping rows with nulls as well; reused in the Part 1 summary.
rowcount3 = df.count()
print(rowcount3)

1499


In [11]:
# Persist the cleaned data as parquet, replacing any output left by an earlier run.
df.write.parquet("NASA_airfoil_noise_cleaned.parquet", mode="overwrite")

24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/06/10 15:21:56 WARN MemoryManager: Total allocation exceeds 95.

In [12]:
import os

print("Part 1 - Evaluation")

# (label, value) pairs for the Part 1 summary, printed in this order using
# print's default single-space separator.
part1_summary = [
    ("Total rows = ", rowcount1),
    ("Total rows after dropping duplicate rows = ", rowcount2),
    ("Total rows after dropping duplicate rows and rows with null values = ", rowcount3),
    # The renamed label column is the last column of the frame.
    ("New column name = ", df.columns[-1]),
    # The parquet output is a directory, hence isdir rather than isfile.
    ("NASA_airfoil_noise_cleaned.parquet exists :", os.path.isdir("NASA_airfoil_noise_cleaned.parquet")),
]

for label, value in part1_summary:
    print(label, value)

Part 1 - Evaluation
Total rows =  1522
Total rows after dropping duplicate rows =  1503
Total rows after dropping duplicate rows and rows with null values =  1499
New column name =  SoundLevelDecibels
NASA_airfoil_noise_cleaned.parquet exists : True


### Part 2: ML pipeline

In [13]:
# Reload the cleaned data from the parquet output and record its row count,
# which is reused in the Part 2 summary.
df = spark.read.parquet("NASA_airfoil_noise_cleaned.parquet")
rowcount4 = df.count()
print(rowcount4)

1499


In [14]:
# Pipeline stage 1: gather the five input columns into one vector column
# named "features".
assembler = VectorAssembler(
    inputCols=[
        'Frequency',
        'AngleOfAttack',
        'ChordLength',
        'FreeStreamVelocity',
        'SuctionSideDisplacement',
    ],
    outputCol="features",
)

#### standard scaler

In [15]:
# Pipeline stage 2: scale the assembled "features" vector into "scaledFeatures".
# No scaler options are set here, so StandardScaler's defaults apply.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [16]:
# Preview the first five rows of the reloaded, cleaned data.
df.show(5)

+---------+-------------+-----------+------------------+-----------------------+------------------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevelDecibels|
+---------+-------------+-----------+------------------+-----------------------+------------------+
|     1250|          0.0|     0.3048|              39.6|             0.00310138|           125.499|
|     2000|          2.0|     0.2286|              39.6|             0.00346574|           122.797|
|     8000|          4.0|     0.2286|              71.3|             0.00400603|           112.848|
|      315|          7.3|     0.2286|              39.6|              0.0123481|           132.149|
|      400|          7.3|     0.2286|              39.6|              0.0123481|           132.039|
+---------+-------------+-----------+------------------+-----------------------+------------------+
only showing top 5 rows



#### pipeline

In [17]:
# Pipeline stage 3: linear regression on the scaled feature vector, predicting
# the SoundLevelDecibels column.
lr = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="SoundLevelDecibels",
)

# Chain the stages in execution order: assemble -> scale -> regress.
pipeline = Pipeline(
    stages=[
        assembler,
        scaler,
        lr,
    ]
)

#### test train split

In [18]:
# Split the data 70% / 30% into training and testing sets, with a fixed seed
# for the random split.
trainingData, testingData = df.randomSplit(weights=[0.7, 0.3], seed=42)

#### fit pipeline

In [19]:
# Fit all pipeline stages on the training split only; the result is the fitted
# pipeline used for prediction and inspection in the later cells.
pipelineModel = pipeline.fit(trainingData)

24/06/10 15:21:57 WARN Instrumentation: [f6855407] regParam is zero, which might cause numerical instability and overfitting.
24/06/10 15:21:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/06/10 15:21:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/06/10 15:21:57 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/06/10 15:21:57 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [20]:
print("Part 2 - Evaluation")
print("Total rows = ", rowcount4)

# Take each stage's name from its class instead of parsing the text of
# str(stage), so the result does not depend on how a stage renders itself.
ps = [type(stage).__name__ for stage in pipeline.getStages()]

# Report every stage in order, numbered from 1, however many stages there are
# (no hard-coded indices to fall out of step with the pipeline definition).
for position, stage_name in enumerate(ps, start=1):
    print(f"Pipeline Stage {position} = ", stage_name)

print("Label column = ", lr.getLabelCol())

Part 2 - Evaluation
Total rows =  1499
Pipeline Stage 1 =  VectorAssembler
Pipeline Stage 2 =  StandardScaler
Pipeline Stage 3 =  LinearRegression
Label column =  SoundLevelDecibels


### Part 3: Evaluation

In [21]:
# Run the fitted pipeline on the held-out testing split; the evaluator cell
# reads the resulting "prediction" column.
predictions = pipelineModel.transform(testingData)

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

# A single evaluator compares "prediction" with the label column; it is
# re-pointed at each metric in turn rather than being rebuilt for each one.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="SoundLevelDecibels")

# Mean squared error on the test predictions.
mse = evaluator.setMetricName("mse").evaluate(predictions)
print(mse)

# Mean absolute error on the test predictions.
mae = evaluator.setMetricName("mae").evaluate(predictions)
print(mae)

# R squared on the test predictions.
r2 = evaluator.setMetricName("r2").evaluate(predictions)
print(r2)

24.033521241590137
3.824256678479536
0.5003729961991632


In [23]:
# The fitted linear regression model is the last stage of the fitted pipeline.
lrModel = pipelineModel.stages[-1]

print("Part 3 - Evaluation")

# Each figure is rounded to two decimals and printed in the listed order.
part3_summary = (
    ("Mean Squared Error = ", mse),
    ("Mean Absolute Error = ", mae),
    ("R Squared = ", r2),
    ("Intercept = ", lrModel.intercept),
)
for label, value in part3_summary:
    print(label, round(value, 2))

Part 3 - Evaluation
Mean Squared Error =  24.03
Mean Absolute Error =  3.82
R Squared =  0.5
Intercept =  132.72


In [24]:
print("Part 4 - Evaluation")

# Persist the fitted pipeline and read it back, so that everything reported
# below describes a model restored from disk rather than the in-memory object.
# overwrite() replaces output left at this path by an earlier run.
MODEL_PATH = "NASA_airfoil_noise_pipeline_model"
pipelineModel.write().overwrite().save(MODEL_PATH)
loadedPipelineModel = PipelineModel.load(MODEL_PATH)

# Last stage = the fitted linear regression; first stage = the vector
# assembler, whose input columns line up one-to-one with the coefficients.
loadedmodel = loadedPipelineModel.stages[-1]
totalstages = len(loadedPipelineModel.stages)
inputcolumns = loadedPipelineModel.stages[0].getInputCols()

print("Number of stages in the pipeline = ", totalstages)
# The regression was fitted on the scaled feature vector, so these
# coefficients apply to the scaled inputs rather than the raw column values.
for column_name, coefficient in zip(inputcolumns, loadedmodel.coefficients):
    print(f"Coefficient for {column_name} is {round(coefficient,4)}")

Part 4 - Evaluation
Number of stages in the pipeline =  3
Coefficient for Frequency is -4.033
Coefficient for AngleOfAttack is -2.2639
Coefficient for ChordLength is -3.269
Coefficient for FreeStreamVelocity is 1.5076
Coefficient for SuctionSideDisplacement is -2.0833


In [25]:
# Display the full, unrounded coefficient vector of the linear regression stage.
loadedmodel.coefficients

DenseVector([-4.033, -2.2639, -3.269, 1.5076, -2.0833])