In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark

import os
import functools as reduce
from pyspark.context import SparkContext
from pyspark.sql import DataFrame, SQLContext, SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

conf = pyspark.SparkConf().setAppName('MLLib-Overview').setMaster('local')
sc = pyspark.SparkContext(conf = conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [2]:
spark

In [3]:
# Reading the dataset
df1 = spark.read.csv(r"D:\Data Science\IIITB\Data Engineering - II\Module - 6 (Analytics Using PySpark)\1. Basic EDA using Spark ML Library\auto-miles-per-gallon-Raw+(3).csv", header=True, inferSchema=False)

In [4]:
# Displaying samples
df1.show()

+-----+---------+------------+----------+------+------------+---------+--------------------+
|  MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|
+-----+---------+------------+----------+------+------------+---------+--------------------+
|   18|        8|         307|       130|  3504|          12|       70|chevrolet chevell...|
|   15|        8|         350|       165|  3693|        11.5|       70| buick skylark 320""|
|   18|        8|         318|       150|  3436|          11|       70|plymouth satellite""|
|   16|        8|         304|       150|  3433|          12|       70|     amc rebel sst""|
|   17|        8|         302|       140|  3449|        10.5|       70|       ford torino""|
|   15|        8|         429|      Null|  4341|          10|       70|  ford galaxie 500""|
|   14|        8|         454|       220|  4354|           9|       70|  chevrolet impala""|
|   14|        8|         440|       215|  4312|         8.5|       70

In [5]:
# Printing Schema
df1.printSchema()

root
 |-- MPG: string (nullable = true)
 |-- CYLINDERS: string (nullable = true)
 |-- DISPLACEMENT: string (nullable = true)
 |-- HORSEPOWER: string (nullable = true)
 |-- WEIGHT: string (nullable = true)
 |-- ACCELERATION: string (nullable = true)
 |-- MODELYEAR: string (nullable = true)
 |-- NAME: string (nullable = true)



In [6]:
# Statistical summary
df1.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|summary|               MPG|        CYLINDERS|      DISPLACEMENT|        HORSEPOWER|            WEIGHT|      ACCELERATION|         MODELYEAR|                NAME|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|  count|               406|              406|               406|               406|               406|               406|               406|                 406|
|   mean|23.514572864321615|5.475369458128079| 194.7795566502463|104.56675062972292|2979.4137931034484|15.519704433497521| 75.92118226600985|                null|
| stddev| 7.815984312565783|1.712159631548529|104.92245837948867|  38.1556978120705| 847.0043282393513|2.8033588163425462|3.7487373454558743|                null|
|    min|             

In [7]:
df1.columns

['MPG',
 'CYLINDERS',
 'DISPLACEMENT',
 'HORSEPOWER',
 'WEIGHT',
 'ACCELERATION',
 'MODELYEAR',
 'NAME']

In [8]:
# Casting the numerical columns in Double format
cols = ["MPG", "CYLINDERS", "DISPLACEMENT", "HORSEPOWER", "WEIGHT", "ACCELERATION", "MODELYEAR"]

for i in cols:
    df1 = df1.withColumn(i, df1[i].cast(DoubleType()))

In [9]:
df1.show()

+----+---------+------------+----------+------+------------+---------+--------------------+
| MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|
+----+---------+------------+----------+------+------------+---------+--------------------+
|18.0|      8.0|       307.0|     130.0|3504.0|        12.0|     70.0|chevrolet chevell...|
|15.0|      8.0|       350.0|     165.0|3693.0|        11.5|     70.0| buick skylark 320""|
|18.0|      8.0|       318.0|     150.0|3436.0|        11.0|     70.0|plymouth satellite""|
|16.0|      8.0|       304.0|     150.0|3433.0|        12.0|     70.0|     amc rebel sst""|
|17.0|      8.0|       302.0|     140.0|3449.0|        10.5|     70.0|       ford torino""|
|15.0|      8.0|       429.0|      null|4341.0|        10.0|     70.0|  ford galaxie 500""|
|14.0|      8.0|       454.0|     220.0|4354.0|         9.0|     70.0|  chevrolet impala""|
|14.0|      8.0|       440.0|     215.0|4312.0|         8.5|     70.0| plymouth 

In [10]:
# Handling Missing Values
df2 = df1.dropna()

In [11]:
df2.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|summary|               MPG|         CYLINDERS|      DISPLACEMENT|        HORSEPOWER|            WEIGHT|      ACCELERATION|         MODELYEAR|                NAME|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|  count|               389|               389|               389|               389|               389|               389|               389|                 389|
|   mean|23.505912596401025|5.4575835475578405| 193.1272493573265|103.93830334190231|2970.8817480719795| 15.56992287917736| 76.02570694087403|                null|
| stddev|7.8035440668103035|1.7024467902237224|103.52041672172308|37.849829402174656|  846.593666699619|2.7406345198432867|3.6600880681307015|                null|
|    min|       

In [12]:
# Using Imputer to impute the NULL values
from pyspark.ml.feature import Imputer

# Imputer method automatically replaces null values with mean values
imputer = Imputer(inputCols=["MPG", "HORSEPOWER"], outputCols=["MPG-Out", "HORSEPOWER-Out"])

# Fitting the dataframe into model
imputeModel = imputer.fit(df1)

# Transforming the dataframe
df3 = imputeModel.transform(df1)

In [13]:
df3.show()
df3.describe().show()

+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+
| MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|           MPG-Out|    HORSEPOWER-Out|
+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+
|18.0|      8.0|       307.0|     130.0|3504.0|        12.0|     70.0|chevrolet chevell...|              18.0|             130.0|
|15.0|      8.0|       350.0|     165.0|3693.0|        11.5|     70.0| buick skylark 320""|              15.0|             165.0|
|18.0|      8.0|       318.0|     150.0|3436.0|        11.0|     70.0|plymouth satellite""|              18.0|             150.0|
|16.0|      8.0|       304.0|     150.0|3433.0|        12.0|     70.0|     amc rebel sst""|              16.0|             150.0|
|17.0|      8.0|       302.0|     140.0|3449.0|        10.5|     70.0|       ford torino""

In [14]:
# Removing the unnecessary columns
df3.drop(df3["MPG"])
df3.drop(df3["HORSEPOWER"])

DataFrame[MPG: double, CYLINDERS: double, DISPLACEMENT: double, WEIGHT: double, ACCELERATION: double, MODELYEAR: double, NAME: string, MPG-Out: double, HORSEPOWER-Out: double]

### Feature Transformers
Feature Transformers transforms the data stores in a dataframe and store the data back as a new dataframe. This transformation generally takes place by appending one or more colums to the existing data frame.

#### Vector Assembler
VectorAssembler is a transformer that combines a given list of columns nto a singles vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like Logistic Regression and Decision Trees

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
                inputCols=["CYLINDERS", "WEIGHT", "HORSEPOWER-Out", "DISPLACEMENT"],
                outputCol="features")

output = assembler.transform(df3)

#### StandardScaler
StandardScaler is an Estimator which can be fit on a dataset to produce a StandardScalerModel. It transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean

In [16]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol= "features", outputCol= "scaledFeatures",
                       withStd=True, withMean=False)

In [17]:
Scaled_Model = scaler.fit(output)

In [18]:
scaled_data = Scaled_Model.transform(output)

In [19]:
scaled_data.show()

+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+--------------------+--------------------+
| MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|           MPG-Out|    HORSEPOWER-Out|            features|      scaledFeatures|
+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+--------------------+--------------------+
|18.0|      8.0|       307.0|     130.0|3504.0|        12.0|     70.0|chevrolet chevell...|              18.0|             130.0|[8.0,3504.0,130.0...|[4.67246152320771...|
|15.0|      8.0|       350.0|     165.0|3693.0|        11.5|     70.0| buick skylark 320""|              15.0|             165.0|[8.0,3693.0,165.0...|[4.67246152320771...|
|18.0|      8.0|       318.0|     150.0|3436.0|        11.0|     70.0|plymouth satellite""|              18.0|             150.0|[8.0,3436.0

#### Pipeline
A pipeline is a series of activities or transformations. Machine learning is performed by implementing a set of tasks, that is collecting the data, cleaning the data, building the model, evaluating the model, etc.

In [20]:
from pyspark.ml import Pipeline

In [21]:
# Initialize the dataframe
data = df1

In [22]:
# Setting the stages of the pipeline
pipeline = Pipeline(stages = [imputer, assembler, scaler])

In [23]:
# Using .fit() and .transform() on the pipeline
model = pipeline.fit(data)
Final_output = model.transform(data)

In [24]:
# Output of the pipeline
Final_output.show()

+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+--------------------+--------------------+
| MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|           MPG-Out|    HORSEPOWER-Out|            features|      scaledFeatures|
+----+---------+------------+----------+------+------------+---------+--------------------+------------------+------------------+--------------------+--------------------+
|18.0|      8.0|       307.0|     130.0|3504.0|        12.0|     70.0|chevrolet chevell...|              18.0|             130.0|[8.0,3504.0,130.0...|[4.67246152320771...|
|15.0|      8.0|       350.0|     165.0|3693.0|        11.5|     70.0| buick skylark 320""|              15.0|             165.0|[8.0,3693.0,165.0...|[4.67246152320771...|
|18.0|      8.0|       318.0|     150.0|3436.0|        11.0|     70.0|plymouth satellite""|              18.0|             150.0|[8.0,3436.0