pyspark classification algorithm

|Algorithm | Description | 
|---|---|
|Logistic Regression| Logistic regression is a statistical model that in its basic form uses a logistic function to model a binary dependent variable.|

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ML").getOrCreate()

24/03/27 17:29:00 WARN Utils: Your hostname, majid resolves to a loopback address: 127.0.1.1; using 192.168.0.230 instead (on interface wlp3s0)
24/03/27 17:29:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/27 17:29:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [50]:

# "tmp/iris.data" is a tab separated file
from pyspark.sql.types import StructField, StringType, IntegerType, StructType,TimestampType,FloatType

schema = StructType([
    StructField("sepal_length", FloatType(), True),
    StructField("sepal_width", FloatType(), True),
    StructField("petal_length", FloatType(), True),
    StructField("petal_width", FloatType(), True),
    StructField("class", StringType(), True)
])
df = spark.read.csv("tmp/iris.data", schema=schema, header=False, sep=",")

In [51]:
df.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [52]:
# find distinct values in the fruit_name column
df.select('class').distinct().show()

+---------------+
|          class|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [53]:
# add new column for the label
from pyspark.sql.functions import when
df = df.withColumn("label", when(df["class"] == "Iris-setosa", 0).when(df["class"] == "Iris-versicolor", 1).otherwise(2))

In [54]:
df.show(5)

+------------+-----------+------------+-----------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|      class|label|
+------------+-----------+------------+-----------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|    0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|    0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|    0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|    0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|    0|
+------------+-----------+------------+-----------+-----------+-----+
only showing top 5 rows



In [55]:
df.printSchema()

root
 |-- sepal_length: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_length: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class: string (nullable = true)
 |-- label: integer (nullable = false)



In [57]:
# feature selection

features_col = df.columns[:-2]
target_col = df.columns[-1]

In [59]:
# logistic regression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

assembler = VectorAssembler(inputCols=features_col, outputCol='features')
df = assembler.transform(df)

In [60]:
df.show(5)

+------------+-----------+------------+-----------+-----------+-----+--------------------+
|sepal_length|sepal_width|petal_length|petal_width|      class|label|            features|
+------------+-----------+------------+-----------+-----------+-----+--------------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|    0|[5.09999990463256...|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|    0|[4.90000009536743...|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|    0|[4.69999980926513...|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|    0|[4.59999990463256...|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|    0|[5.0,3.5999999046...|
+------------+-----------+------------+-----------+-----------+-----+--------------------+
only showing top 5 rows



In [61]:
# apply standard scaler to the features
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
df_scaled = scaler.fit(df).transform(df)

In [62]:
df_scaled.show(5)

+------------+-----------+------------+-----------+-----------+-----+--------------------+--------------------+
|sepal_length|sepal_width|petal_length|petal_width|      class|label|            features|     scaled_features|
+------------+-----------+------------+-----------+-----------+-----+--------------------+--------------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|    0|[5.09999990463256...|[6.15892840615775...|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|    0|[4.90000009536743...|[5.91740202781600...|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|    0|[4.69999980926513...|[5.67587507362994...|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|    0|[4.59999990463256...|[5.55511188445906...|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|    0|[5.0,3.5999999046...|[6.03816521698687...|
+------------+-----------+------------+-----------+-----------+-----+--------------------+--------------

In [66]:
# define the logistic regression model
lr = LogisticRegression(featuresCol='scaled_features',
                        labelCol='label',
                         maxIter=10,
                         regParam=0.3,
                         elasticNetParam=0.01,
                         family='multinomial')

In [67]:
# split the data into training and test sets
train, test = df_scaled.randomSplit([0.8, 0.2])

In [68]:
# fit the model
lr_model = lr.fit(train)

In [70]:
# make predictions
predictions = lr_model.transform(test)


In [71]:
predictions.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    2|       1.0|
+-----+----------+
only showing top 5 rows



In [72]:
predictions.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    2|       1.0|    4|
|    0|       0.0|   11|
|    2|       2.0|   13|
|    1|       1.0|   12|
+-----+----------+-----+



In [73]:
# evaluate the model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9


In [80]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol='scaled_features',
                 labelCol='label', 
                #  smoothing=1.0, 
                #  modelType='multinomial'
                 )

nb_model = nb.fit(train)

In [81]:
# make predictions
predictions = nb_model.transform(test)

In [82]:
predictions.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    2|       1.0|
+-----+----------+
only showing top 5 rows



In [83]:
predictions.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    2|       1.0|   10|
|    0|       0.0|   11|
|    2|       2.0|    7|
|    1|       1.0|   12|
+-----+----------+-----+



In [84]:
# evaluate the model
accuracy = evaluator.evaluate(predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.75
