In [None]:
!pip install pyspark



# 1. Build a Classification Model with Spark using a dataset of your choice

In [None]:
import urllib.request

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train and evaluate a logistic-regression classifier that predicts the
# "species" column of the iris CSV from its four measurement columns.
spark = SparkSession.builder \
    .appName("Iris Classification") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even when a download, read, or fit step raises.
try:
    # Fetch the CSV to the working directory and load it with a header row
    # and inferred column types.
    url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
    urllib.request.urlretrieve(url, "iris.csv")
    df = spark.read.csv("iris.csv", header=True, inferSchema=True)

    df.show(5)

    # Encode the string "species" column as a numeric "label" column.
    indexer = StringIndexer(inputCol="species", outputCol="label")
    df_indexed = indexer.fit(df).transform(df)

    # Pack the four measurement columns into a single "features" vector column.
    assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="features"
    )
    df_final = assembler.transform(df_indexed).select("features", "label")

    # 80/20 train/test split; the fixed seed keeps the split repeatable.
    train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)

    lr = LogisticRegression(featuresCol="features", labelCol="label")
    lr_model = lr.fit(train_data)

    # Score the held-out rows and show predictions next to the true labels.
    predictions = lr_model.transform(test_data)
    predictions.select("features", "label", "prediction").show()

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print(f"Accuracy: {accuracy:.2f}")
finally:
    spark.stop()


+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,3.0,1.3,0.2]|  0.0|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|       0.0|
|[4.6,3.6,1.0,0.2]|  0.0|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|       1.0|
|[5.0,3.5,1.3,0.3]|  0.0|       0.0|
|[5.1,3.5,1.4,0.2]|  0.0|       0.0|
|[5.3,3.7,1.5,0.2]|  0.0|   

# 2. Build a Clustering Model with Spark using a dataset of your choice


In [None]:
import urllib.request

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Cluster the iris measurements into three groups with k-means, then report
# the clustering evaluator's score and the fitted cluster centers.
spark = SparkSession.builder \
    .appName("Iris Clustering") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even when a download, read, or fit step raises.
try:
    # Fetch the CSV to the working directory and load it with a header row
    # and inferred column types.
    url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
    urllib.request.urlretrieve(url, "iris.csv")

    df = spark.read.csv("iris.csv", header=True, inferSchema=True)
    df.show(5)

    # Pack the four measurement columns into one "features" vector column;
    # the "species" column is dropped, so clustering is unsupervised.
    assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="features"
    )
    df_features = assembler.transform(df).select("features")
    df_features.show(5)

    # k=3 clusters; the fixed seed keeps centroid initialisation repeatable.
    kmeans = KMeans().setK(3).setSeed(1)
    model = kmeans.fit(df_features)

    # Adds a "prediction" column holding each row's assigned cluster index.
    predictions = model.transform(df_features)
    predictions.show(5)

    # Evaluator is used with its default settings.
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print(f"Silhouette Score = {silhouette:.2f}")

    centers = model.clusterCenters()
    print("Cluster Centers:")
    for center in centers:
        print(center)
finally:
    spark.stop()


+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
+-----------------+
only showing top 5 rows

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4

# 3. Build a Recommendation Engine with Spark using a dataset of your choice

In [None]:
import os
import urllib.request
import zipfile

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Fit an ALS collaborative-filtering model on the MovieLens 100k ratings,
# report RMSE on a held-out split, and show the top 5 recommendations per user.
spark = SparkSession.builder \
    .appName("Movie Recommendation Engine") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even when a download, read, or fit step raises.
try:
    # Fetch over HTTPS (the archive is extracted locally, so it should not be
    # pulled over plain HTTP).
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    ratings_path = "ml-100k/u.data"

    # Skip the download and extraction when the ratings file is already
    # present, so re-running the cell does not re-fetch the archive.
    if not os.path.exists(ratings_path):
        urllib.request.urlretrieve(url, "ml-100k.zip")

        with zipfile.ZipFile("ml-100k.zip", "r") as zip_ref:
            zip_ref.extractall()

    # The file is tab-separated and read without a header row; name the four
    # columns explicitly.
    ratings_df = spark.read.csv(ratings_path, sep="\t", inferSchema=True)\
        .toDF("userId", "movieId", "rating", "timestamp")

    ratings_df.show(5)

    # 80/20 train/test split; the fixed seed keeps the split repeatable.
    (training, test) = ratings_df.randomSplit([0.8, 0.2], seed=42)

    # seed pins the factor initialisation so results are repeatable across runs.
    # coldStartStrategy="drop" is set so that rows the model cannot score are
    # left out of the predictions rather than affecting the RMSE.
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        coldStartStrategy="drop",
        seed=42
    )
    model = als.fit(training)

    predictions = model.transform(test)

    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )
    rmse = evaluator.evaluate(predictions)
    print(f"Root-mean-square error = {rmse:.2f}")

    # Top 5 recommended movies for every user.
    userRecs = model.recommendForAllUsers(5)
    userRecs.show(5, truncate=False)
finally:
    spark.stop()


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

Root-mean-square error = 0.92
+------+------------------------------------------------------------------------------------------+
|userId|recommendations                                                                           |
+------+------------------------------------------------------------------------------------------+
|1     |[{169, 5.145367}, {1449, 5.1151657}, {408, 5.0260806}, {114, 4.9943943}, {1129, 4.939193}]|
|2     |[{1449, 5.1401997}, {119, 4.7666917}, {169, 4.6891603}, {318, 4.6781487}, {64, 4.676061}] |
|3     |[{1643, 5.0305285}, {1368, 4.4396415}, {320, 4.178387}, {74, 4.1180644}, {865, 4.080035}] |
|4     |[{1631, 5.883494}, {1449, 5.827