In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, ClusteringEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Obtain the Spark session used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("IrisClassificationClustering")
    .getOrCreate()
)

## Загрузка и предобработка данных

In [12]:
# Load Iris.csv: the first row is treated as the header and column types are
# inferred from the data rather than read as plain strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Iris.csv")
)

In [13]:
# Encode the string column `Species` as the numeric column `label` expected by
# the classifier below.
indexer = StringIndexer(inputCol="Species", outputCol="label")

# This cell overwrites `df`, so a plain re-run would fail: StringIndexer refuses
# to write into an output column that already exists. Dropping a stale `label`
# first keeps the cell re-runnable; on a fresh kernel the drop is a no-op
# because DataFrame.drop ignores column names that are not in the schema.
unlabeled_df = df.drop("label")
df = indexer.fit(unlabeled_df).transform(unlabeled_df)

In [14]:
# Pack the four measurement columns into the single vector column `features`
# that the Spark ML estimators below consume.
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features"
)

# This cell overwrites `df`, so a plain re-run would fail because the output
# column `features` already exists. Dropping a stale `features` column first
# keeps the cell re-runnable; on a fresh kernel the drop is a no-op because
# DataFrame.drop ignores column names that are not in the schema.
df = assembler.transform(df.drop("features"))

In [15]:
# Randomly partition the rows with weights 0.7 / 0.3 (train / test); the seed
# is fixed so the split can be reproduced.
train_data, test_data = df.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

## Классификация

In [16]:
# Logistic regression over the assembled feature vector, limited to at most
# 10 optimizer iterations, trained on the training split only.
lr = LogisticRegression(
    maxIter=10,
    labelCol="label",
    featuresCol="features",
)
lr_model = lr.fit(train_data)

In [17]:
# Apply the fitted classifier to the held-out split; the result carries the
# model's prediction columns alongside the original ones.
predictions = lr_model.transform(dataset=test_data)

In [18]:
# Compare `prediction` against `label` on the held-out predictions and report
# the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

print(f"Classification Accuracy: {accuracy:.4f}")

Classification Accuracy: 0.9348


## Кластеризация

In [19]:
# K-means with three clusters on the assembled feature vector; the seed is
# fixed so the initialisation can be reproduced. Fitted on the full dataset
# (`df`), not on the train split.
kmeans = KMeans(
    k=3,
    seed=1,
    featuresCol="features",
)
kmeans_model = kmeans.fit(df)

In [20]:
# Assign every row of the full dataset to one of the fitted clusters.
clustered = kmeans_model.transform(dataset=df)

In [21]:
# Silhouette score of the cluster assignments. The arguments below are the
# evaluator's defaults, spelled out so the columns being scored are visible.
clustering_evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)
silhouette = clustering_evaluator.evaluate(clustered)

print(f"Clustering Silhouette Score: {silhouette:.4f}")

Clustering Silhouette Score: 0.7355
