<h2>Lab7: Задачі кластеризації в Spark MLlib</h2>

In [None]:
from pyspark.sql import SparkSession
import os

# Environment variables for the Spark / PySpark launcher.
# SPARK_HOME: respect a value that is already exported in the environment and
# only fall back to the original author's local SDKMAN install when it is unset,
# so the notebook does not clobber a working configuration on another machine.
os.environ.setdefault('SPARK_HOME', "/home/zaranik/.sdkman/candidates/spark/current")
# Driver-side launcher settings (kept exactly as in the original notebook).
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
# Python executable name PySpark should use.
os.environ['PYSPARK_PYTHON'] = 'python3'

Створення Spark-сесії

In [None]:
# Obtain the SparkSession for this notebook (reuses an active one if present),
# registering the application under the name "MLLib".
spark = (
    SparkSession.builder
    .appName("MLLib")
    .getOrCreate()
)

Задання схеми даних

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the grosses CSV: one nullable field per column,
# listed in file order as (column name, Spark type) pairs.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("week_ending", StringType()),
        ("week_number", IntegerType()),
        ("weekly_gross_overall", IntegerType()),
        ("show", StringType()),
        ("theatre", StringType()),
        ("weekly_gross", IntegerType()),
        # Kept as a string: per the original note, NA values are treated as StringType.
        ("potential_gross", StringType()),
        ("avg_ticket_price", DoubleType()),
        # Kept as a string: per the original note, NA values are treated as StringType.
        ("top_ticket_price", StringType()),
        ("seats_sold", IntegerType()),
        ("seats_in_theatre", IntegerType()),
        ("pct_capacity", DoubleType()),
        ("performances", IntegerType()),
        ("previews", IntegerType()),
    ]
])


Зчитування даних з файлу csv

In [None]:
# Load the CSV with the explicit schema (first row is the header), then replace
# nulls with 0.0 in the columns that are later used as clustering features.
df = (
    spark.read
    .csv("./data/grosses.csv", header=True, schema=schema)
    .na.fill(dict.fromkeys(
        [
            "weekly_gross",
            "weekly_gross_overall",
            "pct_capacity",
            "performances",
            "seats_sold",
        ],
        0.0,
    ))
)
df.show()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Numeric columns that make up each feature vector used for clustering.
input_columns = [
    "weekly_gross",
    "weekly_gross_overall",
    "pct_capacity",
    "performances",
    "seats_sold",
]

# Pack those columns into a single vector column named 'features'.
vector_assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol="features",
)

# Keep only the feature vector together with the 'show' column.
data = vector_assembler.transform(df).select("features", "show")

# Print the assembled rows without truncating cell contents.
data.show(truncate=False)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Baseline clustering: K-Means with 3 clusters on the raw feature vectors.
# The seed is fixed so repeated runs use the same initialization.
kmeans = KMeans(
    featuresCol="features",
    k=3,
    seed=42,
)

# Silhouette evaluator reading the feature vectors and the 'prediction' column.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

# Fit the model, then assign every row to a cluster.
model = kmeans.fit(data)
predictions = model.transform(data)

# Score the resulting assignment and report it.
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Display the rows with their assigned cluster, untruncated.
predictions.show(truncate=False)

Silhouette Score використовується для оцінки якості кластеризації: він набуває значень від −1 до 1, і чим ближче до 1, тим кращий результат. Оскільки початковий результат досить далекий від 1, спробуємо покращити його, збільшивши кількість кластерів.

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Second attempt: same K-Means setup on the raw feature vectors,
# but with the number of clusters raised to 9 (seed still fixed).
kmeans = KMeans(
    featuresCol="features",
    k=9,
    seed=42,
)

# Silhouette evaluator reading the feature vectors and the 'prediction' column.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

# Fit the 9-cluster model, then assign every row to a cluster.
model = kmeans.fit(data)
predictions = model.transform(data)

# Score the resulting assignment and report it.
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Display the rows with their assigned cluster, untruncated.
predictions.show(truncate=False)

Стандартизуємо дані (нульове середнє, одиничне стандартне відхилення) перед кластеризацією

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Step 1: Standardize the features (centered with withMean=True, scaled to unit
# standard deviation with withStd=True) so that no single column dominates
# the distance computation purely because of its magnitude.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scaled_data = scaler.fit(data).transform(data)

# Step 2: Apply K-Means clustering on the standardized features
kmeans = KMeans(featuresCol="scaledFeatures", k=3, seed=42)
model = kmeans.fit(scaled_data)

# Step 3: Predict clusters for the data
predictions = model.transform(scaled_data)

# Step 4: Evaluate the clustering model.
# The silhouette must be computed in the same feature space the model was
# trained in ("scaledFeatures"); scoring against the raw "features" column
# would measure distances the clustering never used.
# Note: this score is therefore in standardized space and is not directly
# comparable with silhouettes computed on the raw features.
evaluator = ClusteringEvaluator(featuresCol="scaledFeatures", predictionCol="prediction", metricName="silhouette")
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette Score: {silhouette}")

# Step 5: Show the predictions
predictions.show(truncate=False)

In [None]:
# Stop the Spark session now that all cells above have finished their work.
spark.stop()