<h2>Lab6: Задачі регресії в Spark MLlib</h2>

In [None]:
from pyspark.sql import SparkSession
import os

# Environment variables set for PySpark before the session is created.
# NOTE(review): SPARK_HOME is an absolute, machine-specific path — adjust it
# when running this notebook on another machine.
os.environ.update({
    'SPARK_HOME': "/home/zaranik/.sdkman/candidates/spark/current",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python3',
})

Створення Spark-сесії

In [None]:
# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name "MLLib".
spark = (
    SparkSession.builder
    .appName("MLLib")
    .getOrCreate()
)

Задання схеми даних

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the CSV file: every column is nullable and listed in
# file order. potential_gross and top_ticket_price are kept as strings
# because they may contain the literal "NA" instead of a number.
schema = (
    StructType()
    .add("week_ending", StringType(), True)
    .add("week_number", IntegerType(), True)
    .add("weekly_gross_overall", IntegerType(), True)
    .add("show", StringType(), True)
    .add("theatre", StringType(), True)
    .add("weekly_gross", IntegerType(), True)
    .add("potential_gross", StringType(), True)
    .add("avg_ticket_price", DoubleType(), True)
    .add("top_ticket_price", StringType(), True)
    .add("seats_sold", IntegerType(), True)
    .add("seats_in_theatre", IntegerType(), True)
    .add("pct_capacity", DoubleType(), True)
    .add("performances", IntegerType(), True)
    .add("previews", IntegerType(), True)
)


Зчитування даних з файлу csv

In [None]:
# Load the CSV with the explicit schema; the first line of the file is a header row.
df = spark.read.csv(
    path="./data/grosses.csv",
    schema=schema,
    header=True,
)
# Print the first rows as a quick sanity check of the parsed columns.
df.show()

Функція для виведення результатів оцінки точності моделювання

In [None]:
def evaluate_prescision(predictions):
  """Compute regression quality metrics for a predictions DataFrame.

  Args:
    predictions: DataFrame holding the true values in column ``label`` and
      the model output in column ``prediction``.

  Returns:
    Tuple ``(mae, mse, rmse, r2)`` as reported by ``RegressionEvaluator``.
  """
  from pyspark.ml.evaluation import RegressionEvaluator

  evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

  # One evaluator is reused; the metric is switched per call through the
  # param-override map, in the same order as the returned tuple.
  metric_names = ("mae", "mse", "rmse", "r2")
  return tuple(
      evaluator.evaluate(predictions, {evaluator.metricName: metric})
      for metric in metric_names
  )

Функція побудови регресії

In [None]:
def get_predictions(num_trees, max_depth, test_fraction=0.2, seed=42):
  """Fit a random-forest regressor for ``weekly_gross`` and print its error metrics.

  Reads the notebook-level DataFrame ``df``, assembles ``seats_sold``,
  ``avg_ticket_price``, ``seats_in_theatre`` and ``pct_capacity`` into a
  feature vector, trains a ``RandomForestRegressor`` and prints MAE, MSE,
  RMSE and R² obtained from ``evaluate_prescision``.

  Args:
    num_trees: Number of trees in the forest (``numTrees``).
    max_depth: Maximum depth of each tree (``maxDepth``).
    test_fraction: Share of rows held out for evaluation. The model is
      trained on the remaining rows so the reported metrics are not measured
      on the training data. Pass ``0`` to train and evaluate on the same
      rows, which is what this function did before the parameter existed.
    seed: Seed used for both the train/test split and the forest, so that
      repeated runs print the same numbers.

  Raises:
    ValueError: If ``test_fraction`` is outside the interval ``[0, 1)``.
  """
  from pyspark.ml.feature import VectorAssembler
  from pyspark.ml.regression import RandomForestRegressor

  if not 0 <= test_fraction < 1:
    raise ValueError(f"test_fraction must be in [0, 1), got {test_fraction!r}")

  assembler = VectorAssembler(
      inputCols=["seats_sold", "avg_ticket_price", "seats_in_theatre", "pct_capacity"],  # Independent variables
      outputCol="features",
      # Every schema column is nullable; drop rows with null/NaN features
      # instead of letting the default "error" mode abort the whole job.
      handleInvalid="skip",
  )

  df_prepared = (
      assembler.transform(df)
      .select("features", "weekly_gross")
      .withColumnRenamed("weekly_gross", "label")  # Rename target column to 'label'
      .filter("label IS NOT NULL AND NOT isnan(label)")
  )

  if test_fraction > 0:
    # Hold out part of the data so the metrics reflect unseen rows.
    train_df, eval_df = df_prepared.randomSplit(
        [1.0 - test_fraction, test_fraction], seed=seed
    )
  else:
    train_df = eval_df = df_prepared

  # Initialize the random forest with a fixed seed for reproducible results
  rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth, seed=seed)

  # Fit the Model
  rf_model = rf.fit(train_df)

  # Make Predictions and score them
  predictions = rf_model.transform(eval_df)
  mae, mse, rmse, r2 = evaluate_prescision(predictions)

  print("*********************************************************")
  print(f"Prediction for readmon forest with num_trees = {num_trees}, max_depth = {max_depth} evalution results: ")
  print(f"Mean Absolute Error (MAE): {mae}")
  print(f"Mean Squared Error (MSE): {mse}")
  print(f"Root Mean Squared Error (RMSE): {rmse}")
  print(f"R-squared (R²): {r2}")

In [None]:
# Train and report three forests of increasing size: (number of trees, max depth).
for trees, depth in [(10, 5), (25, 7), (50, 10)]:
  get_predictions(trees, depth)

In [None]:
# Stop the Spark session created earlier in the notebook; cells that use
# `spark` or `df` cannot be re-run after this without recreating the session.
spark.stop()