In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import rand

# Step 2: Initialize Spark Session
spark = SparkSession.builder.appName("RegressionLab").getOrCreate()

# Step 3: Simulate a dataset with random values
data = spark.range(0, 1000).withColumn("feature1", rand() * 100)\
                            .withColumn("feature2", rand() * 50)\
                            .withColumn("label", rand() * 200)

# Step 4: Feature Engineering using VectorAssembler
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
assembled_data = assembler.transform(data).select("features", "label")

# Step 5: Split data into training and test sets (80/20 split)
train_data, test_data = assembled_data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Initialize and train the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)

# Step 7: Make predictions on test data
predictions = lr_model.transform(test_data)

# Step 8: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Manually calculate Explained Variance
explained_variance = predictions.selectExpr(
    "variance(prediction) as var_pred", "variance(label) as var_label"
).collect()
ev = explained_variance[0]['var_pred'] / explained_variance[0]['var_label']

# Step 9: Print results
print("===== Regression Evaluation Metrics =====")
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² (Coefficient of Determination): {r2}")
print(f"Explained Variance (EV): {ev}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/06 14:44:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/06 14:44:41 WARN Instrumentation: [048f6d40] regParam is zero, which might cause numerical instability and overfitting.
25/05/06 14:44:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/06 14:44:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


===== Regression Evaluation Metrics =====
Coefficients: [0.06584566324781785,0.16354174977667882]
Intercept: 91.08900531695903
Mean Squared Error (MSE): 3271.364906774294
Mean Absolute Error (MAE): 49.114648701604395
Root Mean Squared Error (RMSE): 57.1958469364192
R² (Coefficient of Determination): -0.010337607357987055
Explained Variance (EV): 0.0029260124880602714
