# Spark Linear Regression Demo

## Setup Spark in Colab

In [11]:
# Install Java, Spark 3.3.2 and py4j
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark


In [20]:
# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"


In [21]:
#  Initialize Spark
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Colab Spark MLlib Setup") \
    .getOrCreate()

print("Spark Session started successfully!")


Spark Session started successfully!



# Load the Dataset

In [22]:
# Download dataset (features like population, rooms, income, etc.)
!wget -q https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv -O housing.csv


In [15]:
# Load CSV
df = spark.read.csv("housing.csv", header=True, inferSchema=True)
df.show(5)


+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  -122.25|   37.85|              

# Prepare Data

In [23]:
# Drop rows with missing data
df = df.dropna()

# Assemble features into a single vector
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"],
    outputCol="features"
)
assembled_df = assembler.transform(df).select("features", "median_house_value")
assembled_df.show(5)


+--------------------+------------------+
|            features|median_house_value|
+--------------------+------------------+
|[41.0,880.0,129.0...|          452600.0|
|[21.0,7099.0,1106...|          358500.0|
|[52.0,1467.0,190....|          352100.0|
|[52.0,1274.0,235....|          341300.0|
|[52.0,1627.0,280....|          342200.0|
+--------------------+------------------+
only showing top 5 rows



# Train Model

In [24]:
from pyspark.ml.regression import LinearRegression

# Split data
train_data, test_data = assembled_df.randomSplit([0.8, 0.2], seed=42)

# Train model
lr = LinearRegression(featuresCol="features", labelCol="median_house_value")
model = lr.fit(train_data)

print("Model trained.")


Model trained.


# Make Predictions

In [25]:
predictions = model.transform(test_data)
predictions.select("features", "median_house_value", "prediction").show(5)


+--------------------+------------------+------------------+
|            features|median_house_value|        prediction|
+--------------------+------------------+------------------+
|[1.0,2062.0,343.0...|          191300.0|204606.20679266436|
|[2.0,200.0,20.0,2...|          350000.0| 676768.5676170461|
|[2.0,337.0,55.0,1...|          164800.0|108236.42353609539|
|[2.0,790.0,135.0,...|          166500.0|202867.57818092796|
|[2.0,1658.0,290.0...|          136700.0| 208147.5386566134|
+--------------------+------------------+------------------+
only showing top 5 rows



# Evaluate Model

In [26]:
# Evaluation metrics
training_summary = model.summary

print("\nEvaluation Metrics:")
print(f"RMSE (Root Mean Squared Error): {training_summary.rootMeanSquaredError:.2f}")
print(f"MAE (Mean Absolute Error)     : {training_summary.meanAbsoluteError:.2f}")
print(f"R² (R-squared)                : {training_summary.r2:.2f}")



Evaluation Metrics:
RMSE (Root Mean Squared Error): 75824.69
MAE (Mean Absolute Error)     : 55871.86
R² (R-squared)                : 0.57
