# **HW3.2:  Regression with Decision Trees**  

(Cập nhật lần cuối: 4/5/2025)  

Họ tên: Nguyễn Lê Tấn Phát  

MSSV: 22120262

---


# **Prepare environment**

In [None]:
# install java
!apt-get updatec
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# install spark (change the version number if needed)
!wget -q https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
# unzip the spark file to the current folder
!tar xf spark-3.5.5-bin-hadoop3.tgz

In [None]:
# Export the Java and Spark install locations so that the Spark launcher
# scripts and findspark can discover them.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.5-bin-hadoop3",
    }
)

In [None]:
# start pyspark
!pip install findspark
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Configure a single-node ("local") session; getOrCreate() reuses an already
# running session instead of starting a second one.
session_builder = (
    SparkSession.builder
    .master("local")
    .appName("Spark APIs Exercises")
    .config("spark.some.config.option", "some-value")
)
spark = session_builder.getOrCreate()

# Handle to the underlying SparkContext (entry point of the RDD API).
sc = spark.sparkContext

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import hour, dayofweek, month
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

import datetime

# **Prepare data**

In [None]:
# Optional, Google Colab only: uncomment ONE of the two snippets below to make
# the CSV files available in the working directory (mount Drive, or upload).
# from google.colab import drive
# drive.mount('/content/drive')
# from google.colab import files
# uploaded = files.upload()

# **3.2.1: Structured API Implementation (High-Level)**

## Read data

In [None]:
# Both files have a header row; let Spark infer the column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
train_rawData = csv_reader.csv("train.csv")
test_rawData = csv_reader.csv("test.csv")

## Train-val split

In [None]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the
# split reproducible between runs.
train_df, val_df = train_rawData.randomSplit(weights=[0.8, 0.2], seed=42)

## Pre-process data

Drop the `id` column (it is not needed for training)

In [None]:
# Strip the row identifier from every split; the test split starts from the
# raw test frame.
train_df, val_df, test_df = (
    frame.drop("id") for frame in (train_df, val_df, test_rawData)
)

Handle the `pickup_datetime` column, which has the timestamp data type

In [None]:
def add_pickup_time_features(df):
    """Return `df` with calendar features derived from `pickup_datetime`.

    Adds `pickup_hour`, `pickup_dayofweek` and `pickup_month`, then drops the
    raw `pickup_datetime` / `dropoff_datetime` columns (`drop` silently ignores
    a column that is not present).

    Spark DataFrames are immutable, so the transformed frame must be returned
    and re-assigned by the caller. The previous `for df in [...]` loop only
    rebound its loop variable, so the new columns never reached
    train_df / val_df / test_df.
    """
    return (
        df.withColumn("pickup_hour", hour("pickup_datetime"))
          .withColumn("pickup_dayofweek", dayofweek("pickup_datetime"))
          .withColumn("pickup_month", month("pickup_datetime"))
          .drop("pickup_datetime", "dropoff_datetime")
    )


train_df = add_pickup_time_features(train_df)
val_df = add_pickup_time_features(val_df)
test_df = add_pickup_time_features(test_df)

Handle the `store_and_fwd_flag` column, which has the string data type

In [None]:
# Learn the string -> index mapping ONCE, on the training split, and reuse the
# fitted model everywhere. Re-fitting on each split can assign different
# indices to the same flag value, which silently corrupts the feature.
# handleInvalid="keep" sends a value never seen during fitting to an extra
# index instead of failing at transform time.
indexer = StringIndexer(
    inputCol="store_and_fwd_flag",
    outputCol="store_and_fwd_flag_index",
    handleInvalid="keep",
)
indexer_model = indexer.fit(train_df)

train_df = indexer_model.transform(train_df).drop("store_and_fwd_flag")
val_df = indexer_model.transform(val_df).drop("store_and_fwd_flag")
test_df = indexer_model.transform(test_df).drop("store_and_fwd_flag")

Select the feature columns (every column except the label `trip_duration`)

In [None]:
# Every remaining column except the label is used as a model input.
inputCols = [name for name in train_df.columns if name != "trip_duration"]

Assemble numeric feature

In [None]:
# Pack the individual feature columns into one vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=inputCols,
)

Feature indexing - handle categorical features automatically

In [None]:
# Features with <= 4 distinct values are treated as categorical; the rest stay
# continuous. The indexer is fitted on the assembled training vectors.
vector_indexer = VectorIndexer(
    maxCategories=4,
    outputCol="indexedFeatures",
    inputCol="features",
)
assembled_train = assembler.transform(train_df)
feature_indexer = vector_indexer.fit(assembled_train)

## Train the Decision Tree Regressor model using MLlib

Define Decision Tree model with parameters

In [None]:
# Regression tree on the indexed feature vector.
#   impurity="variance"     - split criterion for regression
#   maxDepth=5              - caps model complexity
#   minInstancesPerNode=10  - guards against over-fitting tiny leaves
dt = DecisionTreeRegressor(
    labelCol="trip_duration",
    featuresCol="indexedFeatures",
    impurity="variance",
    maxDepth=5,
    minInstancesPerNode=10,
)

Create pipeline

In [None]:
# Stage order matters: assemble the vector, index categorical slots, then fit
# the tree on the indexed vector.
pipeline_stages = [assembler, feature_indexer, dt]
pipeline = Pipeline(stages=pipeline_stages)

Train model

In [None]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=train_df)

Make predictions on validation

In [None]:
# Run the fitted pipeline on the held-out split; adds a "prediction" column.
val_predictions = model.transform(dataset=val_df)

## Model evaluation

Create evaluators for different metrics

In [None]:
# One evaluator per metric; both compare "prediction" with "trip_duration".
rmse_evaluator, r2_evaluator = (
    RegressionEvaluator(
        labelCol="trip_duration",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("rmse", "r2")
)

Analyze model structure and feature importance

In [None]:
# The fitted DecisionTreeRegressor is the third stage (index 2) of the pipeline.
tree_model = model.stages[2]
importances = tree_model.featureImportances

print("\nDecision Tree Model Summary:")
print("Depth:", tree_model.depth)
print("Number of Nodes:", tree_model.numNodes)
print("Feature importances:")
# Importances are paired with the input columns by position.
for feature_name, importance in zip(inputCols, importances):
    print(f"- {feature_name}: {importance:.2f}")


Decision Tree Model Summary:
Depth: 5
Number of Nodes: 63
Feature importances:
- vendor_id: 0.04
- passenger_count: 0.00
- pickup_longitude: 0.33
- pickup_latitude: 0.10
- dropoff_longitude: 0.33
- dropoff_latitude: 0.20
- store_and_fwd_flag_index: 0.00


Evaluate on validation set

In [None]:
# Score the validation predictions with each evaluator (RMSE first, then R²).
val_rmse, val_r2 = (
    evaluator.evaluate(val_predictions)
    for evaluator in (rmse_evaluator, r2_evaluator)
)

In [None]:
# Report the validation metrics computed in the previous cell.
print("\nModel Evaluation Results:")
print("Validation Set:")
for metric_label, metric_value in (
    ("Root Mean Squared Error (RMSE) =", val_rmse),
    ("R-squared (R²) =", val_r2),
):
    print(metric_label, metric_value)


Model Evaluation Results:
Validation Set:
Root Mean Squared Error (RMSE) = 4927.6391800412175
R-squared (R²) = 0.00806711612677502


# **3.2.2: MLlib RDD-Based Implementation**

## Read data

In [None]:
# Read each CSV as an RDD of raw text lines and drop its header line
# (the header is the first line of the file).
train_lines = sc.textFile("train.csv")
train_header = train_lines.first()
train_rawData = train_lines.filter(lambda line: line != train_header)

# The test split must come from test.csv; it was previously read from
# train.csv by mistake.
test_lines = sc.textFile("test.csv")
test_header = test_lines.first()
test_rawData = test_lines.filter(lambda line: line != test_header)

## Parse and pre-process the data file

In [None]:
def parse(row):
    """Turn one comma-separated data line into a `LabeledPoint`.

    Fields are read by position: 1 = vendor id, 2 = pickup datetime,
    4 = passenger count, 5-8 = pickup/dropoff longitude and latitude,
    9 = store-and-forward flag, 10 = trip duration (the label).

    Feature vector, in order:
        vendor_id, passenger_count, pickup_long, pickup_lat,
        dropoff_long, dropoff_lat, store_fwd (1.0 for 'Y', else 0.0),
        pickup_hour, pickup_dayofweek (1 = Sunday ... 7 = Saturday),
        pickup_month

    The dropoff timestamp is deliberately NOT a feature: it is only known
    after the trip ends and, together with the pickup time, would leak the
    label (trip duration is presumably dropoff minus pickup). The raw pickup
    epoch is replaced by calendar features because `datetime.timestamp()` on a
    naive datetime depends on the machine's local time zone.

    Returns:
        LabeledPoint, or None when the line has too few fields or a field
        cannot be parsed (callers filter the None values out).
    """
    try:
        parts = row.split(",")
        vendor_id = float(parts[1])
        pickup_dt = datetime.datetime.strptime(parts[2].strip(), "%Y-%m-%d %H:%M:%S")
        passenger_count = float(parts[4])
        pickup_long = float(parts[5])
        pickup_lat = float(parts[6])
        dropoff_long = float(parts[7])
        dropoff_lat = float(parts[8])
        store_fwd = 1.0 if parts[9].strip().upper() == 'Y' else 0.0
        trip_duration = float(parts[10])
    except (IndexError, ValueError) as e:
        # Only malformed-row errors are tolerated; anything else is a real bug
        # and is allowed to propagate.
        print(e)
        return None

    # isoweekday() is Monday=1 ... Sunday=7; shift to Sunday=1 ... Saturday=7.
    pickup_dayofweek = float(pickup_dt.isoweekday() % 7 + 1)
    features = [vendor_id, passenger_count,
                pickup_long, pickup_lat, dropoff_long, dropoff_lat, store_fwd,
                float(pickup_dt.hour), pickup_dayofweek, float(pickup_dt.month)]

    return LabeledPoint(trip_duration, features)

# Parse every line, discard the rows parse() rejected (returned as None), and
# cache the result for reuse by later actions.
train_parsed = (
    train_rawData
    .map(parse)
    .filter(lambda point: point is not None)
    .cache()
)
test_parsed = (
    test_rawData
    .map(parse)
    .filter(lambda point: point is not None)
    .cache()
)

## Train-val split

In [None]:
# 80/20 train/validation split of the parsed points; seeded for reproducibility.
(train_data, val_data) = train_parsed.randomSplit(weights=[0.8, 0.2], seed=42)

## Train the Decision Tree Regressor model using MLlib

Train the Decision Tree model with the chosen parameters

In [None]:
# Hyper-parameters of the RDD-based regression tree. The empty
# categoricalFeaturesInfo means every feature is treated as continuous.
tree_params = dict(
    categoricalFeaturesInfo={},
    impurity="variance",
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=10,
)
dt_model = DecisionTree.trainRegressor(train_data, **tree_params)

Make predictions on validation

In [None]:
# Tính toán dự đoán và nhãn (trên driver)
predictions = dt_model.predict(val_data.map(lambda x: x.features))

# Chuyển thành list để tính toán thủ công (với tập nhỏ)
labelsAndPredictions = val_data.map(lambda lp: lp.label).zip(predictions)


Model Evaluation Results:
Validation Set:
Root Mean Squared Error (RMSE) = 3208.8514205671545
R-squared (R²) = -0.02246461142531264


## Model evaluation

In [None]:
def evaluate_metrics(labelsAndPredictions):
    """Compute RMSE and R² from an RDD of (label, prediction) pairs.

    The sums are accumulated in a single distributed pass with `fold`, whose
    zero value makes an empty RDD yield a count of 0. The previous `reduce`
    raised on an empty RDD, so the `n == 0` guard could never run.

    Args:
        labelsAndPredictions: RDD-like object exposing `map` and `fold`, whose
            elements are (label, prediction) numeric pairs.

    Returns:
        dict with keys "RMSE" and "R2". Both are 0 for empty input; "R2" is
        0.0 when the labels have zero variance (R² is undefined there).
    """
    # Per-record contribution: (count, label, squared error, squared label).
    zero = (0, 0.0, 0.0, 0.0)
    n, label_sum, sq_error_sum, sq_label_sum = labelsAndPredictions.map(
        lambda x: (1, x[0], (x[0] - x[1]) ** 2, x[0] ** 2)
    ).fold(
        zero,
        lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]),
    )

    if n == 0:
        return {"RMSE": 0, "R2": 0}

    rmse = (sq_error_sum / n) ** 0.5

    # Total sum of squares via sum(y^2) - n * mean(y)^2.
    mean_label = label_sum / n
    ss_total = sq_label_sum - n * (mean_label ** 2)
    r2 = 1 - (sq_error_sum / ss_total) if ss_total != 0 else 0.0

    return {"RMSE": rmse, "R2": r2}

# Compute the validation metrics for the RDD-based model and report them.
metrics = evaluate_metrics(labelsAndPredictions)

print("\nModel Evaluation Results:")
print("Validation Set:")
for metric_label, metric_key in (
    ("Root Mean Squared Error (RMSE) =", "RMSE"),
    ("R-squared (R²) =", "R2"),
):
    print(metric_label, metrics[metric_key])


Model Evaluation Results:
Validation Set:
Root Mean Squared Error (RMSE) = 3208.8514205671545
R-squared (R²) = -0.02246461142531264


# **Reference**

1. [Spark Document - Decision Tree Regression](https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-regression)

2. [Spark Document - Decision Trees(RDD-based API)](https://spark.apache.org/docs/latest/mllib-decision-tree.html)