In [0]:
%pyspark



from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit


In [1]:
%pyspark

# Flag kept from the original cell; nothing in this notebook reads it.
# NOTE(review): presumably toggles behaviour when run via spark-submit — confirm or remove.
IS_SPARK_SUBMIT_CLI = False

from pyspark import SparkContext

# Reuse the context the interpreter already started (or create one).
sc = SparkContext.getOrCreate()

# Obtain the session through the builder rather than calling the
# SparkSession constructor directly: the builder returns the already-active
# session when one exists, so re-running this cell does not spawn a second
# session object on top of the same context.
spark = SparkSession.builder.getOrCreate()


In [2]:
%pyspark
# 1. Load and preview data
# Read the sampled used-cars CSV; the first row holds the column names and
# column types are inferred from the data.
file_path = "/user/apang5/used_cars_sample_data--01percent.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Quick sanity check: inferred schema plus the first five rows.
df.printSchema()
df.show(5)


In [3]:
%pyspark
# 2. Feature selection & cleaning
from pyspark.sql.functions import col

# Keep only the columns used downstream: numeric predictors, the `price`
# label, and the two name columns that are indexed in the next step.
data = df.select(
    "city_fuel_economy","highway_fuel_economy","daysonmarket",
    "engine_displacement","horsepower","mileage","seller_rating",
    "year","price","make_name","model_name",
    "torque","engine_cylinders","power","wheelbase","width"
)

# Cast every numeric column to an explicit type so the result does not depend
# on what schema inference guessed.
# NOTE(review): under Spark's default cast semantics a value that cannot be
# parsed as a number becomes NULL — confirm the five "new" columns below are
# plain numbers in the CSV and not text such as "<value> <unit>".
for name, dtype in [
    ("city_fuel_economy", "double"),
    ("highway_fuel_economy","double"),
    ("daysonmarket","int"),
    ("engine_displacement","double"),
    ("horsepower","double"),
    ("mileage","double"),
    ("seller_rating","double"),
    ("year","int"),
    ("price","double"),
    ("torque","double"),
    ("engine_cylinders","double"),
    ("power","double"),
    ("wheelbase","double"),
    ("width","double"),
]:
    data = data.withColumn(name, col(name).cast(dtype))

# Rows missing any of these columns (including the label) are dropped.
essential = [
    "city_fuel_economy","highway_fuel_economy","daysonmarket",
    "engine_displacement","horsepower","mileage","seller_rating",
    "year","price"
]
data = data.dropna(subset=essential)

# The remaining numeric columns are imputed with their approximate median
# (relative error 0.001) instead of dropping the row.
new_cols = ["torque","engine_cylinders","power","wheelbase","width"]
impute_map = {}
columns_without_values = []
for c in new_cols:
    medians = data.stat.approxQuantile(c, [0.5], 0.001)
    if medians:
        impute_map[c] = medians[0]
    else:
        # No median could be computed (no usable numeric value in the
        # column). Keep the original 0.0 fallback, but remember the column
        # so the problem is reported instead of passing silently.
        impute_map[c] = 0.0
        columns_without_values.append(c)

if columns_without_values:
    print(
        "WARNING: no numeric values found for "
        f"{columns_without_values}; these columns are filled with 0.0 "
        "and carry no information for the model."
    )

data = data.na.fill(impute_map)

print(f"✅ Rows remaining after cleaning: {data.count()}")
data.printSchema()






In [4]:
%pyspark
# 3. Index categorical features
from pyspark.ml.feature import StringIndexer

# `make_name` / `model_name` are not part of the null filter applied during
# cleaning, so NULLs can reach the indexers. handleInvalid="keep" puts such
# values into one extra bucket instead of raising at transform time.
make_indexer  = StringIndexer(inputCol="make_name",  outputCol="make_indexed",
                              handleInvalid="keep")
model_indexer = StringIndexer(inputCol="model_name", outputCol="model_indexed",
                              handleInvalid="keep")

# Make the cell safe to re-run: if a previous execution already appended the
# index columns to `data`, remove them first (dropping a column that does not
# exist is a no-op), otherwise transform() fails because the output column
# already exists.
data = data.drop(make_indexer.getOutputCol(), model_indexer.getOutputCol())

data = make_indexer.fit(data).transform(data)
data = model_indexer.fit(data).transform(data)




In [5]:
%pyspark

# 4. Assemble + scale
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Model inputs, in vector order: the numeric predictors, the two indexed
# name columns, then the median-imputed columns. The feature-importance cell
# relies on this order to label its output.
feature_columns = (
    ["city_fuel_economy", "highway_fuel_economy", "daysonmarket",
     "engine_displacement", "horsepower", "mileage", "seller_rating", "year"]
    + ["make_indexed", "model_indexed"]
    + ["torque", "engine_cylinders", "power", "wheelbase", "width"]
)

# Pack the individual columns into one vector column.
assembler = VectorAssembler(outputCol="assembled_features", inputCols=feature_columns)
assembled_data = assembler.transform(data)

# Standardise each vector component (centre on the mean, divide by the
# standard deviation).
# NOTE(review): the scaler is fitted on the complete data set, i.e. before
# the train/test split performed in the following cell.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="assembled_features",
    outputCol="features",
)
scaler_model = scaler.fit(assembled_data)
final_data = scaler_model.transform(assembled_data)

# Peek at the first three scaled feature vectors without truncation.
final_data.select("features").show(3, truncate=False)




In [6]:
%pyspark
# 5. Train/test split
# 80 % of the rows go to training and 20 % to the held-out test set; the
# fixed seed keeps the partition repeatable.
train_data, test_data = final_data.randomSplit(weights=[0.8, 0.2], seed=42)


In [7]:

%pyspark
import time
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# 6A. Cross-Validation
# The seed matches the one used for the train/test split so that tree
# bootstrapping is repeatable between runs. `rf` and `paramGrid` are reused
# by the TrainValidationSplit cell that follows.
rf = RandomForestRegressor(featuresCol="features", labelCol="price", seed=42)

# Search grid: a single forest size, two tree depths -> two candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# RMSE is the model-selection metric.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)  # repeatable fold assignment

# Wall-clock time of the 3-fold search over the grid.
cv_start = time.time()
cv_model = crossval.fit(train_data)
cv_end = time.time()
cv_time = cv_end - cv_start

# Score the selected model on the held-out test set.
cv_preds = cv_model.transform(test_data)
cv_rmse = evaluator.evaluate(cv_preds)
# Override the metric for this single call instead of calling
# setMetricName(): `evaluator` is the same object `crossval` holds, and
# mutating it would silently switch the selection metric to R2 for any later
# fit.
cv_r2   = evaluator.evaluate(cv_preds, {evaluator.metricName: "r2"})

print(f"Cross-Validated RMSE: {cv_rmse:.2f}")
print(f"Cross-Validated R2:   {cv_r2:.4f}")
print(f"CV Time: {cv_time:.2f} seconds")



In [8]:

%pyspark
# 6B. TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit
import time

# Separate evaluator objects per metric, so none of them is mutated.
e_rmse = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse"
)
e_r2 = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="r2"
)

# Same estimator and grid as the cross-validation cell (`rf`, `paramGrid`),
# but tuned with one 80/20 train/validation split instead of folds.
tvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=e_rmse,      # candidates are ranked by validation RMSE
    trainRatio=0.8,
    seed=42                # repeatable internal split, matching randomSplit's seed
)

# Wall-clock time of the search, for comparison with the CV timing.
tvs_start = time.time()
tvs_model = tvs.fit(train_data)
tvs_end = time.time()
tvs_time = tvs_end - tvs_start

# Score the selected model on the held-out test set.
tvs_preds = tvs_model.transform(test_data)

tvs_rmse = e_rmse.evaluate(tvs_preds)
tvs_r2   = e_r2.evaluate(tvs_preds)

print(f"TrainValidationSplit RMSE: {tvs_rmse:.2f}")
print(f"TrainValidationSplit R2:   {tvs_r2:.4f}")
print(f"TVS Time: {tvs_time:.2f} seconds")





In [9]:
%pyspark
# 7. Final Random Forest model training
# Fit one forest on the full training split. The seed makes the fitted model
# (and therefore the reported metrics and feature importances) repeatable.
# NOTE(review): numTrees=100 was not part of the grid searched in the tuning
# cells (only 50 trees were tried) — confirm these hyper-parameters are the
# intended ones.
rf_final = RandomForestRegressor(
    featuresCol="features",
    labelCol="price",
    numTrees=100,
    maxDepth=10,
    seed=42,
)
rf_model = rf_final.fit(train_data)


In [10]:
%pyspark
# 8. Final model evaluation
from pyspark.ml.evaluation import RegressionEvaluator

# Predict on the held-out test split with the final forest.
predictions = rf_model.transform(test_data)

# One evaluator per metric; both compare `prediction` against `price`.
rmse_evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="price", predictionCol="prediction"
)
r2_evaluator = RegressionEvaluator(
    metricName="r2", labelCol="price", predictionCol="prediction"
)

final_rmse = rmse_evaluator.evaluate(predictions)
final_r2   = r2_evaluator.evaluate(predictions)

print(f"Final RMSE: {final_rmse:.2f}")
print(f"Final R2:   {final_r2:.4f}")



In [11]:
%pyspark
# 9. Feature importance
# Importance scores of the final forest, paired by position with the column
# names that were assembled into the feature vector.
importances = rf_model.featureImportances.toArray()
for position in range(min(len(feature_columns), len(importances))):
    feature = feature_columns[position]
    score = importances[position]
    print(f"{feature}: {score:.4f}")


In [12]:
%pyspark
# --- Summary of All Results ---
# Repeat the metrics and timings gathered in the earlier cells in one place:
# cross-validation, train/validation split, then the final model.
for summary_line in (
    f"Cross-Validated RMSE: {cv_rmse:.2f}",
    f"Cross-Validated R2:   {cv_r2:.4f}",
    f"CV Time: {cv_time:.2f} seconds",
    f"TrainValidationSplit RMSE: {tvs_rmse:.2f}",
    f"TrainValidationSplit R2:   {tvs_r2:.4f}",
    f"TVS Time: {tvs_time:.2f} seconds",
    f"Final RMSE: {final_rmse:.2f}",
    f"Final R2:   {final_r2:.4f}",
):
    print(summary_line)
