## CIS5560: Final Term Project -> Logistic Regression

### Andrew Pang (apang5@calstatela.edu)

In [1]:
%pyspark
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import regexp_extract, col, when
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
import time

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Configure Settings for spark-submit

In [3]:
%pyspark
# Set to True when this notebook is exported as plain Python source code and
# run with spark-submit. In that mode no interpreter supplies `sc` / `spark`,
# so they are created here; otherwise the notebook environment is expected to
# provide them already.
IS_SPARK_SUBMIT_CLI = False

if IS_SPARK_SUBMIT_CLI:
    sc = SparkContext.getOrCreate()
    # Go through the builder instead of calling the SparkSession constructor
    # directly: getOrCreate() returns the active session if one exists and
    # otherwise creates one on top of the context obtained above.
    spark = SparkSession.builder.getOrCreate()

# Import and Parse the Dataset

In [5]:
%pyspark
# Path and format of the input dataset. The 1% sample is the active input;
# the full dataset lives at "/user/apang5/used_cars_data.csv".
file_location = "/user/apang5/used_cars_sample_data--01percent.csv"
file_type = "csv"

# Reader settings. They only apply to CSV input; other formats ignore them.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Collect the settings in one mapping and hand them to the reader in one call.
csv_options = {
    "inferSchema": infer_schema,
    "header": first_row_is_header,
    "sep": delimiter,
}
csv_reader = spark.read.format(file_type).options(**csv_options)
df = csv_reader.load(file_location)

# Quick look at the first rows.
df.show()

In [6]:
%pyspark
# Re-read the same file with explicit quote/escape characters, multi-line
# record support and PERMISSIVE mode. This replaces the DataFrame loaded
# by the previous cell.
safe_csv_options = dict(
    header="true",
    inferSchema="true",
    sep=",",
    quote="\"",
    escape="\"",
    multiLine="true",
    mode="PERMISSIVE",
)
df = spark.read.format(file_type).options(**safe_csv_options).load(file_location)

# Inspect the inferred schema and a few untruncated rows.
df.printSchema()
df.show(5, truncate=False)

# Feature Handling

In [8]:
%pyspark
# -----------------------------------------
# 1. Data Preparation (cleaning)
# -----------------------------------------
# Keep the label (has_accidents) together with the numeric and categorical
# columns that are cleaned below.
data = df.select("city_fuel_economy", "highway_fuel_economy", "daysonmarket",
                 "engine_displacement", "horsepower", "mileage", "seller_rating",
                 "year", "price", "engine_cylinders", "torque", "power",
                 "front_legroom", "wheelbase", "width", "body_type", "has_accidents",
                 'fuel_type', 'transmission', 'make_name', 'model_name', 'exterior_color',
                 'interior_color', 'dealer_zip', 'franchise_make', 'wheel_system')

# Columns that only need a plain type cast.
plain_casts = {
    "city_fuel_economy": "double",
    "highway_fuel_economy": "double",
    "daysonmarket": "int",
    "engine_displacement": "double",
    "horsepower": "double",
    "mileage": "double",
    "seller_rating": "double",
    "year": "int",
    "price": "double",
}
for col_name, target_type in plain_casts.items():
    data = data.withColumn(col_name, col(col_name).cast(target_type))

# Text columns from which the first run of digits is taken as the value.
for col_name in ["engine_cylinders", "torque", "power"]:
    data = data.withColumn(col_name, regexp_extract(col_name, "(\\d+)", 1).cast("double"))

# Dimension columns: take the first number, with an OPTIONAL fractional part.
# The earlier pattern required a decimal point, so whole-number values
# (e.g. "42 in") did not match, were cast to null, and the entire row was then
# removed by dropna() below.
for col_name in ["front_legroom", "wheelbase", "width"]:
    data = data.withColumn(col_name, regexp_extract(col_name, "(\\d+(?:\\.\\d+)?)", 1).cast("double"))

# Label column as an integer.
data = data.withColumn("has_accidents", col("has_accidents").cast("int"))

# Drop every row that still has a null in any selected column, then show the
# class balance of the label.
data = data.dropna()
data.groupBy("has_accidents").count().show()

In [9]:
%pyspark
# -----------------------------------------
# Feature Preparation
# -----------------------------------------
# Numeric input columns (assembled into a single vector and scaled later on).
numeric_features = [
    'city_fuel_economy',
    'highway_fuel_economy',
    'engine_displacement',
    'horsepower',
    'seller_rating',
    'year',
    'mileage',
    'engine_cylinders',
    'torque',
    'power',
    'front_legroom',
    'wheelbase',
    'width',
]

# Same column names as numeric_features, held in a separate list object.
feature_cols = list(numeric_features)

# String-valued input columns (indexed and one-hot encoded later on).
categorical_features = [
    'body_type',
    'fuel_type',
    'transmission',
    'make_name',
    'model_name',
    'exterior_color',
    'interior_color',
    'dealer_zip',
    'franchise_make',
    'wheel_system',
]

In [10]:
%pyspark
# === Categorical columns: string index, then one-hot encode ===
# For a column "x" the indexer writes "x_idx" and the encoder turns that
# into "x_vec". The indexer uses handleInvalid="keep".
indexers = []
encoders = []
encoded_features = []
for feature_name in categorical_features:
    index_column = f"{feature_name}_idx"
    vector_column = f"{feature_name}_vec"
    indexers.append(
        StringIndexer(inputCol=feature_name, outputCol=index_column, handleInvalid="keep")
    )
    encoders.append(OneHotEncoder(inputCol=index_column, outputCol=vector_column))
    encoded_features.append(vector_column)

# === Numeric columns: assemble into one vector, then min-max scale ===
num_assembler = VectorAssembler(inputCols=numeric_features, outputCol="num_features")
scaler = MinMaxScaler(inputCol="num_features", outputCol="scaled_features")

# === Combine the encoded vectors and the scaled numerics into "features" ===
all_feature_inputs = encoded_features + ["scaled_features"]
final_assembler = VectorAssembler(inputCols=all_feature_inputs, outputCol="features")



# Split the Dataset

In [12]:
%pyspark

# Fixed seed for the 70/30 split so that repeated runs over the same input
# use the same random sampling instead of a new one each time, which keeps
# the metrics reported further down comparable between runs.
SPLIT_SEED = 42

splits = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train, test = splits

# Row counts of each partition (also reused by the summary at the end).
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

# Build the Model

In [14]:
%pyspark

# === Classifier ===
# Logistic regression reading the assembled "features" vector and predicting
# the has_accidents label.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="has_accidents",
    maxIter=10,
    regParam=0.3,
)

# === Pipeline ===
# Stage order: all indexers, all encoders, numeric assembler, scaler,
# final assembler, and the classifier last.
preprocessing_stages = [*indexers, *encoders, num_assembler, scaler, final_assembler]
pipeline_lr = Pipeline(stages=preprocessing_stages + [lr])


# Build the ParamGridBuilder and the CrossValidator

In [16]:
%pyspark
# Hyperparameter grid: two candidate values each for regParam,
# elasticNetParam and maxIter.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.01, 0.5])
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5])
grid_builder = grid_builder.addGrid(lr.maxIter, [1, 5])
paramGrid = grid_builder.build()

# Evaluators. AUC is computed from the raw prediction column; the other four
# metrics are computed from the predicted label.
label_column = "has_accidents"
evaluator_auc = BinaryClassificationEvaluator(
    labelCol=label_column,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator_accuracy, evaluator_precision, evaluator_recall, evaluator_f1 = [
    MulticlassClassificationEvaluator(
        labelCol=label_column,
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
]

# 3-fold cross validation over the grid, with AUC as the selection metric.
cv = CrossValidator(
    estimator=pipeline_lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_auc,
    numFolds=3,
)

# Fit on the training split and record the wall-clock duration.
cv_start = time.time()
cvModel = cv.fit(train)
cv_end = time.time()
cv_time = cv_end - cv_start

# Score the held-out test split with the fitted model.
cv_predictions = cvModel.transform(test)

# Compute each metric on the test predictions.
cv_auc = evaluator_auc.evaluate(cv_predictions)
cv_accuracy = evaluator_accuracy.evaluate(cv_predictions)
cv_precision = evaluator_precision.evaluate(cv_predictions)
cv_recall = evaluator_recall.evaluate(cv_predictions)
cv_f1 = evaluator_f1.evaluate(cv_predictions)

# Report the metrics, one per line.
cv_report = (
    ("AUC", cv_auc),
    ("Accuracy", cv_accuracy),
    ("Precision", cv_precision),
    ("Recall", cv_recall),
    ("F1", cv_f1),
)
for metric_label, metric_value in cv_report:
    print(f"Cross-Validated {metric_label} (Logistic Regression): {metric_value:.4f}")

# Build the ParamGridBuilder and the TrainValidationSplit

In [18]:
%pyspark
# This cell reuses the paramGrid built for cross validation. A wider grid that
# could be swapped in instead:
#   regParam        in [0.001, 0.01, 0.1, 1.0]
#   elasticNetParam in [0.0, 0.25, 0.5, 0.75, 1.0]


def _label_metric_evaluator(metric_name):
    """Return an evaluator scoring the predicted label against has_accidents."""
    return MulticlassClassificationEvaluator(
        labelCol="has_accidents",
        predictionCol="prediction",
        metricName=metric_name,
    )


# 1. Evaluators: AUC from the raw prediction column, the rest from the
#    predicted label.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="has_accidents",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator_accuracy = _label_metric_evaluator("accuracy")
evaluator_precision = _label_metric_evaluator("weightedPrecision")
evaluator_recall = _label_metric_evaluator("weightedRecall")
evaluator_f1 = _label_metric_evaluator("f1")

# 2. Train/validation split over the grid (trainRatio=0.8), selecting by AUC.
tvs = TrainValidationSplit(
    estimator=pipeline_lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_auc,
    trainRatio=0.8,
)

# 3. Fit on the training split and record the wall-clock duration.
tvs_start = time.time()
tvsModel = tvs.fit(train)
tvs_end = time.time()
tvs_time = tvs_end - tvs_start

# 4. Score the held-out test split.
tvs_predictions = tvsModel.transform(test)

# 5. Compute each metric on the test predictions.
tvs_auc = evaluator_auc.evaluate(tvs_predictions)
tvs_accuracy = evaluator_accuracy.evaluate(tvs_predictions)
tvs_precision = evaluator_precision.evaluate(tvs_predictions)
tvs_recall = evaluator_recall.evaluate(tvs_predictions)
tvs_f1 = evaluator_f1.evaluate(tvs_predictions)

# 6. Report the metrics, one per line.
tvs_report = {
    "AUC": tvs_auc,
    "Accuracy": tvs_accuracy,
    "Precision": tvs_precision,
    "Recall": tvs_recall,
    "F1": tvs_f1,
}
for metric_label, metric_value in tvs_report.items():
    print(f"Train Validation Split {metric_label} (Logistic Regression): {metric_value:.4f}")

# Summarize the Evaluation Results

In [20]:
%pyspark

# --- Summary of All Results ---
def _print_result_section(title, prefix, auc, accuracy, precision, recall, f1, elapsed):
    """Print one tuning strategy's metrics followed by its elapsed time."""
    print(f"\n--- Summary of {title} Results ---")
    labelled_metrics = (
        ("AUC", auc),
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1", f1),
    )
    for metric_label, metric_value in labelled_metrics:
        print(f"{prefix} {metric_label}: {metric_value:.4f}")
    print(f"{prefix} Time: {elapsed:.2f} seconds")


# Dataset sizes first, then one section per tuning strategy.
print("\n--- Summary of All Results ---")
print(f"Training Rows: {train_rows:,}\nTesting Rows: {test_rows:,}")

_print_result_section(
    "Cross-Validation", "CV",
    cv_auc, cv_accuracy, cv_precision, cv_recall, cv_f1, cv_time,
)
_print_result_section(
    "Train-Validation-Split", "TVS",
    tvs_auc, tvs_accuracy, tvs_precision, tvs_recall, tvs_f1, tvs_time,
)