# EDA

In [0]:
# Import libraries
from pyspark.sql import SparkSession, functions as F
from functools import reduce
import os
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import glob
import shutil

# Reuse the active Spark session if there is one, otherwise start one named "EDA"
spark = (
    SparkSession.builder
    .appName("EDA")
    .getOrCreate()
)

In [0]:
# Load the combined CSV; the first row is the header and column types are inferred
csv_path = "/Volumes/workspace/mlrutting-3/mlrutting-3/rutting_climate_traffic.csv"

rutting_climate_traffic = spark.read.csv(csv_path, header=True, inferSchema=True)

# Inspect the inferred schema and a small sample of rows
rutting_climate_traffic.printSchema()
rutting_climate_traffic.show(5)

# Report the size of the full dataset (this triggers a Spark job)
row_count = rutting_climate_traffic.count()
print("Number of rows:", row_count)

In [0]:
# 80/20 train/test split; the fixed seed keeps the split repeatable across runs
train_df, test_df = rutting_climate_traffic.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [0]:
# Feature columns used throughout the analysis.
# The first ten are listed explicitly; the remaining ten follow the pattern
# AADTT_VEH_CLASS_<n>_TREND for n = 4..13 and are generated below.
base_columns = [
    "REL_HUM_AVG_AVG",
    "PRECIPITATION",
    "EVAPORATION",
    "PRECIP_DAYS",
    "CLOUD_COVER_AVG",
    "SHORTWAVE_SURFACE_AVG",
    "TEMP_AVG",
    "FREEZE_INDEX",
    "FREEZE_THAW",
    "WIND_VELOCITY_AVG",
]
aadtt_trend_columns = [f"AADTT_VEH_CLASS_{n}_TREND" for n in range(4, 14)]

features = base_columns + aadtt_trend_columns

print(f"Number of features: {len(features)}")

# Training rows restricted to the feature columns only
feature_df = train_df.select(*features)

In [0]:
# Convert to Pandas
feature_pd = feature_df.toPandas()

# The OLS fits behind each VIF cannot handle NaN or non-numeric values, so coerce
# every column to numeric and keep only complete rows.
vif_input = feature_pd.apply(pd.to_numeric, errors="coerce").dropna()
print(f"Rows used for VIF: {len(vif_input)} of {len(feature_pd)}")

# variance_inflation_factor does not add an intercept on its own. Without one the
# auxiliary regressions are forced through the origin, which overstates the VIF of
# any feature that is not centered at zero. Append a constant column as the LAST
# column so that indices 0..k-1 still refer to the features.
design = vif_input.assign(const=1.0).to_numpy(dtype=float)

# Compute VIF for the features only (the constant's own VIF is not meaningful)
vif_data = pd.DataFrame({
    "feature": vif_input.columns,
    "VIF": [
        variance_inflation_factor(design, i)
        for i in range(vif_input.shape[1])
    ],
})

print(vif_data.sort_values(by="VIF", ascending=False))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bring the features and the target column into pandas together
corr_columns = features + ["MAX_MEAN_DEPTH_1_8"]
corr_df = train_df.select(corr_columns).toPandas()

# Pairwise correlations between all selected columns (pandas default method)
corr_matrix = corr_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Matrix Including Rutting Depth")
plt.show()

In [0]:
import math

# Pull the features and the target from the training split into pandas
plot_vars = features + ["MAX_MEAN_DEPTH_1_8"]
boxplot_df = train_df.select(plot_vars).toPandas()

# Coerce every column to numeric; values that cannot be parsed become NaN
boxplot_df = boxplot_df.apply(pd.to_numeric, errors="coerce")

# Grid geometry: fixed number of columns, as many rows as needed
n_features = len(plot_vars)
n_cols = 4
n_rows = math.ceil(n_features / n_cols)

# One axes per variable, flattened so it can be walked in order
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, n_rows * 3))
axes = axes.flatten()

# One boxplot per variable, each on its own axes with its own y-scale
for ax, var in zip(axes, plot_vars):
    sns.boxplot(y=boxplot_df[var], ax=ax, color="skyblue", fliersize=1)
    ax.set_title(var, fontsize=10)
    ax.set_ylabel("Value")

# Remove the trailing axes that did not receive a plot
for spare_ax in axes[n_features:]:
    fig.delaxes(spare_ax)

# Leave headroom at the top of the figure for the overall title
fig.suptitle("Boxplots of Variable Distributions", fontsize=14, y=0.995)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

In [0]:
# Pull the features and the target from the training split into pandas
cols = features + ["MAX_MEAN_DEPTH_1_8"]
df_pd = train_df.select(cols).toPandas()

# Coerce every column to numeric; values that cannot be parsed become NaN
df_pd = df_pd.apply(pd.to_numeric, errors="coerce")

# Reshape to long form: one row per (observation, feature), keeping the target
# alongside so each facet can plot feature value against it
melted = df_pd.melt(
    id_vars=["MAX_MEAN_DEPTH_1_8"],
    value_vars=features,
    var_name="Feature",
    value_name="Value",
)

# One facet per feature, wrapped at 4 facets per row, with independent axes
facet_options = dict(
    col="Feature",
    col_wrap=4,
    sharex=False,
    sharey=False,
    height=3.5,
)
g = sns.FacetGrid(melted, **facet_options)

g.map(sns.scatterplot, "Value", "MAX_MEAN_DEPTH_1_8", alpha=0.5)
g.set_titles("{col_name}")
g.set_axis_labels("Feature Value", "Rutting Depth (mm)")

# Make room above the facets for the overall title
plt.subplots_adjust(top=0.95, hspace=0.5)
g.fig.suptitle("Scatterplots of Features vs. Rutting Depth", fontsize=14)
plt.show()

In [0]:
# Select relevant columns
cols = features + ["MAX_MEAN_DEPTH_1_8"]
df = train_df.select(cols)

# Cast every column to double once, so the quantiles and the min/max are all
# computed on the same numeric view (approxQuantile rejects non-numeric columns).
df_numeric = df.select([F.col(name).cast("double").alias(name) for name in cols])

# Approximate 25th/50th/75th percentiles for ALL columns in a single call
# (relative error 0.01). The result is one list per column, in `cols` order.
quantiles = df_numeric.approxQuantile(cols, [0.25, 0.5, 0.75], 0.01)

# Min and max for ALL columns in a single aggregation. Aliases are index-based
# so they never clash with, or depend on, the actual column names.
extrema = df_numeric.agg(
    *[F.min(name).alias(f"min_{idx}") for idx, name in enumerate(cols)],
    *[F.max(name).alias(f"max_{idx}") for idx, name in enumerate(cols)],
).first()


def _round3(value):
    """Return `value` rounded to 3 decimals as a float; pass None through."""
    return None if value is None else float(round(value, 3))


# One (Feature, Median, IQR, Min, Max) tuple per column
stats = []
for idx, name in enumerate(cols):
    # approxQuantile yields an empty list for a column with no non-null values
    q25, q50, q75 = quantiles[idx] or (None, None, None)
    iqr = None if q25 is None else q75 - q25
    stats.append((
        name,
        _round3(q50),
        _round3(iqr),
        _round3(extrema[f"min_{idx}"]),
        _round3(extrema[f"max_{idx}"]),
    ))

# Explicit schema: fixes the column order and types instead of relying on
# inference, and allows null statistics for columns that have no data.
stats_df = spark.createDataFrame(
    stats,
    schema="Feature string, Median double, IQR double, Min double, Max double",
)

# Display neatly
stats_df.show(len(cols), truncate=False)