# EDA

In [0]:
# Import libraries
from pyspark.sql import SparkSession, functions as F
from functools import reduce
import os
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import glob
import shutil

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-running session when there is one, otherwise it starts a new
# session for the application named "EDA".
spark = (
    SparkSession.builder
    .appName("EDA")
    .getOrCreate()
)

In [0]:
# Path of the input CSV file
csv_path = "/Volumes/workspace/mlrutting-3/mlrutting-3/rutting_climate_traffic.csv"

# Load the CSV into a Spark DataFrame: the first row is treated as the header
# and column types are inferred from the data rather than read as strings.
rutting_climate_traffic = spark.read.csv(csv_path, header=True, inferSchema=True)

# Sanity check: print the inferred schema and preview the first five rows
rutting_climate_traffic.printSchema()
rutting_climate_traffic.show(5)

In [0]:
# Predictor columns used throughout the analysis: ten explicitly named
# columns followed by AADTT_VEH_CLASS_<n>_TREND for n = 4 through 13.
features = [
    "REL_HUM_AVG_AVG", "PRECIPITATION", "EVAPORATION", "PRECIP_DAYS",
    "CLOUD_COVER_AVG", "SHORTWAVE_SURFACE_AVG", "TEMP_AVG", "FREEZE_INDEX",
    "FREEZE_THAW", "WIND_VELOCITY_AVG",
    # range(4, 14) stops before 14, so this yields classes 4..13 in order
    *(f"AADTT_VEH_CLASS_{vehicle_class}_TREND" for vehicle_class in range(4, 14)),
]

# Project the source DataFrame down to just the feature columns
numeric_df = rutting_climate_traffic.select(*features)

In [0]:
# Convert to Pandas
numeric_pd = numeric_df.toPandas()

# Prepare the VIF input separately so numeric_pd itself stays untouched:
# the OLS fits behind the VIF cannot handle missing values, so rows with any
# missing feature are dropped, and everything is cast to float so the design
# matrix is a plain numeric array (a non-numeric column fails loudly here).
vif_input = numeric_pd.dropna().astype(float)
if vif_input.empty:
    raise ValueError("No complete rows left after dropping missing values; cannot compute VIF.")

dropped_rows = len(numeric_pd) - len(vif_input)
if dropped_rows:
    print(f"VIF: dropped {dropped_rows} of {len(numeric_pd)} rows containing missing values")

# variance_inflation_factor does not add an intercept on its own. Without a
# constant column each auxiliary regression is forced through the origin,
# which yields uncentered (typically inflated) VIFs. The intercept is
# appended as the LAST column so indices 0..k-1 still map to the features.
vif_design = vif_input.assign(_intercept=1.0).to_numpy()

# Compute VIF for the real features only (the intercept column is skipped)
vif_data = pd.DataFrame({
    "feature": vif_input.columns,
    "VIF": [
        variance_inflation_factor(vif_design, feature_idx)
        for feature_idx in range(vif_input.shape[1])
    ],
})

print(vif_data.sort_values(by="VIF", ascending=False))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bring the feature columns plus the MAX_MEAN_DEPTH_1_8 target into pandas
corr_df = rutting_climate_traffic.select([*features, "MAX_MEAN_DEPTH_1_8"]).toPandas()

# Pairwise correlations between every selected column (pandas' default method)
corr_matrix = corr_df.corr()

# Draw the annotated heatmap on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Matrix Including Rutting Depth")
plt.show()