In [4]:
import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable so that `preprocessing.preprocessing` can be found.
# `__file__` is not defined inside a notebook kernel, so the location is derived
# from the current working directory instead of the previous
# abspath("__file__") trick, which resolved to the same directory implicitly.
PROJECT_ROOT = os.path.dirname(os.getcwd())
# Guard against duplicate entries when the cell is executed more than once.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
import numpy as np
import matplotlib.pyplot as plt

# Obtain the Spark session used by every cell below (application name "Modelling_1").
spark = (
    SparkSession.builder
    .appName("Modelling_1")
    .getOrCreate()
)
# Restrict Spark's own logging to ERROR so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

In [5]:
from preprocessing.preprocessing import preprocess_data

# Build the train/test DataFrames via the project's preprocessing pipeline.
df_train, df_test = preprocess_data()

# Preview the first five training rows: assembled feature vector plus the raw target.
print("Training data sample:")
(
    df_train
    .select("features", "Accident_Severity")
    .show(n=5, truncate=False)
)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/m4/Desktop/school/4/TSVD/assignment/Big-Data-Processing/data.tmp/CarAccidents/Accidents.csv.

In [None]:
# Encode the target column as a numeric `label` column.
# The indexer is fitted ONCE, on the training split, and that single fitted
# model is applied to both splits. StringIndexer orders categories by
# frequency by default, so fitting it separately on the test split could map
# the same severity to a different index than the one the models were trained on.
labelIndexer = StringIndexer(inputCol="Accident_Severity", outputCol="label")
labelIndexerModel = labelIndexer.fit(df_train)

# `drop` is a no-op when the column is absent; it keeps this cell re-runnable
# after `label` has already been added by a previous execution.
df_train = labelIndexerModel.transform(df_train.drop("label"))
df_test = labelIndexerModel.transform(df_test.drop("label"))

In [None]:
def _safe_divide(numerator, denominator):
    """Divide element-wise, yielding 0.0 wherever the denominator is 0."""
    numerator, denominator = np.broadcast_arrays(
        np.asarray(numerator, dtype=float), np.asarray(denominator, dtype=float)
    )
    quotient = np.zeros(numerator.shape, dtype=float)
    np.divide(numerator, denominator, out=quotient, where=denominator != 0)
    return quotient


def _matthews_corrcoef(confusion_matrix):
    """Return the Matthews correlation coefficient of a square confusion matrix.

    Uses the multiclass formulation, which reduces to the usual
    (TP*TN - FP*FN) / sqrt(...) expression for a 2x2 matrix. Returns 0.0 when
    the denominator vanishes (e.g. every sample predicted as one class).
    """
    total = confusion_matrix.sum()
    true_counts = confusion_matrix.sum(axis=1)       # samples per actual class
    predicted_counts = confusion_matrix.sum(axis=0)  # samples per predicted class
    numerator = np.trace(confusion_matrix) * total - np.dot(true_counts, predicted_counts)
    denominator = np.sqrt(
        (total ** 2 - np.dot(predicted_counts, predicted_counts))
        * (total ** 2 - np.dot(true_counts, true_counts))
    )
    return float(numerator / denominator) if denominator != 0 else 0.0


def evaluate_model(model, data, model_name):
    """Evaluate model performance with confusion matrix and metrics.

    The confusion matrix is built from a single
    ``groupBy("label", "prediction").count()`` aggregation and every metric is
    derived from it with numpy. This replaces the previous use of the legacy
    ``MulticlassMetrics`` wrapper, whose argument-less ``precision()`` /
    ``recall()`` / ``fMeasure()`` calls are not available in Spark 3 and whose
    Python class does not expose ``labels``.

    Parameters
    ----------
    model : fitted transformer
        Object whose ``transform(data)`` returns a DataFrame containing the
        numeric columns ``label`` and ``prediction``.
    data : DataFrame
        Dataset to score; must already contain the ``label`` column.
    model_name : str
        Name used in the printed report headings.

    Returns
    -------
    dict
        ``precision``, ``recall`` and ``f1`` (per-class values averaged with
        weights proportional to the number of true samples of each class),
        ``mcc`` (Matthews correlation coefficient), ``confusion_matrix``
        (numpy array, rows = true class, columns = predicted class) and
        ``labels`` (sorted class values indexing both axes of the matrix).
    """
    predictions = model.transform(data)

    # Aggregate on the cluster; only one (label, prediction, count) row per
    # non-empty confusion-matrix cell is brought back to the driver.
    cell_counts = [
        (float(label), float(prediction), int(count))
        for label, prediction, count in predictions.groupBy("label", "prediction").count().collect()
    ]

    # Classes are every value seen as either a true label or a prediction.
    labels = sorted({value for label, prediction, _ in cell_counts for value in (label, prediction)})
    position = {label: index for index, label in enumerate(labels)}
    confusion_matrix = np.zeros((len(labels), len(labels)))
    for label, prediction, count in cell_counts:
        confusion_matrix[position[label], position[prediction]] += count

    # Print contingency table (confusion matrix)
    print(f"\n--- {model_name} Confusion Matrix ---")
    print("True \\ Predicted" + "".join(f"\t{int(label)}" for label in labels))
    for label, row in zip(labels, confusion_matrix):
        print(f"{int(label)}" + "".join(f"\t{int(cell)}" for cell in row))

    # Per-class precision / recall / F1 (0.0 where undefined), then the
    # support-weighted average across classes.
    correct = np.diag(confusion_matrix)
    true_counts = confusion_matrix.sum(axis=1)
    predicted_counts = confusion_matrix.sum(axis=0)
    class_precision = _safe_divide(correct, predicted_counts)
    class_recall = _safe_divide(correct, true_counts)
    class_f1 = _safe_divide(2 * class_precision * class_recall, class_precision + class_recall)
    weights = _safe_divide(true_counts, confusion_matrix.sum())

    precision = float(np.dot(weights, class_precision))
    recall = float(np.dot(weights, class_recall))
    f1Score = float(np.dot(weights, class_f1))
    mcc = _matthews_corrcoef(confusion_matrix)

    # Print metrics
    print(f"\n--- {model_name} Evaluation Metrics ---")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1Score:.4f}")
    print(f"MCC: {mcc:.4f}\n")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1Score,
        "mcc": mcc,
        "confusion_matrix": confusion_matrix,
        "labels": labels,
    }

In [None]:
# 1. Decision Tree Model
# Fit a depth-limited tree on the training split and score it on the test split.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=5,
)
dt_model = dt.fit(df_train)
dt_metrics = evaluate_model(dt_model, df_test, "Decision Tree")

# Report one importance value per feature-vector slot, when the model exposes them.
if hasattr(dt_model, "featureImportances"):
    importances = dt_model.featureImportances
    print("Feature Importances:")
    for i, importance in enumerate(importances):
        print(f"Feature {i}: {importance}")

# 2. Linear SVM Model
# NOTE(review): LinearSVC is a binary classifier -- confirm that `label` has
# exactly two classes after preprocessing (otherwise consider OneVsRest).
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.1,
)
lsvc_model = lsvc.fit(df_train)
lsvc_metrics = evaluate_model(lsvc_model, df_test, "Linear SVM")

In [None]:
# Compare models: one table row per model, same columns as the per-model reports.
print("\nModel Comparison:")
print("Model\t\tPrecision\tRecall\t\tF1 Score\tMCC")
for model_label, model_metrics in (("Decision Tree", dt_metrics), ("Linear SVM", lsvc_metrics)):
    print(
        f"{model_label}\t{model_metrics['precision']:.4f}\t\t{model_metrics['recall']:.4f}"
        f"\t\t{model_metrics['f1']:.4f}\t\t{model_metrics['mcc']:.4f}"
    )

# Save models next to the notebook's parent directory.
models_dir = os.path.join("..", "models")
os.makedirs(models_dir, exist_ok=True)

# `write().overwrite()` replaces a model saved by an earlier run; a plain
# `save(path)` raises when the target path already exists, which made this cell
# fail on every execution after the first.
dt_model_path = os.path.join(models_dir, "decision_tree_model")
dt_model.write().overwrite().save(dt_model_path)
print(f"Decision Tree model saved to {dt_model_path}")

lsvc_model_path = os.path.join(models_dir, "linear_svm_model")
lsvc_model.write().overwrite().save(lsvc_model_path)
print(f"Linear SVM model saved to {lsvc_model_path}")

# Plot and save one confusion-matrix heat map per model.
# A missing matrix is reported explicitly instead of being hidden behind a
# broad `except Exception`, so genuine plotting errors are no longer swallowed.
for model_label, model_metrics, file_name in (
    ("Decision Tree", dt_metrics, "dt_confusion_matrix.png"),
    ("Linear SVM", lsvc_metrics, "svm_confusion_matrix.png"),
):
    matrix = model_metrics.get("confusion_matrix")
    if matrix is None:
        print(
            f"Couldn't plot confusion matrices: the {model_label} metrics "
            "contain no 'confusion_matrix' entry"
        )
        continue

    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(f"{model_label} Confusion Matrix")
    ax.set_xlabel("Predicted class index")
    ax.set_ylabel("True class index")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()
    fig.savefig(os.path.join(models_dir, file_name))

# Render whatever figures were created in the notebook output.
plt.show()