In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, 
    GBTClassifier, NaiveBayes, LinearSVC
)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import findspark
from pyspark.mllib.evaluation import MulticlassMetrics
import numpy as np
import pandas as pd
import shap

In [2]:
findspark.init()

In [34]:
spark = SparkSession.builder.appName("ClassificationModels").getOrCreate()

In [35]:
spark

In [5]:
df = spark.read.csv(r"C:\Users\Robyi\Documents\Data Science Dataset\diabetes.csv", header = True, inferSchema = True)

In [6]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [12]:
df = df.dropDuplicates()

In [13]:
df = df.dropna(how="any")

In [14]:
assembler = VectorAssembler(inputCols=[
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness","Insulin","BMI","DiabetesPedigreeFunction"], 
                            outputCol="features")
df = assembler.transform(df)

In [15]:
MinMax = MinMaxScaler(inputCol="features", outputCol="features_scaled")
df = MinMax.fit(df).transform(df)

In [16]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|     features_scaled|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|          4|    129|           60|           12|    231|27.5|                   0.527| 31|      0|[4.0,129.0,60.0,1...|[0.23529411764705...|
|          2|    105|           58|           40|     94|34.9|                   0.225| 25|      0|[2.0,105.0,58.0,4...|[0.11764705882352...|
|          3|    129|           64|           29|    115|26.4|                   0.219| 28|      1|[3.0,129.0,64.0,2...|[0.17647058823529...|
|          7|    136|           90|            0|      0|29.9|                    0.21| 50|      0|[7.0,136.0,90.0,0...|[0.41176470588235...|
|     

In [17]:
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [18]:
evaluator = BinaryClassificationEvaluator(labelCol="Outcome", metricName="areaUnderROC")

In [19]:
lr = LogisticRegression(featuresCol="features_scaled", labelCol="Outcome")

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
lr_model = crossval.fit(train)

In [None]:
dt = DecisionTreeClassifier(featuresCol="features_scaled", labelCol="label")

paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [3, 5, 10]) \
    .addGrid(dt.minInstancesPerNode, [1, 2, 4]) \
    .build()

crossval = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
dt_model = crossval.fit(train)

In [None]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 50, 100]) \
    .addGrid(rf.maxDepth, [3, 5, 10]) \
    .build()

crossval = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
rf_model = crossval.fit(train)

In [None]:
gbt = GBTClassifier(featuresCol="features", labelCol="label")

paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [3, 5, 10]) \
    .addGrid(gbt.maxIter, [10, 50, 100]) \
    .build()

crossval = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
gbt_model = crossval.fit(train)

In [None]:
nb = NaiveBayes(featuresCol="features", labelCol="label")

paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.1, 1.0, 10.0]) \
    .build()

crossval = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
nb_model = crossval.fit(train)

In [None]:
svc = LinearSVC(featuresCol="features", labelCol="label")

paramGrid = ParamGridBuilder() \
    .addGrid(svc.regParam, [0.01, 0.1, 1.0]) \
    .build()

crossval = CrossValidator(estimator=svc, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
svc_model = crossval.fit(train)

In [24]:
best_model = lr_model.bestModel
predictions = best_model.transform(test)

In [25]:
predictions.select("features_scaled", "Outcome", "prediction", "probability").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+-------+----------+-----------------------------------------+
|features_scaled                                                                                                                           |Outcome|prediction|probability                              |
+------------------------------------------------------------------------------------------------------------------------------------------+-------+----------+-----------------------------------------+
|(7,[1,5,6],[0.36683417085427134,0.31445603576751124,0.11272416737830913])                                                                 |0      |0.0       |[0.9523229188426486,0.047677081157351364]|
|[0.0,0.4221105527638191,0.6721311475409837,0.31313131313131315,0.14775413711583923,0.5692995529061103,0.06618274978650726]                |0      |0.0       |[0.9197619198782415,0.08023808012

In [None]:
predictions.select("probability").getItem(1).alias("prob_positive")).show()

In [27]:
predictionAndLabels = predictions.select("prediction", "Outcome").rdd.map(tuple)

In [38]:
print(predictionAndLabels.take(5))

[(0.0, 0), (0.0, 0), (0.0, 0), (0.0, 0), (0.0, 0)]


In [39]:
predictionAndLabels = predictions.select(
    col("prediction").cast("double"),
    col("Outcome").cast("double")
).rdd.map(tuple)

In [40]:
print(predictionAndLabels.take(15))

[(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 1.0)]


In [59]:
metrics = MulticlassMetrics(predictionAndLabels)



In [43]:
metrics.accuracy

0.8130081300813008

In [44]:
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

Confusion Matrix:
[[73.  7.]
 [16. 27.]]


In [48]:
precision = metrics.precision(1.0)
recall = metrics.recall(1.0) 
f1_score = metrics.fMeasure(1.0)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

Precision: 0.7941
Recall: 0.6279
F1-Score: 0.7013


In [49]:
auc = evaluator.evaluate(prediction)
print(f"ROC AUC Score: {auc:.4f}")

ROC AUC Score: 0.8637


In [52]:
cv_results = lr_model.avgMetrics
print(f"Cross-validation Scores: {cv_results}")
print(f"Mean Cross-validation Score: {np.mean(cv_results):.4f}")

Cross-validation Scores: [0.825507035258046, 0.8254788948404921, 0.8252876535144436, 0.822364453025, 0.8163756417892095, 0.7945645731086121, 0.8112570978264981, 0.5, 0.5]
Mean Cross-validation Score: 0.7468


In [None]:
feature_importances = best_model.featureImportances
features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness","Insulin","BMI","DiabetesPedigreeFunction"]

importance_df = pd.DataFrame(list(zip(features, feature_importances)), columns=["Feature", "Importance"])
importance_df = importance_df.sort_values(by="Importance", ascending=False)

print(importance_df)

In [56]:
best_model = lr_model.bestModel  

coefficients = best_model.coefficients
intercept = best_model.intercept

feature_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness","Insulin","BMI","DiabetesPedigreeFunction"]

for i, coef in enumerate(coefficients):
    print(f"{feature_names[i]}: {coef:.4f}")

Pregnancies: 2.3380
Glucose: 6.1165
BloodPressure: -1.2494
SkinThickness: -0.2320
Insulin: -0.6968
BMI: 5.2839
DiabetesPedigreeFunction: 2.4485
