## Import necessary libraries

In [28]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline

## Create Spark Session

In [2]:
def create_spark():
    """Build the SparkSession used by this notebook.

    The builder is pointed at the "local[*]" master, named "TestSuite",
    and given the fixed configuration entries listed in ``settings`` below.

    Returns:
        The session returned by ``SparkSession.builder...getOrCreate()``.
    """
    # Configuration entries applied to the builder, in this order.
    settings = {
        'spark.sql.shuffle.partitions': '4',
        'spark.default.parallelism': '4',
        'spark.sql.session.timeZone': 'UTC',
        'spark.ui.enabled': 'false',
        'spark.app.id': 'Test',
        'spark.driver.host': 'localhost',
    }

    builder = SparkSession.builder.master("local[*]").appName("TestSuite")
    for option, setting in settings.items():
        builder = builder.config(key=option, value=setting)

    return builder.getOrCreate()

In [3]:
# Obtain the notebook-wide SparkSession from create_spark(); the data-loading
# cell reads the CSV through this `spark` handle.
spark = create_spark()

## Load Hepatitis Dataset

In [29]:
# Location of the CSV file, relative to the notebook's working directory.
path_to_data = '../../dataset/hcvdata.csv'

# Read the CSV with its first line as the header and let Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
hepatit_spark_df = csv_reader.csv(path_to_data)

## Data Preprocessing

In [30]:
# Columns in which the literal string "NA" is replaced before modelling.
numeric_cols = ['ALT', 'ALB', 'ALP', 'CHOL', 'PROT']


def _na_to_null_double(name):
    """Return a Column for `name` cast to double, with the string "NA" mapped to null."""
    raw = F.col(name)
    return F.when(raw == "NA", None).otherwise(raw.cast("double"))


# Rewrite each listed column in place (same column name) with its cleaned version.
for col_name in numeric_cols:
    hepatit_spark_df = hepatit_spark_df.withColumn(col_name, _na_to_null_double(col_name))

# Replace nulls in the listed columns with 0.0.
hepatit_spark_df = hepatit_spark_df.na.fill(0.0, subset=numeric_cols)

In [31]:
# Define stages for the pipeline
# 1. Index the target variable: 'Category' -> numeric 'label'
label_indexer = StringIndexer(inputCol='Category', outputCol='label')

# 2. Index and One-Hot Encode 'Sex'
sex_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')
sex_encoder = OneHotEncoder(inputCols=['Sex_index'], outputCols=['Sex_vec'])

# 3. Assemble features into the single vector column 'features'
features = ['Age', 'Sex_vec', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
assembler = VectorAssembler(inputCols=features, outputCol='features')

# 4. Logistic Regression model
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=100)

# Create the pipeline; stages run in the order listed.
pipeline = Pipeline(stages=[label_indexer,
                            sex_indexer,
                            sex_encoder,
                            assembler,
                            lr])

# Split data into training and test sets (70% / 30%).
# A fixed seed makes the split, and therefore the fitted model and the metrics
# computed from it, repeatable across "Restart Kernel -> Run All" for the same
# input data and partitioning. Without it every run evaluates a different split.
SPLIT_SEED = 42
train, test = hepatit_spark_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Fit the pipeline on the training data
# NOTE(review): both StringIndexers are fitted on `train` only and use the
# default handleInvalid setting, so a 'Category' or 'Sex' value that occurs
# only in `test` would presumably make transform() fail - confirm that every
# class is present in the training split.
pipeline_model = pipeline.fit(train)

# Make predictions on the test data
y_pred = pipeline_model.transform(test)

## Evaluation using MulticlassClassificationEvaluator

In [32]:
# Display name -> metricName passed to MulticlassClassificationEvaluator.
metric_names = {
    "accuracy": "accuracy",
    "precision": "weightedPrecision",
    "recall": "weightedRecall",
    "f1": "f1",
}

# One evaluator per metric, each left on its default label/prediction columns.
metrics = {
    label: MulticlassClassificationEvaluator(metricName=spark_metric)
    for label, spark_metric in metric_names.items()
}

# evaluator.setLabelCol("label").setPredictionCol("prediction") # in case custom columns are used
for name, evaluator in metrics.items():
    score = evaluator.evaluate(y_pred)
    print(name, score)


accuracy 0.9132653061224489
precision 0.9067840732784341
recall 0.913265306122449
f1 0.9055118300621843


## Evaluation using MulticlassMetrics

In [None]:
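# TODO: something is wrong with this part (seems to be the local env); still needs debugging.
# Before re-enabling, note that `MulticlassMetrics` is not imported in the import cell
# (it lives in `pyspark.mllib.evaluation`), and that the RDD must be bound to the same
# name that is passed to MulticlassMetrics (the draft assigned `pl` but passed
# `prediction_and_labels`).
# prediction_and_labels = y_pred.select("prediction", "label").rdd.map(lambda r: (float(r.prediction), float(r.label)))
# metric = MulticlassMetrics(prediction_and_labels)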
#
# print('Accuracy: ', metric.accuracy)
# print('Precision: ', metric.precision(1.0))
# print('Recall: ', metric.recall(1.0))
# print('F1 Score: ', metric.fMeasure(1.0))

In [None]:
# TODO: 1. Use CrossValidator for hyperparameter tuning