In [3]:
import os
os.environ["JAVA_HOME"] = "/home/WARK/jdk-11.0.25+9"  

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1. Initialize Spark Session
spark = SparkSession.builder \
    .appName("WARK Data Pipeline") \
    .master("local[*]") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate()

# 2. Data Loading (Assuming you've already joined your CSVs)
data_path = "/home/WARK/DSDE_Final_Project_WARK/Final/joined_2017-2023.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Assuming your joined CSV has columns: "SubjectID", "SubjectName", ... (other features), "label"

# 3. Feature Engineering Stages

# 3.1 StringIndexer (Example - if you have categorical features)
#     Convert categorical columns to numerical indices
indexer = StringIndexer(inputCols=["SubjectID", "SubjectName"], 
                        outputCols=["indexedSubjectID", "indexedSubjectName"],
                        handleInvalid="keep")  # Handle unseen labels during testing

# 3.2 VectorAssembler
#     Combine feature columns into a single "features" vector
feature_cols = [c for c in df.columns if c not in ["SubjectID", "SubjectName", "label"]] # Add other feature column names
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# 4. Model Training Stage
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=40)

# 5. Create the Pipeline
pipeline = Pipeline(stages=[indexer, assembler, lr])

# 6. Split Data (You can integrate this into the pipeline as well if you like)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# 7. Fit the Pipeline (Train the Model)
pipeline_model = pipeline.fit(train_data)

# 8. Make Predictions
predictions = pipeline_model.transform(test_data)

# 9. Evaluate the Model
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")

# 10. Save the Pipeline
pipeline_model.write().overwrite().save("/home/WARK/DSDE_Final_Project_WARK/DE-ML/Pipeline")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/10 09:23:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Py4JJavaError: An error occurred while calling o36.fit.
: org.apache.spark.SparkException: Input column SubjectID does not exist.
	at org.apache.spark.ml.feature.StringIndexerBase.$anonfun$validateAndTransformSchema$2(StringIndexer.scala:128)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
	at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
	at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema(StringIndexer.scala:123)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema$(StringIndexer.scala:115)
	at org.apache.spark.ml.feature.StringIndexer.validateAndTransformSchema(StringIndexer.scala:145)
	at org.apache.spark.ml.feature.StringIndexer.transformSchema(StringIndexer.scala:252)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:71)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:237)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:145)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
