In [27]:
from pyspark.sql import SparkSession
filePath = "student_habits_performance.csv"
#make the spark session and import the file

In [28]:
spark = SparkSession.builder.appName("TestApp").getOrCreate()

In [29]:
#read the dataset, and then get the rows, debug as well to make sure things work
taxiDF = spark.read.csv(filePath, header=True,inferSchema = True)
print("Original top 5 rows")
taxiDF.select("student_id","age","gender","study_hours_per_day","social_media_hours","netflix_hours","part_time_job","attendance_percentage","sleep_hours","diet_quality","exercise_frequency","parental_education_level","internet_quality","mental_health_rating","extracurricular_participation","exam_score").show(5)
#split the dataset
trainDF, testDF = taxiDF.randomSplit([0.8,0.2], seed=42)

Original top 5 rows
+----------+---+------+-------------------+------------------+-------------+-------------+---------------------+-----------+------------+------------------+------------------------+----------------+--------------------+-----------------------------+----------+
|student_id|age|gender|study_hours_per_day|social_media_hours|netflix_hours|part_time_job|attendance_percentage|sleep_hours|diet_quality|exercise_frequency|parental_education_level|internet_quality|mental_health_rating|extracurricular_participation|exam_score|
+----------+---+------+-------------------+------------------+-------------+-------------+---------------------+-----------+------------+------------------+------------------------+----------------+--------------------+-----------------------------+----------+
|     S1000| 23|Female|                0.0|               1.2|          1.1|           No|                 85.0|        8.0|        Fair|                 6|                  Master|         Average

In [30]:
    from pyspark.ml.feature import OneHotEncoder

    # Assuming 'category_indexed' is the output from StringIndexer
    encoder = OneHotEncoder(inputCols=["gender",
                                       "social_media_hours",
                                       "diet_quality",
                                       "parental_education_level",
                                       "internet_quality",
                                       "extracurricular_participation"],
                             outputCols=["gender_encoded","social_media_hours_encoded", "diet_quality_encoded", "parental_education_level_encoded", "internet_quality_encoded", "extracurricular_participation_encoded"])

    encoded_df = encoder.fit(taxiDF).transform(taxiDF)

IllegalArgumentException: requirement failed: Column gender must be of type numeric but was actually of type string.

In [None]:
#transformer
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=["student_id","age","gender_encoded","study_hours_per_day","social_media_hours_encoded","netflix_hours","part_time_job","attendance_percentage","sleep_hours","diet_quality_encoded","exercise_frequency","parental_education_level_encoded","internet_quality_encoded","mental_health_rating","extracurricular_participation_encoded"], outputCol="features")

vecTrainDF = vecAssembler.transform(trainDF)

In [None]:
#make the decision tree model
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol="features",labelCol="exam_score")

In [None]:
#train the model
from pyspark.ml import Pipeline
pipeline = Pipeline (stages=[vecAssembler,dt])
pipelineModel = pipeline.fit(trainDF)

In [None]:
#test dataset stuff
predDF = pipelineModel.transform(testDF)
print("Prediction 10 rows")
predDF.select("student_id","age","gender","study_hours_per_day","social_media_hours","netflix_hours","part_time_job","attendance_percentage","sleep_hours","diet_quality","exercise_frequency","parental_education_level","internet_quality","mental_health_rating","extracurricular_participation","prediction").show(10)

In [None]:
#check RMSE
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_amount",
    metricName="rmse"
)
rmse = regressionEvaluator.evaluate(predDF)
print("RMSE: ",rmse)

In [None]:
spark.stop()