In [1]:
from pyspark.sql import SparkSession
# Location of the student-habits CSV; the Spark session is created and the
# file is read in the cells that follow.
filePath = "student_habits_performance.csv"

In [2]:
# Start the Spark session for this notebook (getOrCreate reuses a running one
# if it exists) and print the Spark version as a sanity check.
session_builder = SparkSession.builder.appName("TestApp")
spark = session_builder.getOrCreate()
print(spark.sparkContext.version)

3.5.1


In [3]:
from pyspark.ml.feature import StringIndexer

# Load the CSV: the first row supplies the column names and Spark infers the
# column types from the data.
taxiDF = spark.read.csv(filePath, header=True, inferSchema=True)

# String-valued columns that are converted to numeric indices.
cat_attribs = [
    "gender",
    "part_time_job",
    "diet_quality",
    "parental_education_level",
    "internet_quality",
    "extracurricular_participation",
]

# Each indexed column keeps its source name plus an "_encoded" suffix, so the
# output names are derived rather than maintained as a second hand-typed list.
cat_attribs_encoded = [f"{name}_encoded" for name in cat_attribs]

# One multi-column indexer covers all categorical attributes; it is only
# configured here and fitted in a later cell.
indexer = StringIndexer(inputCols=cat_attribs, outputCols=cat_attribs_encoded)


In [4]:
# Remove the student identifier; it is not one of the model inputs.
# NOTE(review): the name is spelled 'student_ID' here while the comment in the
# original notebook calls it student_id -- this presumably relies on Spark's
# default case-insensitive column resolution; confirm against the CSV header.
taxiDF = taxiDF.drop('student_ID')

# Fit the indexer, append the "*_encoded" columns, and discard the original
# string columns in one chained step.
indexerModel = indexer.fit(taxiDF)
indexed_df = indexerModel.transform(taxiDF).drop(*cat_attribs)

# Quick visual check of the encoded frame.
indexed_df.show(4)

# 80/20 train/test split with a fixed seed.
trainDF, testDF = indexed_df.randomSplit([0.8, 0.2], seed=42)
trainDF.show(3)

+---+-------------------+------------------+-------------+---------------------+-----------+------------------+--------------------+----------+--------------+---------------------+--------------------+--------------------------------+------------------------+-------------------------------------+
|age|study_hours_per_day|social_media_hours|netflix_hours|attendance_percentage|sleep_hours|exercise_frequency|mental_health_rating|exam_score|gender_encoded|part_time_job_encoded|diet_quality_encoded|parental_education_level_encoded|internet_quality_encoded|extracurricular_participation_encoded|
+---+-------------------+------------------+-------------+---------------------+-----------+------------------+--------------------+----------+--------------+---------------------+--------------------+--------------------------------+------------------------+-------------------------------------+
| 23|                0.0|               1.2|          1.1|                 85.0|        8.0|              

In [5]:
from pyspark.ml.feature import VectorAssembler

# Model inputs, in the order they are packed into the feature vector.
# exam_score is left out because it is the label.
feature_cols = [
    "age",
    "gender_encoded",
    "study_hours_per_day",
    "social_media_hours",
    "netflix_hours",
    "part_time_job_encoded",
    "attendance_percentage",
    "sleep_hours",
    "diet_quality_encoded",
    "exercise_frequency",
    "parental_education_level_encoded",
    "internet_quality_encoded",
    "mental_health_rating",
    "extracurricular_participation_encoded",
]

# Transformer that combines the input columns into a single "features" vector.
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Assembled view of the training split. In the cells shown, the pipeline
# applies the assembler itself, so this frame is only useful for inspection.
vecTrainDF = vecAssembler.transform(trainDF)

In [6]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree regressor that reads the assembled "features" vector and
# learns to predict the exam_score column.
dt = DecisionTreeRegressor(
    labelCol="exam_score",
    featuresCol="features",
)

In [7]:
from pyspark.ml import Pipeline

# Run feature assembly and the regressor as one unit, then fit the whole
# pipeline on the training split.
pipeline_stages = [vecAssembler, dt]
pipeline = Pipeline(stages=pipeline_stages)
pipelineModel = pipeline.fit(trainDF)

In [8]:
# Apply the fitted pipeline to the held-out test split and preview the
# resulting "features" and "prediction" columns.
predDF = pipelineModel.transform(testDF)
print("Prediction 5 rows")
predDF.show(n=5)

Prediction 5 rows
+---+-------------------+------------------+-------------+---------------------+-----------+------------------+--------------------+----------+--------------+---------------------+--------------------+--------------------------------+------------------------+-------------------------------------+--------------------+------------------+
|age|study_hours_per_day|social_media_hours|netflix_hours|attendance_percentage|sleep_hours|exercise_frequency|mental_health_rating|exam_score|gender_encoded|part_time_job_encoded|diet_quality_encoded|parental_education_level_encoded|internet_quality_encoded|extracurricular_participation_encoded|            features|        prediction|
+---+-------------------+------------------+-------------+---------------------+-----------+------------------+--------------------+----------+--------------+---------------------+--------------------+--------------------------------+------------------------+-------------------------------------+---------

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the predicted and the actual exam_score on
# the test-split predictions.
evaluator_settings = {
    "metricName": "rmse",
    "labelCol": "exam_score",
    "predictionCol": "prediction",
}
regressionEvaluator = RegressionEvaluator(**evaluator_settings)
rmse = regressionEvaluator.evaluate(predDF)
print("RMSE: ", rmse)

RMSE:  8.442631143196058


In [10]:
# Report the correlation of every training-split column with the label.
# The label itself is included in the loop, so one line is the trivial
# exam_score/exam_score correlation of 1.0.
#
# The f-string already interpolates both values, so no further .format() call
# is applied: formatting the finished string a second time would fail or
# mangle the output if a column name ever contained "{" or "}".
# The loop variable is called col_name (not col) so it cannot shadow the
# commonly imported pyspark.sql.functions.col.
for col_name in trainDF.columns:
    tempCorr = trainDF.corr(col_name, "exam_score")
    print(f"Correlation between {col_name} and exam_score: {tempCorr:.5f}")
    print()

Correlation between age and exam_score: -0.01503

Correlation between study_hours_per_day and exam_score: 0.81893

Correlation between social_media_hours and exam_score: -0.16536

Correlation between netflix_hours and exam_score: -0.18105

Correlation between attendance_percentage and exam_score: 0.11366

Correlation between sleep_hours and exam_score: 0.11130

Correlation between exercise_frequency and exam_score: 0.16545

Correlation between mental_health_rating and exam_score: 0.32407

Correlation between exam_score and exam_score: 1.00000

Correlation between gender_encoded and exam_score: -0.00120

Correlation between part_time_job_encoded and exam_score: -0.01967

Correlation between diet_quality_encoded and exam_score: -0.04667

Correlation between parental_education_level_encoded and exam_score: 0.00921

Correlation between internet_quality_encoded and exam_score: 0.04471

Correlation between extracurricular_participation_encoded and exam_score: 0.02282



In [11]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()