### Homework # 6: Linear Regression Exercise

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session that the cells below read data and build DataFrames with.
# NOTE(review): the app name says "Homework 9.2" while the notebook title says
# "Homework # 6" — confirm which label is intended.
spark = SparkSession.builder.appName("Homework 9.2").getOrCreate()

#### Load student performance data

In [2]:
# Read the student scores CSV: the first line supplies the column names and
# the column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('../Datasets/student_scores.csv')

In [3]:
# Print the inferred column names and types to confirm the CSV was parsed as expected.
df.printSchema()

root
 |-- Study Hours: integer (nullable = true)
 |-- Attendance Rate: integer (nullable = true)
 |-- Previous Exam Score: integer (nullable = true)
 |-- Final Exam Score: integer (nullable = true)



In [4]:
# Preview the loaded rows (show() prints at most 20 rows by default).
df.show()

+-----------+---------------+-------------------+----------------+
|Study Hours|Attendance Rate|Previous Exam Score|Final Exam Score|
+-----------+---------------+-------------------+----------------+
|          5|             80|                 60|              65|
|          8|             85|                 70|              78|
|         12|             90|                 85|              92|
|          4|             75|                 55|              58|
|         10|             88|                 80|              85|
|          7|             82|                 68|              72|
|          9|             87|                 75|              80|
|         11|             93|                 89|              95|
|          6|             79|                 62|              68|
|         13|             95|                 91|              98|
|          3|             70|                 50|              53|
|         15|             97|                 95|             

#### Process features and target variable

In [5]:
# Replace the spaced CSV headers with snake_case names that are easier to
# reference in later cells.
column_renames = {
    "Study Hours": "study_hours",
    "Attendance Rate": "attendance",
    "Previous Exam Score": "previous_score",
    "Final Exam Score": "final_score",
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

In [6]:
# Pack the three predictor columns into a single vector column named 'features'.
feature_columns = ['study_hours', 'attendance', 'previous_score']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [7]:
# Attach the feature vector, then keep only what the regressor needs:
# the vector itself and the final exam score exposed under the name 'label'.
assembled = assembler.transform(df)
output = assembled.select("features", assembled["final_score"].alias("label"))

In [8]:
# Display the assembled feature vectors next to their labels;
# truncate=False prints each cell in full instead of shortening long values.
output.show(truncate=False)

+----------------+-----+
|features        |label|
+----------------+-----+
|[5.0,80.0,60.0] |65   |
|[8.0,85.0,70.0] |78   |
|[12.0,90.0,85.0]|92   |
|[4.0,75.0,55.0] |58   |
|[10.0,88.0,80.0]|85   |
|[7.0,82.0,68.0] |72   |
|[9.0,87.0,75.0] |80   |
|[11.0,93.0,89.0]|95   |
|[6.0,79.0,62.0] |68   |
|[13.0,95.0,91.0]|98   |
|[3.0,70.0,50.0] |53   |
|[15.0,97.0,95.0]|100  |
|[2.0,60.0,40.0] |45   |
|[1.0,50.0,30.0] |38   |
|[14.0,96.0,94.0]|99   |
|[8.0,84.0,72.0] |76   |
|[9.0,86.0,74.0] |79   |
|[12.0,91.0,88.0]|94   |
|[10.0,89.0,83.0]|87   |
|[7.0,80.0,67.0] |71   |
+----------------+-----+



#### Train a Linear Regression model
Split the dataset into training (80%) and testing (20%).

In [9]:
# Randomly split the rows into roughly 80% training and 20% testing data; the seed is fixed at 42.
split_weights = [0.8, 0.2]
train_data, test_data = output.randomSplit(split_weights, seed=42)

In [10]:
# Linear regression reading the 'features' vector and the 'label' column.
# featuresCol is spelled out for clarity; 'features' is also the estimator's default.
lr = LinearRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(train_data)

In [11]:
# Fitted weights, one per input feature, in the order given to the
# VectorAssembler: study_hours, attendance, previous_score.
lr_model.coefficients

DenseVector([1.0629, -0.0175, 0.7911])

In [12]:
# Fitted intercept (constant term) of the regression.
lr_model.intercept

13.168219206208601

#### Evaluate the model using RMSE & R²

In [13]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the metrics printed in the following cells.
test_results = lr_model.evaluate(test_data)

In [14]:
# Root mean squared error on the test split, in the same units as the label (exam score points).
print("RMSE :", test_results.rootMeanSquaredError)

RMSE : 1.6188564576933784


In [15]:
# Coefficient of determination on the test split.
# NOTE(review): the displayed dataset has only 20 rows, so the test split is tiny —
# treat this value as a rough indication rather than a stable estimate.
print("R² :", test_results.r2)

R² : 0.9894428930445716


#### Predict final exam scores for new students
Example: 10 study hours, 90% attendance, and a previous exam score of 75.

Also try additional examples using your own values.

In [16]:
# Score students the model has never seen. Each tuple follows the order of
# `new_student_columns`, whose names must match the assembler's input columns.
# Plain tuples plus an explicit column list avoid a mid-notebook import.
new_student_columns = ["study_hours", "attendance", "previous_score"]
new_data = [
    (9.0, 88.0, 74.0),
    (8.0, 95.0, 69.0),
]

unseen_df = spark.createDataFrame(new_data, schema=new_student_columns)

# Reuse the assembler from training so the feature vector has the same layout
# the model was fitted on.
new_students = assembler.transform(unseen_df)

# transform() appends a 'prediction' column holding the predicted final exam score.
predictions = lr_model.transform(new_students)

predictions.select("features", "prediction").show(truncate=False)

+---------------+-----------------+
|features       |prediction       |
+---------------+-----------------+
|[9.0,88.0,74.0]|79.73786677976092|
|[8.0,95.0,69.0]|74.59717787764198|
+---------------+-----------------+

