In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# Explicit JAVA_HOME / SPARK_HOME overrides are left disabled, so findspark.init()
# below is called without being told where Spark is installed.
# NOTE(review): presumably findspark then resolves to the pip-installed pyspark
# package rather than the unpacked tarball — confirm.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
# Locate Spark and make pyspark importable in this kernel.
findspark.init()
# The returned value is discarded: this is not the last expression of the cell.
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Build (or reuse) the notebook's SparkSession. The parenthesised chain replaces
# the backslash line continuations.
spark = (
    SparkSession.builder
    .appName("Our First Spark Example")
    .getOrCreate()
)

# Bare expression as the final line of the cell so the session is displayed.
spark

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,081 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,641 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://developer.download.nvidia.com/comp

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [5]:
# Request a SparkSession under the application name 'regresion_lineal'.
# NOTE(review): the first cell already creates a session named
# "Our First Spark Example"; getOrCreate() presumably returns that existing
# session — confirm whether the name given here actually takes effect.
spark = (
    SparkSession.builder
    .appName('regresion_lineal')
    .getOrCreate()
)

In [6]:
# Load the student-performance CSV: the first row supplies the column names and
# the column types are inferred from the values.
data = spark.read.csv(
    "/content/Student_Performance.csv",
    header=True,
    sep=',',
    inferSchema=True,
)
# Preview the first five rows.
data.show(n=5)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|            7|             99|                       Yes|          9|                               1|             91.0|
|            4|             82|                        No|          4|                               2|             65.0|
|            8|             51|                       Yes|          7|                               2|             45.0|
|            5|             52|                       Yes|          5|                               2|             36.0|
|            7|             75|                        No|          8|                               5|             66.0|
+-------------+---------

In [7]:
from pyspark.sql.functions import col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline

In [8]:
from pyspark.sql.functions import when

def convertir_a_binario(valor):
    """Build a column expression that encodes a Yes/No column as 1/0.

    Args:
        valor: Column whose values are compared against the literal 'Yes'.

    Returns:
        A column expression that yields 1 where ``valor`` equals 'Yes' and 0 in
        every other case (the ``otherwise`` branch).
    """
    es_si = valor == 'Yes'
    return when(es_si, 1).otherwise(0)

# Column holding the Yes/No flag that is encoded as 1/0.
columna_extra = "Extracurricular Activities"

# Encode only while the column is still a string. `data` is overwritten with its
# own transformation, so without this guard a second run of the cell would no
# longer find any 'Yes' value and would turn the whole column into 0.
if dict(data.dtypes)[columna_extra] == "string":
    data = data.withColumn(columna_extra, convertir_a_binario(data[columna_extra]))

# Spark writes a directory of part files under this name, not a single CSV file.
data.write.csv("Student_Performance.csv_updated.csv", header=True, mode="overwrite")

data.show(5)


+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|            7|             99|                         1|          9|                               1|             91.0|
|            4|             82|                         0|          4|                               2|             65.0|
|            8|             51|                         1|          7|                               2|             45.0|
|            5|             52|                         1|          5|                               2|             36.0|
|            7|             75|                         0|          8|                               5|             66.0|
+-------------+---------

In [9]:
# Project the six columns used for modelling, in their original order.
notes_data = data.select(
    *[
        col(nombre)
        for nombre in (
            "Hours Studied",
            "Previous Scores",
            "Extracurricular Activities",
            "Sleep Hours",
            "Sample Question Papers Practiced",
            "Performance Index",
        )
    ]
)

In [10]:
# Pack the five predictor columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=["Previous Scores", "Hours Studied", "Sleep Hours", "Sample Question Papers Practiced","Extracurricular Activities"],
    outputCol="features")

data = assembler.transform(notes_data)

# Split BEFORE fitting the scaler so the min/max statistics come from the training
# rows only; fitting on the full dataset would leak test-set information.
train_raw, test_raw = data.randomSplit([0.8, 0.2], seed=53)

scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[scaler])
scalerModel = pipeline.fit(train_raw)

# Apply the training-fitted scaler to each split, and to the full dataset so that
# `scaledData` remains available to the cells that display it.
train_data = scalerModel.transform(train_raw)
test_data = scalerModel.transform(test_raw)
scaledData = scalerModel.transform(data)

In [11]:
# Preview the first ten rows, including the "features" and "scaled_features" vector columns.
scaledData.show(10)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+--------------------+--------------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|            features|     scaled_features|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+--------------------+--------------------+
|            7|             99|                         1|          9|                               1|             91.0|[99.0,7.0,9.0,1.0...|[1.0,0.75,1.0,0.1...|
|            4|             82|                         0|          4|                               2|             65.0|[82.0,4.0,4.0,2.0...|[0.71186440677966...|
|            8|             51|                         1|          7|                               2|             45.0|[51.0,8.0,7.0,2.0...|[0.18644067796610...|
|            5| 

In [12]:
from pyspark.ml.regression import LinearRegression

# Linear regression that reads the scaled feature vector and predicts the
# "Performance Index" column.
lr = LinearRegression(
    labelCol="Performance Index",
    featuresCol="scaled_features",
)

In [13]:
model = lr.fit(scaledData)

In [14]:
predictions = model.transform(scaledData)


In [15]:
# Show predicted and actual "Performance Index" side by side (show() prints the top 20 rows here).
predictions.select("prediction", "Performance Index").show()

+------------------+-----------------+
|        prediction|Performance Index|
+------------------+-----------------+
| 91.85200883832493|             91.0|
| 63.15778719259512|             65.0|
| 45.05283229046659|             45.0|
|36.551200812778006|             36.0|
| 67.09133944591392|             66.0|
|59.409075703567844|             61.0|
| 64.41949151913948|             63.0|
| 38.27575643260526|             42.0|
| 62.84083730330661|             61.0|
| 69.89922225879367|             69.0|
| 84.31702955794282|             84.0|
|  72.4755323391482|             73.0|
|27.062407180930805|             27.0|
| 33.21855456796726|             33.0|
| 65.55995877368275|             68.0|
| 47.46128330225888|             43.0|
| 68.32654855666902|             67.0|
| 71.93676233809217|             70.0|
| 30.90517826262616|             30.0|
| 59.93580488378601|             63.0|
+------------------+-----------------+
only showing top 20 rows



In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the "prediction" column and the "Performance Index" label.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Performance Index",
)

# Evaluated over whichever rows `predictions` contains.
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)


Root Mean Squared Error (RMSE): 2.037486351842894
