In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse, if one is already running) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName('CA2')
    .getOrCreate()
)

In [4]:
# Evaluate the session object as the cell's last expression so the notebook renders it.
spark

In [5]:
import os

# Location of the input CSV (a header row plus one row per student).
# The path can be overridden with the CA2_CSV_PATH environment variable so the
# notebook runs on machines other than the author's; when the variable is not
# set, the original local path is used unchanged.
csv_file_path = os.environ.get(
    "CA2_CSV_PATH",
    'C:/Python Cursos/My Projects/Name123.csv',
)

In [6]:
# Load the CSV: treat the first line as column names and let Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(csv_file_path)

In [7]:
# Print the DataFrame read from the CSV to inspect its rows and column headers.
df.show()

+-------------+---+---+---+
|         Name|  1|  2|  3|
+-------------+---+---+---+
|     John Doe| 85| 78| 92|
|   Jane Smith| 90| 88| 81|
|Alice Johnson| 72| 95| 87|
|    Bob Brown| 88| 83| 79|
|Charlie Davis| 91| 76| 85|
+-------------+---+---+---+



In [8]:
# Print each column's name, type and nullability (types were inferred via inferSchema=True).
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 3: integer (nullable = true)



In [9]:
# Replace the numeric CSV headers with the subject each column holds.
subject_by_header = {"1": "Chemistry", "2": "Physics", "3": "Biology"}
for raw_header, subject_name in subject_by_header.items():
    df = df.withColumnRenamed(raw_header, subject_name)

In [10]:
# Display df again to confirm the subject columns were renamed.
df.show()

+-------------+---------+-------+-------+
|         Name|Chemistry|Physics|Biology|
+-------------+---------+-------+-------+
|     John Doe|       85|     78|     92|
|   Jane Smith|       90|     88|     81|
|Alice Johnson|       72|     95|     87|
|    Bob Brown|       88|     83|     79|
|Charlie Davis|       91|     76|     85|
+-------------+---------+-------+-------+



In [11]:
from pyspark.sql.functions import split

In [12]:
# Break "Name" on single spaces; token 0 becomes FirstName and token 1 becomes Surname.
name_tokens = split(df["Name"], " ")
df = df.withColumn("FirstName", name_tokens.getItem(0))
df = df.withColumn("Surname", name_tokens.getItem(1))

In [13]:
# Display df to check the new FirstName and Surname columns.
df.show()

+-------------+---------+-------+-------+---------+-------+
|         Name|Chemistry|Physics|Biology|FirstName|Surname|
+-------------+---------+-------+-------+---------+-------+
|     John Doe|       85|     78|     92|     John|    Doe|
|   Jane Smith|       90|     88|     81|     Jane|  Smith|
|Alice Johnson|       72|     95|     87|    Alice|Johnson|
|    Bob Brown|       88|     83|     79|      Bob|  Brown|
|Charlie Davis|       91|     76|     85|  Charlie|  Davis|
+-------------+---------+-------+-------+---------+-------+



In [14]:
# Keep the split name columns followed by the three subject scores; the original "Name" is dropped.
report_columns = ["FirstName", "Surname", "Chemistry", "Physics", "Biology"]
df2 = df.select(*report_columns)



In [15]:
# Display df2 to verify the selected columns and their order.
df2.show()

+---------+-------+---------+-------+-------+
|FirstName|Surname|Chemistry|Physics|Biology|
+---------+-------+---------+-------+-------+
|     John|    Doe|       85|     78|     92|
|     Jane|  Smith|       90|     88|     81|
|    Alice|Johnson|       72|     95|     87|
|      Bob|  Brown|       88|     83|     79|
|  Charlie|  Davis|       91|     76|     85|
+---------+-------+---------+-------+-------+



In [16]:
from pyspark.sql.functions import col

In [17]:
# Weighted total: Chemistry counts for half, Physics and Biology for a quarter each.
chemistry_share = col("Chemistry") * 0.5
physics_share = col("Physics") * 0.25
biology_share = col("Biology") * 0.25
df2 = df2.withColumn("Total", chemistry_share + physics_share + biology_share)

In [18]:
# Display df2 with the newly added Total column.
df2.show()

+---------+-------+---------+-------+-------+-----+
|FirstName|Surname|Chemistry|Physics|Biology|Total|
+---------+-------+---------+-------+-------+-----+
|     John|    Doe|       85|     78|     92| 85.0|
|     Jane|  Smith|       90|     88|     81|87.25|
|    Alice|Johnson|       72|     95|     87| 81.5|
|      Bob|  Brown|       88|     83|     79| 84.5|
|  Charlie|  Davis|       91|     76|     85|85.75|
+---------+-------+---------+-------+-------+-----+



In [23]:
# Per-subject view: student names alongside their Chemistry score only.
chemistry_columns = ["FirstName", "Surname", "Chemistry"]
dfChemistry = df2.select(*chemistry_columns)
dfChemistry.show()

+---------+-------+---------+
|FirstName|Surname|Chemistry|
+---------+-------+---------+
|     John|    Doe|       85|
|     Jane|  Smith|       90|
|    Alice|Johnson|       72|
|      Bob|  Brown|       88|
|  Charlie|  Davis|       91|
+---------+-------+---------+



In [24]:
# Per-subject view: student names alongside their Physics score only.
physics_columns = ["FirstName", "Surname", "Physics"]
dfPhysics = df2.select(*physics_columns)
dfPhysics.show()

+---------+-------+-------+
|FirstName|Surname|Physics|
+---------+-------+-------+
|     John|    Doe|     78|
|     Jane|  Smith|     88|
|    Alice|Johnson|     95|
|      Bob|  Brown|     83|
|  Charlie|  Davis|     76|
+---------+-------+-------+



In [25]:
# Per-subject view: student names alongside their Biology score only.
biology_columns = ["FirstName", "Surname", "Biology"]
dfBiology = df2.select(*biology_columns)
dfBiology.show()

+---------+-------+-------+
|FirstName|Surname|Biology|
+---------+-------+-------+
|     John|    Doe|     92|
|     Jane|  Smith|     81|
|    Alice|Johnson|     87|
|      Bob|  Brown|     79|
|  Charlie|  Davis|     85|
+---------+-------+-------+



In [37]:
# Order the Biology view alphabetically (ascending) by surname.
dfBiology = dfBiology.sort("Surname", ascending=True)

In [38]:
# Display the Biology scores after ordering by Surname.
dfBiology.show()

+---------+-------+-------+
|FirstName|Surname|Biology|
+---------+-------+-------+
|      Bob|  Brown|     79|
|  Charlie|  Davis|     85|
|     John|    Doe|     92|
|    Alice|Johnson|     87|
|     Jane|  Smith|     81|
+---------+-------+-------+

