# Analysis of students' performance

## Set up block
*==> This cell must be executed at least once before running any of the following code cells*

In [11]:
# === All imports ===
## global imports
import os
import sys
## spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# === Locate the data source file ===
# The path is relative to the current working directory of the kernel.
data_file = "./data/StudentsPerformance.csv"
if os.path.exists(data_file):
    print(f"[ok]: data source file is found: {data_file}")
else:
    # The backslash of the "/!\" marker is escaped: a bare "\T" is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    # The printed text is unchanged.
    print(f"[ko]: data source file is missing: {data_file}\n/!\\The following code blocks will not be able to function normally until this problem is resolved")
    # Stop the cell here so nothing below runs without the data file.
    sys.exit(1)

# === Create the Spark session ===
try:
    # Build a session against the "local" master; getOrCreate() hands back the
    # already-running session when this cell is executed again.
    spark = SparkSession.builder.master("local").appName("# Analysis of students performance").getOrCreate()
    print("[ok]: spark session created")
except Exception as e:
    # The backslash of the "/!\" marker is escaped: a bare "\T" is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    # The printed text is unchanged.
    print(f"[ko]: cannot create spark session: {e}\n/!\\The following code blocks will not be able to function normally until this problem is resolved")
    # Stop the cell here: every later cell needs the `spark` session.
    sys.exit(1)

[ok]: data source file is found: ./data/StudentsPerformance.csv
[ok]: spark session created


## Exercise 1.1 - Exploration
### 1.1.1) Show the schema

In [2]:
# Load the CSV into a DataFrame: the first line holds the column names,
# column types are inferred from the data, and fields are comma-separated.
df_student_performance = spark.read.csv(
    data_file,
    header=True,
    inferSchema=True,
    sep=",",
)
# Print the column names and inferred types.
df_student_performance.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



### 1.1.2) Count the number of students

In [3]:
# Count the rows of the DataFrame (presumably one row per student — confirm
# against the data source) and print the total.
nbr_of_students = df_student_performance.count()
print("nbr of students:", nbr_of_students)

nbr of students: 1000


### 1.1.3) Display the first 10 lines

In [4]:
# Print the first 10 rows as a text table.
df_student_performance.show(n=10)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

### 1.1.4) Display descriptive statistics (describe)

In [6]:
# Compute summary statistics (count, mean, stddev, min, ...) for every column
# and print them; mean/stddev are NULL for the string columns.
df_student_performance.describe().show()

+-------+------+--------------+---------------------------+------------+-----------------------+------------------+------------------+-----------------+
|summary|gender|race/ethnicity|parental level of education|       lunch|test preparation course|        math score|     reading score|    writing score|
+-------+------+--------------+---------------------------+------------+-----------------------+------------------+------------------+-----------------+
|  count|  1000|          1000|                       1000|        1000|                   1000|              1000|              1000|             1000|
|   mean|  NULL|          NULL|                       NULL|        NULL|                   NULL|            66.089|            69.169|           68.054|
| stddev|  NULL|          NULL|                       NULL|        NULL|                   NULL|15.163080096009454|14.600191937252223|15.19565701086966|
|    min|female|       group A|         associate's degree|free/reduced|          

## Exercise 1.2 - Selections and Filters
### 1.2.1) Select only gender and the 3 scores

In [10]:
# Project the DataFrame onto gender plus the three score columns,
# then print the first 10 rows.
df_student_performance.select(
    ["gender", "math score", "reading score", "writing score"]
).show(n=10)

+------+----------+-------------+-------------+
|gender|math score|reading score|writing score|
+------+----------+-------------+-------------+
|female|        72|           72|           74|
|female|        69|           90|           88|
|female|        90|           95|           93|
|  male|        47|           57|           44|
|  male|        76|           78|           75|
|female|        71|           83|           78|
|female|        88|           95|           92|
|  male|        40|           43|           39|
|  male|        64|           64|           67|
|female|        38|           60|           50|
+------+----------+-------------+-------------+
only showing top 10 rows



### 1.2.2) Filter students who obtained a grade above 90 in mathematics

In [12]:
# Keep only the rows whose math score is strictly greater than 90,
# then print up to 20 of them.
df_student_performance.filter(
    df_student_performance["math score"] > 90
).show(n=20)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|  male|       group E|               some college|    standard|                   none|        97|           87|           82|
|  male|       group C|               some college|    standard|              completed|        98|           86|           90|
|female|       group E|          bachelor's degree|    standard|              completed|        99|          100|          100|
|  male|       group B|         associate's degree|    standard|              completed|        91|           89|           92|
|  male|       group E|         associate's degree|free/reduced|              completed|       100|     