In [1]:
# Importação das bibliotecas necessárias
import os
from datetime import datetime, date

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.functions import upper
from pyspark.sql.functions import when

In [2]:
# Initialize a Spark session that can read from S3 through the s3a connector.
#
# AWS credentials are taken from the environment instead of being written into
# the notebook, so they never end up in version control or in shared copies.
# A missing variable raises KeyError immediately, before any Spark work starts.
# NOTE(review): any key pair that was ever saved inside a notebook must be
# treated as compromised and rotated in AWS IAM.
aws_access_key = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]

spark = SparkSession.builder \
    .appName("PySpark S3 Example") \
    .config("spark.jars", "aws-java-sdk-bundle-1.12.262.jar, hadoop-aws-3.3.4.jar") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key) \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .getOrCreate()

# S3 bucket and object path of the input dataset
s3_bucket = "spark-s3-bucket-unisinos"
s3_file_path = f"s3a://{s3_bucket}/Cleaned_Students_Performance.csv"

# Read the CSV from S3; the first row is the header and column types are inferred
df = spark.read.csv(s3_file_path, header=True, inferSchema=True)

# Preview the loaded rows (show() prints the first 20 by default)
print("Original Data:")
df.show()

24/11/26 19:54:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/26 19:54:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


Original Data:
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+-----------+------------------+
|gender|race_ethnicity|parental_level_of_education|lunch|test_preparation_course|math_score|reading_score|writing_score|total_score|     average_score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+-----------+------------------+
|     0|       group B|          bachelor's degree|    1|                      0|        72|           72|           74|        218| 72.66666666666667|
|     0|       group C|               some college|    1|                      1|        69|           90|           88|        247| 82.33333333333333|
|     0|       group B|            master's degree|    1|                      0|        90|           95|           93|        278| 92.66666666666667|
|     1|       group A|         associate's degree|    0|                

24/11/26 19:54:41 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Análise de Desempenho por Gênero

In [None]:
# Average math, reading and writing scores for each value of `gender`.
gender_score_aliases = [
    ("math_score", "avg_math_score"),
    ("reading_score", "avg_reading_score"),
    ("writing_score", "avg_writing_score"),
]
gender_score_means = df.groupBy("gender").agg(
    *[mean(source).alias(label) for source, label in gender_score_aliases]
)
gender_score_means.show()

+------+------------------+-----------------+-----------------+
|gender|    avg_math_score|avg_reading_score|avg_writing_score|
+------+------------------+-----------------+-----------------+
|     1| 68.72821576763485|65.47302904564316|63.31120331950208|
|     0|63.633204633204635|72.60810810810811|72.46718146718146|
+------+------------------+-----------------+-----------------+



## Impacto do Nível de Educação Parental

In [None]:
# Average score per subject for each parental education level,
# listed from the highest to the lowest math average.
education_groups = df.groupBy("parental_level_of_education")
education_score_means = education_groups.agg(
    mean("math_score").alias("avg_math_score"),
    mean("reading_score").alias("avg_reading_score"),
    mean("writing_score").alias("avg_writing_score"),
)
education_ranked = education_score_means.orderBy("avg_math_score", ascending=False)
education_ranked.show()

+---------------------------+------------------+-----------------+-----------------+
|parental_level_of_education|    avg_math_score|avg_reading_score|avg_writing_score|
+---------------------------+------------------+-----------------+-----------------+
|            master's degree|  69.7457627118644|75.37288135593221|75.67796610169492|
|          bachelor's degree| 69.38983050847457|             73.0|73.38135593220339|
|         associate's degree| 67.88288288288288|70.92792792792793| 69.8963963963964|
|               some college|  67.1283185840708|69.46017699115045|68.84070796460178|
|           some high school|63.497206703910614|66.93854748603351|64.88826815642459|
|                high school| 62.13775510204081|64.70408163265306|62.44897959183673|
+---------------------------+------------------+-----------------+-----------------+



## Comparação entre Estudantes que Fizeram ou Não Curso de Preparação

In [None]:
# Average score per subject, split by the `test_preparation_course` flag.
prep_avg_math = mean("math_score").alias("avg_math_score")
prep_avg_reading = mean("reading_score").alias("avg_reading_score")
prep_avg_writing = mean("writing_score").alias("avg_writing_score")

prep_score_means = df.groupBy("test_preparation_course").agg(
    prep_avg_math, prep_avg_reading, prep_avg_writing
)
prep_score_means.show()

## Classificação de Estudantes por Faixas de Notas

In [5]:
# Label every student with a math performance band:
#   >= 90 -> "Excelente", >= 70 -> "Bom", >= 50 -> "Regular", below 50 -> "Ruim".
# Branches are evaluated top to bottom, so each lower bound only has to be
# checked once. `when` is imported in the notebook's import cell.
math_score_col = df["math_score"]
math_category_expr = (
    when(math_score_col >= 90, "Excelente")
    .when(math_score_col >= 70, "Bom")
    .when(math_score_col >= 50, "Regular")
    # A null score fails every comparison above. Ending with an explicit
    # not-null branch (instead of a catch-all otherwise) leaves the category
    # null for missing scores rather than reporting them as "Ruim".
    .when(math_score_col.isNotNull(), "Ruim")
)
# withColumn replaces an existing column of the same name, so re-running this
# cell yields the same result.
df = df.withColumn("math_category", math_category_expr)

# Number of students per band, most populated band first
df.groupBy("math_category").count().orderBy("count", ascending=False).show()


+-------------+-----+
|math_category|count|
+-------------+-----+
|      Regular|  456|
|          Bom|  351|
|         Ruim|  135|
|    Excelente|   58|
+-------------+-----+



## Correlação entre Desempenho em Diferentes Matérias

In [6]:
# Mean and standard deviation of the three subject scores.
subject_columns = ("math_score", "reading_score", "writing_score")
subject_scores = df.select(*subject_columns)
subject_scores.summary("mean", "stddev").show()

# Correlation of math against each of the other two subjects
# (DataFrame.stat.corr is called without a method argument).
stat_functions = df.stat
math_reading_corr = stat_functions.corr("math_score", "reading_score")
math_writing_corr = stat_functions.corr("math_score", "writing_score")

print(f"Correlação entre Matemática e Leitura: {math_reading_corr}")
print(f"Correlação entre Matemática e Escrita: {math_writing_corr}")

+-------+------------------+------------------+-----------------+
|summary|        math_score|     reading_score|    writing_score|
+-------+------------------+------------------+-----------------+
|   mean|            66.089|            69.169|           68.054|
| stddev|15.163080096009454|14.600191937252223|15.19565701086966|
+-------+------------------+------------------+-----------------+

Correlação entre Matemática e Leitura: 0.8175796636720539
Correlação entre Matemática e Escrita: 0.8026420459498075
