In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Raw ENEM 2023 records, kept untouched so the cleaning step below can be re-run.
df_enem_2023_original = pd.read_json('./enem_2023.json')

# Cleaning pipeline, expressed as one chain so no intermediate frame is mutated:
#   1. drop the rows that contain missing values,
#   2. translate the Portuguese column names to English,
#   3. translate the gender labels to English.
df_enem_2023 = (
    df_enem_2023_original
    .dropna()
    .rename(columns={
        "Linguagens": "Languages",
        "Ciências humanas": "Humanities",
        "Ciências da natureza": "Natural Sciences",
        "Matemática": "Mathematics",
        "Redação": "Essay",
        "Sexo": "Gender",
    })
    .assign(Gender=lambda frame: frame['Gender'].replace({'Homem': 'Man', 'Mulher': 'Woman'}))
)

In [3]:
# Preview the first ten rows of the cleaned frame.
df_enem_2023.head(n=10)

Unnamed: 0,Languages,Humanities,Natural Sciences,Mathematics,Essay,Gender
0,585.6693,686.512178,390.482473,452.281333,546.371706,Man
2,622.732384,495.510719,443.18358,669.683401,684.51575,Man
3,350.164605,602.041588,413.97981,676.608952,671.237114,Man
4,384.093657,489.24914,609.758123,520.426698,744.375048,Man
5,616.04336,502.007005,497.777935,615.902547,507.365147,Man
6,511.897593,479.19766,469.023899,441.139103,541.979136,Woman
7,377.439818,663.489624,556.239031,517.215575,635.424649,Woman
11,617.509514,685.004513,450.104243,556.460006,758.966704,Woman
14,447.988956,525.059975,351.735507,658.211529,716.065784,Man
18,475.352369,490.649377,442.433261,446.404701,688.692299,Man


In [4]:
def breadth(column, df=None):
    """Return the range (maximum minus minimum) of the values in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to measure.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_enem_2023`` frame is used, which is what existing one-argument
        calls rely on.

    Returns
    -------
    The difference between the column's largest and smallest value.
    """
    # Fall back to the notebook's global frame only when no frame is supplied,
    # so the helper can also be used (and tested) on any other frame.
    frame = df_enem_2023 if df is None else df
    values = frame[column]
    return values.max() - values.min()

def basic_statistics(column, df=None):
    """Return the mean and the median of the values in ``column``.

    Parameters
    ----------
    column : str
        Name of the column to summarise.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_enem_2023`` frame is used, which is what existing one-argument
        calls rely on.

    Returns
    -------
    tuple
        ``(average, median)`` of the column, in that order.
    """
    # Fall back to the notebook's global frame only when no frame is supplied,
    # so the helper can also be used (and tested) on any other frame.
    frame = df_enem_2023 if df is None else df
    values = frame[column]
    return values.mean(), values.median()

def calculate_final_note(df, weights=None):
    """Return the weighted average of the grade columns for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column for every key of ``weights``.
    weights : dict, optional
        Maps a column name to its weight. When omitted, the default weights
        are Essay 2, Mathematics 4, Languages 2, Humanities 1 and
        Natural Sciences 1.

    Returns
    -------
    The weighted sum of the columns divided by the sum of the weights
    (one value per row of ``df``).

    Raises
    ------
    ValueError
        If the weights add up to zero (including an empty mapping), because
        the weighted average is undefined in that case.
    """
    if weights is None:
        weights = {'Essay': 2, 'Mathematics': 4, 'Languages': 2, 'Humanities': 1, 'Natural Sciences': 1}

    total_weight = sum(weights.values())
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    # Numerator and denominator are both derived from the same mapping.
    return sum(df[col] * weight for col, weight in weights.items()) / total_weight

In [None]:
# Question 1: Breadth

disciplines = ['Essay', 'Mathematics', 'Languages', 'Humanities', 'Natural Sciences']
breadths = {discipline: breadth(discipline) for discipline in disciplines}

print("Breadth of disciplines:")
# The loop variable must not be named `breadth`: at cell scope that would rebind
# the `breadth` helper function to a float, so re-running this cell (or calling
# the helper anywhere later) would fail with "'float' object is not callable".
for discipline, grade_range in breadths.items():
    print(f"- {discipline}: {grade_range:.2f}")

# Key (discipline name) whose breadth value is the largest.
max_breadth = max(breadths, key=breadths.get)

print("-" * 50)

print(f"The subject with the breadth grade range is: {max_breadth}")

Breadth of disciplines:
- Essay: 641.00
- Mathematics: 488.08
- Languages: 517.46
- Humanities: 474.02
- Natural Sciences: 487.60
--------------------------------------------------
The subject with the breadth grade range is: Essay


In [None]:
# Question 2: Average and Median

# Map each discipline to its (average, median) pair.
statistics = {discipline: basic_statistics(discipline) for discipline in disciplines}

print("Average and median by subject:")
# Unpack the pair directly in the loop header instead of indexing it.
for discipline, (average, median) in statistics.items():
    print(f"- {discipline}: Average = {average:.2f}, Median = {median:.2f}")

Average and median by subject:
- Essay: Average = 641.19, Median = 639.60
- Mathematics: Average = 537.17, Median = 532.96
- Languages: Average = 517.47, Median = 517.13
- Humanities: Average = 528.35, Median = 528.50
- Natural Sciences: Average = 502.14, Median = 502.94


In [None]:
# Question 3: Top 500 in Computer Science

def calculate_final_note(df, weights=None):
    """Return the weighted average of the grade columns for each row of ``df``.

    NOTE(review): a function with this same name is also defined in the helper
    cell near the top of the notebook; from this cell onward this later
    definition is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column for every key of ``weights``.
    weights : dict, optional
        Maps a column name to its weight. When omitted, the default weights
        listed below are used.

    Returns
    -------
    The weighted sum of the columns divided by the sum of the weights
    (one value per row of ``df``).

    Raises
    ------
    ValueError
        If the weights add up to zero (including an empty mapping), because
        the weighted average is undefined in that case.
    """
    if weights is None:
        weights = {
            'Essay': 2,
            'Mathematics': 4,
            'Languages': 2,
            'Humanities': 1,
            'Natural Sciences': 1
        }

    total_weights = sum(weights.values())
    if total_weights == 0:
        raise ValueError("weights must not sum to zero")

    # Build the numerator from the same mapping that produces the denominator,
    # so adding, removing or re-weighting a column can never leave the two out
    # of sync (the columns used to be listed by hand, separately from the dict).
    weighted_sum = sum(df[column] * weight for column, weight in weights.items())

    return weighted_sum / total_weights

# Store the weighted final grade as a column, then keep the 500 rows with the
# highest final grade.
df_enem_2023['Final Note'] = calculate_final_note(df_enem_2023)
top_500 = df_enem_2023.nlargest(500, 'Final Note')

# Mean and standard deviation of the final grade, computed in a single pass.
average_500, deviation_500 = top_500['Final Note'].agg(['mean', 'std'])

print("Average and Standard Deviation of the top 500:")
print(f"- Average: {average_500:.2f}")
print(f"- Standard Deviation: {deviation_500:.2f}")

Average and Standard Deviation of the top 500:
- Average: 557.29
- Standard Deviation: 40.44


In [None]:
# Question 4: Top 40 in Computer Science

# The 40 highest final grades, selected from the rows already kept in `top_500`.
top_40 = top_500.nlargest(40, 'Final Note')

# Mean and variance of the final grade, computed in a single pass.
average_40, variance_40 = top_40['Final Note'].agg(['mean', 'var'])

print("Average and variance of the top 40:")
print(f"- Average: {average_40:.2f}")
print(f"- Variance: {variance_40:.2f}")

Average and variance of the top 40:
- Average: 636.68
- Variance: 295.77


In [None]:
# Question 5: Third quartile (Q3)

# 75th percentile of each column (missing values dropped first), computed with
# the same expression for both disciplines.
Q3_matematica, Q3_linguagens = (
    np.percentile(df_enem_2023[discipline_name].dropna(), 75)
    for discipline_name in ('Mathematics', 'Languages')
)

print(f"Q3 Mathematics: {Q3_matematica:.2f}")
print(f"Q3 Languages: {Q3_linguagens:.2f}")

Q3 Mathematics: 591.34
Q3 Languages: 572.65
