In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# 2020 data: grouped distribution. Every run of digits in column 0 of each row
# is extracted; the three values per row are taken, in order, as the lower
# class limit, the upper class limit and the class frequency.
raw_2020 = pd.read_csv("./2020input0.csv", header=None)
df_2020 = pd.DataFrame(
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    raw_2020[0].str.findall(r"(\d+)").to_list(),
    columns=["x_low", "x_high", "frequency"],
).astype(int)  # findall yields strings; cast all three columns at once

# 2024 data: a single unnamed column, loaded as "grade" and cast to int.
# (read_csv already infers dtypes, so no extra infer_objects() pass is needed.)
df_2024 = pd.read_csv("./2024input0.csv", header=None, names=["grade"]).astype(
    {"grade": int}
)

In [3]:
# Spot-check five randomly chosen rows of the parsed 2020 frame.
# No random_state is passed, so the rows shown differ from run to run.
df_2020.sample(5)

Unnamed: 0,x_low,x_high,frequency
12,48,52,17
10,40,44,14
20,80,84,13
22,88,92,8
2,8,12,1


In [4]:
# Spot-check five randomly chosen rows of the 2024 grades.
# No random_state is passed, so the rows shown differ from run to run.
df_2024.sample(5)

Unnamed: 0,grade
115,60
277,75
122,56
81,49
248,84


In [13]:
def calculate_mean_grouped_data(x_low, x_high, frequency):
    """
    Calculates the weighted mean of grouped data.

    Each class interval is represented by its midpoint, ``(x_low + x_high) / 2``,
    and the midpoints are averaged using the class frequencies as weights.

    Args:
        x_low: An array-like object containing the lower limits of the class intervals.
        x_high: An array-like object containing the upper limits of the class intervals.
        frequency: An array-like object containing the frequencies of each class interval.

    Returns:
        The weighted mean of the grouped data.
    """
    # Convert explicitly so that plain lists/tuples are added element-wise
    # instead of being concatenated by ``+``. Inputs are matched by position.
    x_low = np.asarray(x_low, dtype=float)
    x_high = np.asarray(x_high, dtype=float)
    frequency = np.asarray(frequency, dtype=float)

    class_score = (x_low + x_high) / 2
    return np.sum(class_score * frequency) / np.sum(frequency)


def calculate_standard_dev_grouped_data(mean, x_low, x_high, frequency):
    """
    Calculates the population standard deviation of grouped data.

    Each class interval is represented by its midpoint, ``(x_low + x_high) / 2``,
    weighted by its frequency. The result is
    ``sqrt(sum(f * (x - m) ** 2) / N)`` where ``m`` is the weighted mean of the
    midpoints and ``N`` is the total frequency (divisor ``N``, not ``N - 1``).

    Args:
        mean: Float, the weighted mean of the distribution. Kept for interface
            compatibility only: the value is not read, the mean is recomputed
            from the data so the result never depends on what is passed here.
        x_low: An array-like object containing the lower limits of the class intervals.
        x_high: An array-like object containing the upper limits of the class intervals.
        frequency: An array-like object containing the frequencies of each class interval.

    Returns:
        The standard deviation of the distribution
    """
    # Convert explicitly so that plain lists/tuples are added element-wise
    # instead of being concatenated by ``+``. Inputs are matched by position.
    x_low = np.asarray(x_low, dtype=float)
    x_high = np.asarray(x_high, dtype=float)
    frequency = np.asarray(frequency, dtype=float)

    class_score = (x_low + x_high) / 2
    N = np.sum(frequency)
    weighted_mean = np.sum(frequency * class_score) / N

    # Sum squared deviations from the mean rather than evaluating
    # N * sum(f * x**2) - sum(f * x)**2: the latter subtracts two huge, nearly
    # equal numbers and loses precision (or goes negative) for large values.
    variance = np.sum(frequency * (class_score - weighted_mean) ** 2) / N
    return np.sqrt(variance)

In [14]:
# Summary statistics of the 2020 grouped distribution, computed from the
# class limits and frequencies held in df_2020.
mean = calculate_mean_grouped_data(
    x_low=df_2020["x_low"],
    x_high=df_2020["x_high"],
    frequency=df_2020["frequency"],
)
std_dev = calculate_standard_dev_grouped_data(
    mean=mean,
    x_low=df_2020["x_low"],
    x_high=df_2020["x_high"],
    frequency=df_2020["frequency"],
)

print(f"Mean: {mean}, Standard Deviation: {std_dev}")

Mean: 63.21739130434783, Standard Deviation: 14.958232771382704


In [7]:
# Displays every lower class limit shifted down by 20. The result is not
# assigned, so df_2020 itself is unchanged.
# NOTE(review): the reason for the constant 20 is not visible here — confirm, or remove this scratch cell.
df_2020["x_low"] - 20

0    -20
1    -16
2    -12
3     -8
4     -4
5      0
6      4
7      8
8     12
9     16
10    20
11    24
12    28
13    32
14    36
15    40
16    44
17    48
18    52
19    56
20    60
21    64
22    68
23    72
24    76
Name: x_low, dtype: int64

In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the 2024 grades.
df_2024.describe()

Unnamed: 0,grade
count,297.0
mean,59.232323
std,13.004024
min,11.0
25%,52.0
50%,59.0
75%,68.0
max,91.0


In [9]:
# NOTE(review): no cell shown here defines `df`, so this cell raises NameError.
# Presumably df_2020 or df_2024 was intended — confirm, or delete the cell.
df

NameError: name 'df' is not defined