In [None]:
import polars as pl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# ETL

In [None]:
# Read the projections spreadsheet into a polars DataFrame.
df = pl.read_ods(source='projecoes_2024_tab1_idade_simples.ods')

In [None]:
df.head()

In [None]:
# Report every column that holds at least one null value.
# `null_count()` returns a one-row frame with the null tally of each column,
# in the same order as `df.columns`.
null_tally = df.null_count().row(0, named=True)
for column_name, n_missing in null_tally.items():
    if n_missing > 0:
        print(f'{column_name} has null values.')

In [None]:
# Reshape the wide spreadsheet into a tidy frame with one row per
# (age, gender, year) and the value in `predicted_population`.
#
# NOTE: this cell rebinds `df`, so it is not idempotent; re-run the
# `read_ods` cell before executing it a second time.
df = (
    df
    # Keep only the rows whose LOCAL is 'Brasil'.
    .filter(pl.col('LOCAL') == 'Brasil')
    .drop(['CÓD.', 'SIGLA', 'LOCAL'])
    .rename({
        'IDADE': 'age',
        'SEXO': 'gender'
    })
    # One input column per year (2000..2070) becomes one row per year.
    .unpivot(
        index=['age', 'gender'],
        on=[str(x) for x in range(2000, 2071)],
        variable_name='year',
        value_name='predicted_population'
    )
    .with_columns(
        pl.col('gender').replace({
            'Ambos': 'Both',
            'Homens': 'Male',
            'Mulheres': 'Female'
        })
    )
    .with_columns(
        pl.col('age').cast(pl.Int8),
        pl.col('gender').cast(pl.Categorical),
        pl.col('year').cast(pl.Int16),
        # Float64 rather than Float32: a 32-bit float only represents integers
        # exactly up to 2**24 (~16.7 million), so the sums computed later in
        # the notebook would be rounded once they exceed that magnitude.
        pl.col('predicted_population').cast(pl.Float64)
    )
)

In [None]:
df.head()

# ANALYSIS PER GENDER AND YEAR

In [None]:
# Total predicted population for every (gender, year) pair, ordered by year.
df_age_year = (
    df
    .group_by('gender', 'year')
    .agg(pl.col('predicted_population').sum())
    .sort('year')
)

In [None]:
df_age_year.head()

In [None]:
# Ascending list of the distinct years; reused as x-axis ticks by the plots.
years = df_age_year.get_column('year').unique().sort().to_list()

In [None]:
# Year in which the 'Both' total is largest: order the 'Both' rows by
# population (largest first) and read `year` from the top row.
year_max_population_both = (
    df_age_year
    .filter(pl.col('gender') == 'Both')
    .sort('predicted_population', descending=True)
    .row(0, named=True)['year']
)

In [None]:
year_max_population_both

In [None]:
# Year in which the 'Male' total is largest: order the 'Male' rows by
# population (largest first) and read `year` from the top row.
year_max_population_male = (
    df_age_year
    .filter(pl.col('gender') == 'Male')
    .sort('predicted_population', descending=True)
    .row(0, named=True)['year']
)

In [None]:
year_max_population_male

In [None]:
# Year in which the 'Female' total is largest: order the 'Female' rows by
# population (largest first) and read `year` from the top row.
year_max_population_female = (
    df_age_year
    .filter(pl.col('gender') == 'Female')
    .sort('predicted_population', descending=True)
    .row(0, named=True)['year']
)

In [None]:
year_max_population_female

In [None]:
# Line chart of the total predicted population per year, one line per gender,
# with dashed vertical lines at the year each series peaks.
fig, ax = plt.subplots(figsize=(15, 10))

markers = ['o', '>', 'd']
dark2 = plt.get_cmap('Dark2').colors

# Iterating a polars GroupBy yields the group key as a tuple, even when
# grouping by a single column; unpack it so the legend shows the plain
# gender name instead of the tuple repr.
frames_by_gender = {
    str(gender): frame
    for (gender,), frame in df_age_year.group_by('gender')
}

# group_by does not guarantee the order of the groups, so iterate over the
# sorted keys to keep each gender's marker and colour stable across runs.
for gender, marker, color in zip(sorted(frames_by_gender), markers, dark2):
    group = frames_by_gender[gender].sort('year')
    ax.plot(
        group['year'],
        group['predicted_population'],
        label=gender,
        marker=marker,
        color=color
    )

ax.ticklabel_format(style='plain', axis='y')
# Show y ticks as integers with '.' as the thousands separator.
ax.get_yaxis().set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
)
ax.grid()
ax.tick_params('x', rotation=90)
ax.set_xticks(years)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
ax.set_title('Predicted Population per Gender and Year')
ax.set_ylabel('Predicted Population')
ax.set_xlabel('Year')

# Mark every distinct peak year once (series may share the same peak year).
peak_years = sorted({
    year_max_population_both,
    year_max_population_male,
    year_max_population_female
})
for peak_year in peak_years:
    ax.axvline(peak_year, linestyle='--', color='red')

plt.tight_layout()
plt.show()

# The combined ('Both') and male populations reach their maximum in 2041
# The female population reaches its maximum in 2042

# ANALYSIS PER AGE GROUP AND YEAR

In [None]:
df.head()

In [None]:
# Largest age value present in the data.
max_age = df.get_column('age').max()

In [None]:
# Total predicted population per (year, age_group), ordered by year.
#
# Age buckets (left-closed): Minor < 18, Young Adult 18-29, Adult 30-59,
# Elder >= 60.
#
# `df` holds 'Both', 'Male' and 'Female' rows for every age/year. Summing
# across all of them would add the 'Both' rows on top of the per-gender
# rows, so only the 'Both' rows are aggregated here.
df_agegroup_year = (
    df
    .filter(pl.col('gender') == 'Both')
    .with_columns(
        age_group=pl.col('age').cut(
            breaks=[18, 30, 60],
            labels=['Minor', 'Young Adult', 'Adult', 'Elder'],
            left_closed=True
        )
    )
    .group_by(['year', 'age_group'])
    .agg(pl.col('predicted_population').sum())
    .sort('year', descending=False)
)

In [None]:
df_agegroup_year.head()

In [None]:
# Line chart of the predicted population per year, one line per age group.
fig, ax = plt.subplots(figsize=(15, 10))

markers = ['o', '^', 'd', 's']
dark2 = plt.get_cmap('Dark2').colors

# Legend/plot order, youngest bucket first.
age_group_order = ['Minor', 'Young Adult', 'Adult', 'Elder']

# Iterating a polars GroupBy yields the group key as a tuple, even when
# grouping by a single column; unpack it so the legend shows the plain
# age-group name instead of the tuple repr.
frames_by_age_group = {
    str(age_group): frame
    for (age_group,), frame in df_agegroup_year.group_by('age_group')
}

# group_by does not guarantee the order of the groups, so walk the buckets in
# a fixed order to keep each one's marker and colour stable across runs.
for age_group, marker, color in zip(age_group_order, markers, dark2):
    group = frames_by_age_group.get(age_group)
    if group is None:
        # Bucket absent from the data: nothing to draw.
        continue
    group = group.sort('year')
    ax.plot(
        group['year'],
        group['predicted_population'],
        marker=marker,
        color=color,
        label=age_group
    )

ax.ticklabel_format(style='plain', axis='y')
# Show y ticks as integers with '.' as the thousands separator.
ax.get_yaxis().set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
)
ax.grid()
ax.tick_params('x', rotation=90)
ax.set_xticks(years)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
ax.set_title('Predicted Population per Age Group and Year')
ax.set_ylabel('Predicted Population')
ax.set_xlabel('Year')

# Hand-picked years to highlight, in consecutive pairs.
# NOTE(review): presumably the years between which two age-group curves
# cross - confirm and explain in the markdown cell below.
highlighted_years = [2002, 2003, 2026, 2027, 2033, 2034, 2066, 2067]
for highlighted_year in highlighted_years:
    ax.axvline(highlighted_year, linestyle='--', color='red')

plt.tight_layout()
plt.show()

# EXPLICA MARCAÇÕES

# ANALYSIS PER GENDER, AGE GROUP AND YEAR

In [None]:
df.head()

In [None]:
# Total predicted population per (gender, year, age_group), ordered by year.
# The 'Both' rows are excluded so only the per-gender rows are aggregated.
# Age buckets (left-closed): Minor < 18, Young Adult 18-29, Adult 30-59,
# Elder >= 60.
df_agegroup_gender_year = (
    df
    .filter(pl.col('gender') != 'Both')
    .with_columns(
        pl.col('age')
        .cut(
            breaks=[18, 30, 60],
            labels=['Minor', 'Young Adult', 'Adult', 'Elder'],
            left_closed=True
        )
        .alias('age_group')
    )
    .group_by('gender', 'year', 'age_group')
    .agg(pl.col('predicted_population').sum())
    .sort('year')
)

In [None]:
df_agegroup_gender_year.head()

In [None]:
# Line chart of the predicted population per year, one line per
# (age group, gender) combination.
fig, ax = plt.subplots(figsize=(15, 10))

# Four age groups x two genders = eight series, so eight markers are needed.
# With fewer markers than series, zipping them together would silently drop
# the surplus series from the chart.
markers = ['o', '^', 'd', 's', 'p', 'h', 'P', 'X']
dark2 = plt.get_cmap('Dark2').colors

frames_by_key = {
    (str(age_group), str(gender)): frame
    for (age_group, gender), frame
    in df_agegroup_gender_year.group_by(['age_group', 'gender'])
}

# group_by does not guarantee the order of the groups; order the series by
# age bucket (youngest first) and then gender so styling is stable across runs.
age_group_rank = {
    name: rank
    for rank, name in enumerate(['Minor', 'Young Adult', 'Adult', 'Elder'])
}
ordered_keys = sorted(
    frames_by_key,
    key=lambda key: (age_group_rank.get(key[0], len(age_group_rank)), key[1])
)

for position, (age_group, gender) in enumerate(ordered_keys):
    group = frames_by_key[(age_group, gender)].sort('year')
    label = f'{gender} | {age_group}'
    ax.plot(
        group['year'],
        group['predicted_population'],
        # Wrap around instead of truncating if there are ever more series
        # than available styles.
        color=dark2[position % len(dark2)],
        label=label,
        marker=markers[position % len(markers)]
    )

ax.ticklabel_format(style='plain', axis='y')
# Show y ticks as integers with '.' as the thousands separator.
ax.get_yaxis().set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
)
ax.grid()
ax.tick_params('x', rotation=90)
ax.set_xticks(years)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
ax.set_title('Predicted Population per Gender, Age Group and Year')
ax.set_ylabel('Predicted Population')
ax.set_xlabel('Year')

plt.tight_layout()
plt.show()

# ANALYSIS: MALE AND YOUNG ADULT

In [None]:
df.head()

In [None]:
# Total predicted population of male young adults per year, ordered by year.
#
# "Young Adult" is the 18-29 bucket used by the age-group analyses in this
# notebook (cut breaks [18, 30, 60], left-closed). The previous filter,
# `is_in(range(18))`, selected ages 0-17, i.e. the "Minor" bucket.
df_male_youngadult = (
    df
    .filter(
        pl.col('age').is_between(18, 29),  # bounds are inclusive
        pl.col('gender') == 'Male'
    )
    .group_by('year')
    .agg(pl.col('predicted_population').sum())
    .sort('year', descending=False)
)

In [None]:
df_male_youngadult.head()

In [None]:
# Year with the largest value in `df_male_youngadult` (first such row on ties).
max_pop_male_youngadult = (
    df_male_youngadult
    .filter(pl.col('predicted_population') == pl.max('predicted_population'))
    .row(0, named=True)['year']
)

In [None]:
max_pop_male_youngadult

In [None]:
# Year with the smallest value in `df_male_youngadult` (first such row on ties).
min_pop_male_youngadult = (
    df_male_youngadult
    .filter(pl.col('predicted_population') == pl.min('predicted_population'))
    .row(0, named=True)['year']
)

In [None]:
min_pop_male_youngadult

In [None]:
# Line chart of `df_male_youngadult` over the years, with dashed vertical
# lines at the years of its maximum and minimum.
fig, ax = plt.subplots(figsize=(15, 10))

ax.plot(
    df_male_youngadult['year'],
    df_male_youngadult['predicted_population'],
    marker='o'
)

ax.ticklabel_format(style='plain', axis='y')
# Show y ticks as integers with '.' as the thousands separator.
ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
)
ax.grid()
ax.tick_params(axis='x', rotation=90)
ax.set_xticks(years)
ax.set_ylabel('Predicted Population')
ax.set_xlabel('Year')
for marked_year in (max_pop_male_youngadult, min_pop_male_youngadult):
    ax.axvline(marked_year, color='red', linestyle='--')

plt.tight_layout()
plt.show()

# EXPLICA MARCAÇÕES

# ANALYSIS: 18 YEARS-OLD MALES

In [None]:
df.head()

In [None]:
# Predicted population of 18-year-old males per year, ordered by year.
df_male_18yo = (
    df
    .filter(
        pl.col('gender') == 'Male',
        pl.col('age') == 18
    )
    .group_by('year')
    .agg(pl.col('predicted_population').sum())
    .sort('year', descending=False)
)

# Persist the result as a separate statement: `write_parquet` returns None,
# so chaining it onto the assignment would bind `df_male_18yo` to None and
# break every later cell that uses the frame.
df_male_18yo.write_parquet('population_male_18yo.parquet')

In [None]:
df_male_18yo

In [None]:
# Year with the largest value in `df_male_18yo` (first such row on ties).
max_pop_male_18yo = (
    df_male_18yo
    .filter(pl.col('predicted_population') == pl.max('predicted_population'))
    .row(0, named=True)['year']
)

In [None]:
max_pop_male_18yo

In [None]:
# Year with the smallest value in `df_male_18yo` (first such row on ties).
min_pop_male_18yo = (
    df_male_18yo
    .filter(pl.col('predicted_population') == pl.min('predicted_population'))
    .row(0, named=True)['year']
)

In [None]:
min_pop_male_18yo

In [None]:
# Line chart of `df_male_18yo` over the years, with dashed vertical lines at
# the years of its maximum and minimum.
fig, ax = plt.subplots(figsize=(15, 10))

ax.plot(
    df_male_18yo['year'],
    df_male_18yo['predicted_population'],
    marker='o'
)

ax.ticklabel_format(style='plain', axis='y')
# Show y ticks as integers with '.' as the thousands separator.
ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
)
ax.grid()
ax.tick_params(axis='x', rotation=90)
ax.set_xticks(years)
ax.set_ylabel('Predicted Population')
ax.set_xlabel('Year')
for marked_year in (max_pop_male_18yo, min_pop_male_18yo):
    ax.axvline(marked_year, color='red', linestyle='--')

plt.tight_layout()
plt.show()

# EXPLICA MARCAÇÕES