In [None]:
import polars as pl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# ETL

In [None]:
# Read the projections spreadsheet (ODS format) into a polars DataFrame.
ods_path = 'projecoes_2024_tab1_idade_simples.ods'
df = pl.read_ods(ods_path)

In [None]:
# Preview the first rows of the freshly loaded table to check its column layout.
df.head()

In [None]:
# Report every column that contains at least one null value.
# `null_count()` yields a single-row frame holding the per-column null counts;
# reading it as a dict keeps the original column order.
null_counts = df.null_count().row(0, named=True)
for col, n_nulls in null_counts.items():
    if n_nulls > 0:
        print(f'{col} has null values.')

In [None]:
# Reshape the wide projection table into a tidy long format, for Brazil only:
# one row per (age, gender, year) holding the predicted population.
#
# Notes:
# - No explicit `drop` is needed: `unpivot` keeps only the `index` and `on`
#   columns, so the code / abbreviation / location columns are discarded by the
#   reshape itself. This also avoids spelling the accented code-column name,
#   which breaks as soon as the notebook's encoding gets mangled.
# - The population is stored as Float64. Float32 represents integers exactly
#   only up to 2**24 (~16.7 million), which the country-level totals summed
#   later are likely to exceed.
# - This cell overwrites `df`; the 'LOCAL' column no longer exists afterwards,
#   so re-run the load cell before re-running this one.
year_columns = [str(year) for year in range(2000, 2071)]

df = (
    df
    .filter(pl.col('LOCAL') == 'Brasil')
    .rename({'IDADE': 'age', 'SEXO': 'gender'})
    .unpivot(
        index=['age', 'gender'],
        on=year_columns,
        variable_name='year',
        value_name='predicted_population',
    )
    .with_columns(
        pl.col('age').cast(pl.Int8),
        # Translate the Portuguese labels first, then store them as categories.
        pl.col('gender')
        .replace({'Ambos': 'Both', 'Homens': 'Male', 'Mulheres': 'Female'})
        .cast(pl.Categorical),
        pl.col('year').cast(pl.Int16),
        pl.col('predicted_population').cast(pl.Float64),
    )
)

In [None]:
# Inspect the reshaped frame (columns: age, gender, year, predicted_population).
df.head()

# ANALYSIS PER GENDER AND YEAR

In [None]:
# Total predicted population for every (gender, year) pair, summed over all
# ages and ordered chronologically. Despite its name, this frame is keyed by
# gender and year, not by age.
df_age_year = (
    df
    .group_by('gender', 'year')
    .agg(pl.sum('predicted_population'))
    .sort('year')
)

In [None]:
# Preview the per-gender, per-year population totals.
df_age_year.head()

In [None]:
# Ascending list of the distinct years in the aggregated frame
# (used further down as the x-axis tick positions).
years = df_age_year.get_column('year').unique().sort().to_list()

In [None]:
# Year in which the combined ('Both') predicted population is highest:
# order the 'Both' rows from largest to smallest total and read the year of
# the first one.
year_max_population_both = (
    df_age_year
    .filter(pl.col('gender') == 'Both')
    .sort('predicted_population', descending=True)
    .item(0, 'year')
)

In [None]:
# Display the year in which the 'Both' total is highest.
year_max_population_both

In [None]:
# Year in which the predicted male population is highest:
# order the 'Male' rows from largest to smallest total and read the year of
# the first one.
year_max_population_male = (
    df_age_year
    .filter(pl.col('gender') == 'Male')
    .sort('predicted_population', descending=True)
    .item(0, 'year')
)

In [None]:
# Display the year in which the 'Male' total is highest.
year_max_population_male

In [None]:
# Year in which the predicted female population is highest:
# order the 'Female' rows from largest to smallest total and read the year of
# the first one.
year_max_population_female = (
    df_age_year
    .filter(pl.col('gender') == 'Female')
    .sort('predicted_population', descending=True)
    .item(0, 'year')
)

In [None]:
# Display the year in which the 'Female' total is highest.
year_max_population_female

In [None]:
# Line chart of the predicted population per year, one line per gender, with
# vertical red lines at the peak years of the 'Both' and 'Female' series.
fig, ax = plt.subplots(figsize=(15, 10))

# Fixed gender/marker pairing. Iterating `group_by` yields the groups in no
# guaranteed order and with tuple keys, which made the marker assignment
# unstable and put tuple text such as "('Both',)" in the legend.
genders = ['Both', 'Male', 'Female']
markers = ['o', '>', 'd']

for gender, marker in zip(genders, markers):
    # `df_age_year` is already sorted by year, and `filter` keeps that order.
    gender_totals = df_age_year.filter(pl.col('gender') == gender)
    ax.plot(
        gender_totals.get_column('year').to_list(),
        gender_totals.get_column('predicted_population').to_list(),
        label=gender,
        marker=marker,
    )

# Peak-year markers, labelled so the legend explains what the red lines mean.
ax.axvline(
    year_max_population_both,
    linestyle='--',
    color='red',
    label=f'Peak (Both): {year_max_population_both}',
)
ax.axvline(
    year_max_population_female,
    linestyle='-',
    color='red',
    label=f'Peak (Female): {year_max_population_female}',
)

# Show full integers on the y axis with '.' as the thousands separator.
ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda value, _pos: format(int(value), ',').replace(',', '.'))
)
ax.grid()
ax.set_xticks(years)
ax.tick_params('x', rotation=90)
ax.set_title('Predicted Population by Gender and Year')
ax.set_xlabel('Year')
ax.set_ylabel('Predicted Population')
# Build the legend last so it also picks up the peak-year lines.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()
#fig.savefig('predict_population_gender_year.png', dpi=300, bbox_inches='tight')
plt.show()

# Predicted population peaks in 2041 for Both and Male
# Predicted population peaks in 2042 for Female

# ANALYSIS PER AGE GROUP AND YEAR

In [None]:
# Preview the frame again at the start of this section.
df.head()

# ANALYSIS PER GENDER, AGE GROUP AND YEAR