# persons

## Import and cleanup

Let's start with importing important modules and loading the `persons` table.

In [12]:
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from math import nan
import re

# load the persons table from its CSV export
persons = pd.read_csv("presidents-of-chile_persons.csv")

# preview the first three and last three rows, with a blank gap in between
print('---------- Quick view of the persons dataset ----------')
for preview in (persons.head(3), '\n', persons.tail(3)):
    print(preview)


---------- Quick view of the persons dataset ----------
  person_id                                full_name date_of_birth  \
0     P0001   Manuel José Blanco y Calvo de Encalada    1790-04-21   
1     P0002  Agustín Manuel de Eyzaguirre Arechavala    1768-05-03   
2     P0003  Ramón Saturnino Andrés Freire y Serrano    1787-11-29   

  date_of_death place_of_birth        region_of_birth  is_woman  
0    1876-09-05   Buenos Aires              Argentina         0  
1    1837-07-19       Santiago  Santiago Metropolitan         0  
2    1851-12-09       Santiago  Santiago Metropolitan         0  


   person_id                               full_name date_of_birth  \
53     P0054        Verónica Michelle Bachelet Jeria    1951-09-29   
54     P0055  Miguel Juan Sebastián Piñera Echenique    1949-12-01   
55     P0056                      Gabriel Boric Font    1986-02-11   

   date_of_death place_of_birth                    region_of_birth  is_woman  
53           NaN       Santiago      

In [13]:
# quick view of what columns we have
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
persons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   person_id        56 non-null     object
 1   full_name        56 non-null     object
 2   date_of_birth    56 non-null     object
 3   date_of_death    51 non-null     object
 4   place_of_birth   56 non-null     object
 5   region_of_birth  56 non-null     object
 6   is_woman         56 non-null     int64 
dtypes: int64(1), object(6)
memory usage: 3.2+ KB
None


We need to format date columns as dates, but before that we have to clean up some "non-dates".

A note about how I collected the data here:
* When the year is known but not the month and day, I put YYYY-00-00 (for example, 1790-00-00). I can't use YYYY-01-01 as the placeholder because one of the persons here really was born on 01-01.
* When the month and day are known but not the year, I put 0000-MM-DD (for example, 0000-04-28).

There are only a few persons here with unknown exact dates of birth or death, so we can live without them. Time to remove these non-dates without touching the CSV itself...

In [14]:
# find and clean up wrongly formatted dates by making them null
# Incomplete dates use "00-00" for an unknown month/day and "0000" for an unknown year;
# this pattern matches both placeholders.
placeholder_pattern = '00-?00'

# na=False: rows with no date at all (e.g. no date_of_death) are not "incomplete"
incomplete_birth = persons['date_of_birth'].str.contains(placeholder_pattern, regex=True, na=False)
incomplete_death = persons['date_of_death'].str.contains(placeholder_pattern, regex=True, na=False)

print('---------- These persons have incomplete dates of birth and death ----------')
print(persons[incomplete_birth])
print(persons[incomplete_death])
print("\n")

# Blank out the placeholders using the masks instead of hard-coded row labels,
# so the cleanup stays correct if rows are added to or reordered in the CSV.
persons.loc[incomplete_birth, 'date_of_birth'] = None
persons.loc[incomplete_death, 'date_of_death'] = None

# convert dates to date format; missing values become NaT, anything else
# that does not match YYYY-MM-DD still raises so new bad data is noticed
for date_column in ('date_of_birth', 'date_of_death'):
    persons[date_column] = pd.to_datetime(persons[date_column], format='%Y-%m-%d')

# quick view of what the table contains
print('---------- Quick view of the persons dataset ----------')
print(persons.head(3))
print("\n")
print(persons.tail(3))
print("\n")
# info() prints its own report and returns None, so it is not wrapped in print()
persons.info()


---------- These persons have incomplete dates of birth and death ----------
   person_id                                          full_name date_of_birth  \
6      P0007  Francisco Antonio Pascual de la Ascensión Ruiz...    1790-00-00   
22     P0023                             Elías Fernández Albano    1845-00-00   
35     P0036                                 Arturo Puga Osorio    1879-00-00   

   date_of_death place_of_birth        region_of_birth  is_woman  
6     1860-03-23       Santiago  Santiago Metropolitan         0  
22    1910-09-06       Santiago  Santiago Metropolitan         0  
35    0000-04-28       Santiago  Santiago Metropolitan         0  
   person_id           full_name date_of_birth date_of_death place_of_birth  \
35     P0036  Arturo Puga Osorio    1879-00-00    0000-04-28       Santiago   

          region_of_birth  is_woman  
35  Santiago Metropolitan         0  


---------- Quick view of the persons dataset ----------
  person_id                          

## Age and proper names

The first thing we can add is each person's age: this is the current age if alive, otherwise the age upon death.

In [15]:
# get today's date as a midnight Timestamp so it compares cleanly with the date-only columns
date_today = pd.to_datetime(date.today())

# alive people have their date_of_death set to null. to properly calculate their ages
# we need to fill date_of_death with the current date.
# NOTE(review): a date_of_death that is null because it is unknown (e.g. blanked out as an
# incomplete date during cleanup) is filled with today as well, so that person is also
# flagged as alive below -- confirm and handle such rows explicitly.
persons['date_of_death'] = persons['date_of_death'].fillna(date_today)

# create new column to quickly specify if the person is alive (1) or not (0)
persons['is_alive'] = persons['date_of_death'].eq(date_today).astype('int64')

# length of one year as a timedelta; dividing a time span by it yields a float number of years.
# Defined at the top level of the cell (not inside a loop branch) so it always exists afterwards.
days_in_a_yr = timedelta(days=365.25)

# create age column, in years.
# The subtraction is vectorised, so 'age' ends up as a float64 column; rows with a missing
# date of birth or death propagate NaT through the subtraction and come out as NaN.
# Assigning with persons['age'] = ... (rather than persons.loc[:, 'age'] = ...) creates the
# column directly and avoids the pandas warning about in-place setting via .loc/.iloc.
lifespan = (persons['date_of_death'] - persons['date_of_birth']).abs()
persons['age'] = lifespan / days_in_a_yr

# print only the important columns
persons = persons.loc[:, ['person_id', 'full_name', 'date_of_birth', 'date_of_death',
                          'place_of_birth', 'region_of_birth', 'is_woman', 'is_alive', 'age']]
print('---------- Quick view of the persons dataset ----------')
print(persons.tail(10))


---------- Quick view of the persons dataset ----------
   person_id                                      full_name date_of_birth  \
46     P0047             Jorge Eduardo Alessandri Rodríguez    1896-05-19   
47     P0048                  Eduardo Nicanor Frei Montalva    1911-01-16   
48     P0049             Salvador Guillermo Allende Gossens    1908-06-26   
49     P0050             Augusto José Ramón Pinochet Ugarte    1915-11-25   
50     P0051                  Miguel Patricio Aylwin Azócar    1918-11-26   
51     P0052  Eduardo Alfredo Juan Bernardo Frei Ruiz-Tagle    1942-06-24   
52     P0053                  Ricardo Froilán Lagos Escobar    1938-03-02   
53     P0054               Verónica Michelle Bachelet Jeria    1951-09-29   
54     P0055         Miguel Juan Sebastián Piñera Echenique    1949-12-01   
55     P0056                             Gabriel Boric Font    1986-02-11   

   date_of_death place_of_birth                    region_of_birth  is_woman  \
46    1986-08-31

Chilean people have two surnames, and like all other people they have preferred given names. The surnames and preferred given names are in the `tenures` table, so we have to join this with `persons`.

In [18]:
# import tenures from CSV
tenures = pd.read_csv("presidents-of-chile_tenures.csv")

# convert dates to date format
for date_column in ('date_tenure_start', 'date_tenure_end'):
    tenures[date_column] = pd.to_datetime(tenures[date_column], format='%Y-%m-%d')

# columns that persons takes over from tenures
tenure_columns = ['first_surname', 'second_surname', 'given_name',
                  'date_tenure_start', 'date_tenure_end']

# A person can appear in tenures more than once. Keep exactly one row per person: the tenure
# with the earliest start date. Sorting first makes this independent of the CSV row order;
# the stable sort keeps the file order among rows with equal start dates.
first_tenures = (
    tenures
    .sort_values('date_tenure_start', kind='stable')
    .drop_duplicates(subset='person_id', keep='first')
)

# join on person_id
# merge() matches on the 'person_id' column directly, so no index needs to be set beforehand.
# Dropping any tenure columns already present makes the cell safe to re-run: otherwise a
# second run would produce *_x / *_y columns and the selection below would fail.
persons = (
    persons
    .drop(columns=tenure_columns, errors='ignore')
    .merge(first_tenures[['person_id'] + tenure_columns], on='person_id', how='inner')
)
persons = persons.loc[:, ['person_id', 'first_surname', 'second_surname', 'given_name', 'full_name',
                        'date_of_birth', 'date_of_death', 'place_of_birth', 'region_of_birth',
                        'is_woman', 'is_alive', 'age', 'date_tenure_start', 'date_tenure_end']]

# quick view of what the table contains
print('---------- Quick view of the persons dataset ----------')
print(persons.tail(5))
print('\n')
# info() prints its own report and returns None, so it is not wrapped in print()
persons.info()


---------- Quick view of the persons dataset ----------
   person_id first_surname second_surname given_name  \
51     P0052          Frei     Ruiz-Tagle    Eduardo   
52     P0053         Lagos        Escobar    Ricardo   
53     P0054      Bachelet          Jeria   Michelle   
54     P0055        Piñera      Echenique  Sebastián   
55     P0056         Boric           Font    Gabriel   

                                        full_name date_of_birth date_of_death  \
51  Eduardo Alfredo Juan Bernardo Frei Ruiz-Tagle    1942-06-24    2023-02-24   
52                  Ricardo Froilán Lagos Escobar    1938-03-02    2023-02-24   
53               Verónica Michelle Bachelet Jeria    1951-09-29    2023-02-24   
54         Miguel Juan Sebastián Piñera Echenique    1949-12-01    2023-02-24   
55                             Gabriel Boric Font    1986-02-11    2023-02-24   

   place_of_birth                    region_of_birth  is_woman  is_alive  \
51       Santiago              Santiago Metr

## Age upon taking office

At what age did each president take office? Everyone agrees that Gabriel Boric is the youngest president to take office, but let's produce the data to back it up.

In [22]:
# create age_entered_office: time between birth and the start of the tenure, in years.
# The year length is defined here so this cell does not rely on a variable that happens to
# be left over from another cell. The division is vectorised, which gives a float64 column;
# rows with a missing date of birth or tenure start come out as NaN.
one_year = pd.Timedelta(days=365.25)
time_until_office = (persons['date_tenure_start'] - persons['date_of_birth']).abs()
persons['age_entered_office'] = time_until_office / one_year

# make sure there is only one row per person (keeps the first row found for each person_id)
persons = persons.drop_duplicates(subset='person_id', keep='first', ignore_index=True)

# sort by youngest to oldest; persons with an unknown age go to the bottom
persons = persons.sort_values('age_entered_office', ascending=True, na_position='last')

# quick view of what the table contains
print('---------- 10 youngest persons to become President of Chile ----------')
print(persons.head(10))


---------- 10 youngest persons to become President of Chile ----------
   person_id first_surname second_surname         given_name  \
52     P0056         Boric           Font            Gabriel   
51     P0001        Blanco       Encalada             Manuel   
50     P0003        Freire        Serrano              Ramón   
49     P0010        Bulnes         Prieto             Manuel   
48     P0004         Pinto           Díaz  Francisco Antonio   
47     P0006        Ovalle      Bezanilla  José Tomás Ovalle   
46     P0011         Montt         Torres             Manuel   
45     P0024      Figueroa        Larraín           Emiliano   
44     P0037        Dávila       Espinoza             Carlos   
43     P0009        Prieto           Vial            Joaquín   

                                       full_name date_of_birth date_of_death  \
52                            Gabriel Boric Font    1986-02-11    2023-02-24   
51        Manuel José Blanco y Calvo de Encalada    1790-04-21  

In absolute terms, Gabriel Boric is _the_ youngest president who took office. However, if we consider Manuel Blanco Encalada as the very first person who held the title of "President of Chile" (even if on an interim basis), then these two actually share the honors of having taken office at 36 years old (or rather, young).

Let's now find out who became president later in life.

In [21]:
# order from oldest to youngest at the start of the tenure; unknown ages go to the bottom
persons = persons.sort_values(by='age_entered_office', ascending=False, na_position='last')

print('---------- 10 oldest persons to become President of Chile ----------')
print(persons.head(10))


---------- 10 oldest persons to become President of Chile ----------
   person_id first_surname second_surname    given_name  \
24     P0025        Barros           Luco         Ramón   
50     P0051        Aylwin         Azócar      Patricio   
16     P0017     Baquedano       González        Manuel   
30     P0031        Barros        Borgoño          Luis   
46     P0047    Alessandri      Rodríguez         Jorge   
48     P0049       Allende        Gossens      Salvador   
52     P0053         Lagos        Escobar       Ricardo   
44     P0045     Iribarren        Cabezas  Juan Antonio   
11     P0012         Pérez      Mascayano  José Joaquín   
54     P0055        Piñera      Echenique     Sebastián   

                                 full_name date_of_birth date_of_death  \
24                  José Ramón Barros Luco    1835-06-09    1919-09-20   
50           Miguel Patricio Aylwin Azócar    1918-11-26    2016-04-19   
16         Manuel Jesús Baquedano González    1823-01-01   

Ah yes, Ramón Barros Luco, the president named after a sandwich...

Hey @Salvador Allende, becoming president at 62 is what happens when you have to run a grand total of four times. If you had won the presidency the first time, you would have been...

In [23]:
# Allende's age at each of his four presidential candidacies, in order:
# first (1952), second (1958), third (1964) and fourth (1970)
allende_birthday = date(1908, 6, 26)
for candidacy_year in (1952, 1958, 1964, 1970):
    print(relativedelta(date(candidacy_year, 9, 4), allende_birthday))


relativedelta(years=+44, months=+2, days=+9)
relativedelta(years=+50, months=+2, days=+9)
relativedelta(years=+56, months=+2, days=+9)
relativedelta(years=+62, months=+2, days=+9)


44 years young, enough to make it to the 10 youngest presidents.

Anyway, finding out whether Allende ran in more elections than anyone else before becoming president would require another round of data collection and exploration.