In [56]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import math


%matplotlib inline

# Notebook-wide matplotlib defaults: 10x10 inch figures, size-20 axis labels,
# size-24 axes titles and size-13 tick labels on both axes.
plt.rcParams.update({
    'axes.labelsize': 20,
    'figure.figsize': (10, 10),
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    'axes.titlesize': 24,
})

In [57]:
from numpy import math

try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3


def _fill_from_unmatched(line, merged, not_merged, column):
    """Back-fill ``column`` of one ``merged`` row from the unmatched rows.

    Looks in ``not_merged`` for rows that share ``line.INE_ID``.  When exactly
    one such row exists, its ``column`` value is written into ``merged`` at the
    row labelled ``line.name`` and that row is removed from ``not_merged``
    (in place).  Otherwise nothing is modified and a diagnostic is printed.

    Parameters
    ----------
    line : pandas.Series
        One row of ``merged``; ``line.name`` must be its index label.
    merged : pandas.DataFrame
        Frame that receives the value (mutated in place).
    not_merged : pandas.DataFrame
        Rows that found no partner in the merge (mutated in place).
    column : str
        Name of the column to copy.

    Raises
    ------
    AssertionError
        If removing the consumed row does not shrink ``not_merged`` by
        exactly one row (e.g. because of duplicated index labels).
    """
    same_ine_id = not_merged.INE_ID == line.INE_ID
    replacement = not_merged.loc[same_ine_id, column]

    if len(replacement) == 1:
        # .at is the label-based scalar setter; it replaces the removed
        # DataFrame.set_value.
        merged.at[line.name, column] = replacement.iloc[0]

        len_before = len(not_merged)
        not_merged.drop(not_merged.index[same_ine_id.values], inplace=True)
        len_after = len(not_merged)
        assert len_before - len_after == 1
    elif len(replacement) > 1:
        print('More than one citizens party in INE_ID ' + str(line.INE_ID))
    else:
        print('Zero with citizens party in INE_ID ' + str(line.INE_ID))


def fill_names(line, merged, not_merged):
    """Fill a missing ``candidate_name`` in ``merged`` from ``not_merged``.

    A name counts as missing when it is not a string (e.g. the NaN left by an
    outer merge).  Rows that already carry a string name are left untouched.
    Both frames are modified in place; see ``_fill_from_unmatched``.
    """
    if not isinstance(line['candidate_name'], _STRING_TYPES):
        _fill_from_unmatched(line, merged, not_merged, 'candidate_name')


def fill_ballot_order(line, merged, not_merged):
    """Fill a missing ``Order`` (ballot position) in ``merged`` from ``not_merged``.

    ``pd.isnull`` is used instead of ``math.isnan`` so that ``None`` is treated
    as missing and non-float values do not raise ``TypeError``.  Both frames
    are modified in place; see ``_fill_from_unmatched``.
    """
    if pd.isnull(line['Order']):
        _fill_from_unmatched(line, merged, not_merged, 'Order')

# 2013

In [87]:
# 2013 inputs: election results, candidate lists and the ballot-order draw.
# Note that the ballot-order file is read without an explicit encoding.
local_election_results_2013 = pd.read_csv(
    'data/processed_data/local_elections_2013_results.csv',
    encoding='utf-8',
)
local_election_candidates_2013 = pd.read_csv(
    'data/processed_data/local_elections_candidates_2013.csv',
    encoding='utf-8',
)
local_elections_order_2013 = pd.read_csv(
    'data/processed_data/ballot-order-local-elections-2013.csv',
)

In [88]:
# Outer-join results with candidates on municipality id, party and year so
# that rows without a partner on either side are kept for manual repair.
merged_results_candidates_2013 = pd.merge(
    local_election_results_2013, local_election_candidates_2013,
    left_on=['INE_ID', 'Sigla', 'year'], right_on=['INE_ID', 'party', 'year'],
    how='outer')
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
merged_results_candidates_2013 = merged_results_candidates_2013.drop('party', axis=1)

len_everything_together = len(merged_results_candidates_2013)

# Rows with a null 'Concelho' are treated as candidate rows that found no
# result row (presumably 'Concelho' only comes from the results file).
# .copy() makes both pieces independent frames, because the following cell
# mutates them in place.
not_merged_results_candidates_2013 = merged_results_candidates_2013[
    merged_results_candidates_2013.Concelho.isnull()].copy()
merged_results_candidates_2013 = merged_results_candidates_2013[
    merged_results_candidates_2013.Concelho.notnull()].copy()

# Sanity check: the split must not lose or duplicate rows.
assert len_everything_together == (len(not_merged_results_candidates_2013) + len(merged_results_candidates_2013))

In [89]:
# Some hacking to merge the rest of the entries: every result row without a
# candidate name takes it from the single unmatched candidate row of the same
# municipality (see fill_names).
# Rows are visited by position (.iloc); the removed .ix accessor looked them up
# by label and only worked while the index happened to be 0..n-1.
for i in range(len(merged_results_candidates_2013)):
    line = merged_results_candidates_2013.iloc[i]
    fill_names(line, merged_results_candidates_2013, not_merged_results_candidates_2013)

# Every unmatched candidate row must have been consumed.
assert len(not_merged_results_candidates_2013) == 0

In [90]:
# Outer-join the results+candidates table with the ballot-order draw on
# municipality id and party.
merged_2013 = pd.merge(
    merged_results_candidates_2013, local_elections_order_2013,
    left_on=['INE_ID', 'Sigla'], right_on=['INE_ID', 'Party'],
    how='outer'
)
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
merged_2013 = merged_2013.drop('Party', axis=1)

len_everything_together = len(merged_2013)

# Rows with a null 'Concelho' are ballot-order rows that found no result row.
# .copy() makes both pieces independent frames, because the following cells
# mutate them in place.
not_merged_2013 = merged_2013[merged_2013.Concelho.isnull()].copy()
merged_2013 = merged_2013[merged_2013.Concelho.notnull()].copy()

# Sanity check: the split must not lose or duplicate rows.
assert len_everything_together == (len(not_merged_2013) + len(merged_2013))

In [91]:
# Some hacking to merge the missing ballot order numbers: every row without an
# 'Order' takes it from the single unmatched ballot-order row of the same
# municipality (see fill_ballot_order).
# Rows are visited by position (.iloc); the removed .ix accessor looked them up
# by label and only worked while the index happened to be 0..n-1.
for i in range(len(merged_2013)):
    line = merged_2013.iloc[i]
    fill_ballot_order(line, merged_2013, not_merged_2013)

# Left disabled on purpose: some ballot-order rows have no counterpart in the
# results, as described in the e-mail to the CNE in the next markdown cell.
# assert len(not_merged_2013) == 0

## There is a problem with the data. Here is the e-mail I sent to the CNE (Comissão Nacional de Eleições, the National Elections Commission) in the hope of resolving it

Estou a analisar dados das eleições autárquicas de 2013 e deparei-me com algo estranho. Estou-me a referir apenas a dados referentes a eleições para a Câmara Municipal.

No documento do sorteio das candidaturas (disponível aqui) estão presentes alguns partidos que depois não aparecem nos resultados das eleições (disponíveis aqui).

Por exemplo, segundo o documento do sorteio, para Ferreira do Zêzere, o partido B.E. supostamente estaria em primeiro lugar no boletim de voto. Mas nos resultados, este partido nem sequer aparece listado. 

Este problema acontece com os seguintes locais e partidos:

[TABLE]

A primeira coluna pode ser ignorada. INE_ID é o número de identificação dado ao local pelo Instituto Nacional de Estatística.

Porque é que isto acontece?

In [92]:
# Mark the locations where the problem occurs: a municipality is flagged when
# at least one of its ballot-order rows is still in not_merged_2013.
merged_2013['problems_ballot_order'] = merged_2013['INE_ID'].isin(not_merged_2013['INE_ID'])

# 2009

In [93]:
# 2009 inputs: election results and candidate lists.
local_election_results_2009 = pd.read_csv(
    'data/processed_data/local_elections_2009_results.csv',
    encoding='utf-8',
)
local_election_candidates_2009 = pd.read_csv(
    'data/processed_data/local_elections_candidates_2009.csv',
    encoding='utf-8',
)

In [94]:
# Outer-join results with candidates on municipality id, party and year so
# that rows without a partner on either side are kept for manual repair.
merged_results_candidates_2009 = pd.merge(
    local_election_results_2009, local_election_candidates_2009,
    left_on=['INE_ID', 'Sigla', 'year'], right_on=['INE_ID', 'party', 'year'],
    how='outer')
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
merged_results_candidates_2009 = merged_results_candidates_2009.drop('party', axis=1)

len_everything_together = len(merged_results_candidates_2009)

# Rows with a null 'Concelho' are treated as candidate rows that found no
# result row (presumably 'Concelho' only comes from the results file).
# .copy() makes both pieces independent frames, because the following cell
# mutates them in place.
not_merged_results_candidates_2009 = merged_results_candidates_2009[
    merged_results_candidates_2009.Concelho.isnull()].copy()
merged_results_candidates_2009 = merged_results_candidates_2009[
    merged_results_candidates_2009.Concelho.notnull()].copy()

# Sanity check: the split must not lose or duplicate rows.
assert len_everything_together == (len(not_merged_results_candidates_2009) + len(merged_results_candidates_2009))

In [95]:
# Some hacking to merge the rest of the entries: every result row without a
# candidate name takes it from the single unmatched candidate row of the same
# municipality (see fill_names).
# Rows are visited by position (.iloc); the removed .ix accessor looked them up
# by label and only worked while the index happened to be 0..n-1.
for i in range(len(merged_results_candidates_2009)):
    line = merged_results_candidates_2009.iloc[i]
    fill_names(line, merged_results_candidates_2009, not_merged_results_candidates_2009)

# Every unmatched candidate row must have been consumed.
assert len(not_merged_results_candidates_2009) == 0

In [96]:
# No ballot-order file is loaded for 2009 in this notebook, so the
# results+candidates table is used as the final 2009 table. This is an alias,
# not a copy: both names refer to the same DataFrame.
merged_2009 = merged_results_candidates_2009

# Join data from all the years into just one data frame

In [131]:
# Stack the 2009 and 2013 tables row-wise. Columns present in only one of the
# frames (presumably 'Order' and 'problems_ballot_order', which are only added
# in the 2013 cells) are filled with NaN for the other year. The original row
# labels of both frames are kept, so index labels may repeat.
elections_df = pd.concat([merged_2009, merged_2013])

In [132]:
# gender_guesser supplies the fallback classifier that get_gender (below) uses
# for first names that are not in its hand-written override lists.
import gender_guesser.detector as gender
gender_detector = gender.Detector()

# First names classified by hand; these take precedence over gender_detector.
# Built once at import time instead of on every call.
_MALE_FIRST_NAMES = frozenset([
    u'João', u'Acílio', u'Litério', u'Hersílio', u'Agusto', u'Romão', u'Hélder', u'Dulcídio', u'Alano',
    u'Gerónimo', u'Isaltino', u'Avantino', u'Atílio', u'Vitor', u'Beraldino', u'Estevão', u'Hernani',
    u'Fábio', u'Juvenálio', u'Edegar', u'Bráulio', u'Vasques', u'Joviano',
    u'Honório', u'Albérico', u'Tomé', u'Gualter', u'Flamiano', u'Milcíades', u'Cílio', u'Parcidio',
    u'Herlander', u'Lélio', u'Alírio', u'Patrique', u'Dinarte', u'Dírio', u'Jesus', u'Orlindo',
])
_FEMALE_FIRST_NAMES = frozenset([
    u'Sílvia', u'Ercília', u'Urãnia', u'Zuraida', u'Fermelinda', u'Brizelinda', u'Léli', u'Nair',
])


def get_gender(name):
    """Return the gender label for a candidate's full name.

    Only the first space-separated token of ``name`` is considered.  Names in
    the hand-written override sets yield ``u'male'`` / ``u'female'``; any other
    name is passed to the module-level ``gender_detector`` and its answer is
    returned unchanged.

    The original list was missing a comma between ``u'Alano'`` and
    ``u'Gerónimo'``, which silently produced the single bogus entry
    ``u'AlanoGerónimo'``; both names are now separate entries.

    Parameters
    ----------
    name : str or unicode
        Full candidate name.  Must be a string (``.split`` is called on it).

    Returns
    -------
    The gender label: ``u'male'``, ``u'female'`` or whatever
    ``gender_detector.get_gender`` returns for the first name.
    """
    first_name = name.split(' ')[0]

    if first_name in _MALE_FIRST_NAMES:
        return u'male'

    if first_name in _FEMALE_FIRST_NAMES:
        return u'female'

    return gender_detector.get_gender(first_name)


In [133]:
# Derive a gender label for every row from the candidate's full name
# (get_gender only looks at the first name).
elections_df['gender'] = elections_df['candidate_name'].apply(get_gender)

In [134]:
# Show the candidates whose gender could not be determined; an empty result
# means every name was classified. (.loc replaces the removed .ix accessor.)
elections_df.loc[elections_df.gender == u'unknown']

Unnamed: 0,Abstention (%),Brancos,Concelho,INE_ID,Inscritos,Mandatos,Nulos,Order,Sigla,Tipo,Votantes,Votos,Votos (%),candidate_name,problems_ballot_order,year,gender


In [135]:
# List the distinct gender labels that were assigned.
elections_df.gender.unique()

array([u'male', u'female'], dtype=object)

In [136]:
# Index the table by municipality id (INE_ID) and give the columns English
# snake_case names.
elections_df = (
    elections_df
    .set_index('INE_ID')
    .rename(columns={
        'Abstention (%)': 'abstention_%',
        'Brancos': 'blank_votes',
        'Concelho': 'concelho',
        'Inscritos': 'enrolled',
        'Mandatos': 'number_mandates',
        'Nulos': 'null_votes',
        'Order': 'position_ballot',
        'Sigla': 'party_initials',
        'Tipo': 'party_type',
        'Votantes': 'voters',
        'Votos': 'votes',
        'Votos (%)': 'votes_%',
    })
)

In [137]:
# Final column layout of the exported table.
new_columns_order = [
    'year', 'concelho', 'enrolled', 'voters', 'abstention_%',
    'blank_votes', 'null_votes',
    'party_initials', 'party_type', 'votes', 'votes_%', 'number_mandates',
    'candidate_name', 'gender',
    'position_ballot', 'problems_ballot_order',
]
# Guard against silently dropping a column: the layout must list as many
# columns as the frame currently has.
assert len(elections_df.columns) == len(new_columns_order)
elections_df = elections_df[new_columns_order]

In [138]:
# Some columns that should contain integers contain floats. Correct that.
# Each column is cast exactly once (null_votes used to be listed twice).
for integer_column in ['year', 'enrolled', 'voters', 'votes',
                       'number_mandates', 'blank_votes', 'null_votes']:
    elections_df[integer_column] = elections_df[integer_column].astype(int)

In [144]:
# Complete votes_% column: overwrite it for every row with the party's share
# of the ballots that were neither blank nor null, i.e.
# votes / (voters - blank_votes - null_votes), expressed as a percentage.
elections_df['votes_%'] = (elections_df.votes / (elections_df.voters - elections_df.blank_votes - elections_df.null_votes)) * 100

In [147]:
# Persist the combined table as UTF-8 CSV; the INE_ID index set earlier is
# written as the first column.
elections_df.to_csv('data/processed_data/elections_camaras_municipais_portugal.csv', encoding='utf-8')