In [242]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import pearsonr


%matplotlib inline

# Notebook-wide matplotlib defaults: 10x10 figures with enlarged axis labels,
# tick labels and titles. mpl.rcParams and plt.rcParams are the same object,
# so everything is set through a single update call.
mpl.rcParams.update({
    'axes.labelsize': 20,
    'axes.titlesize': 24,
    'figure.figsize': (10, 10),
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
})

In [243]:
from numpy import math

def fill_names(line, merged, not_merged):
    """Back-fill a missing ``candidate_name`` on one row of ``merged``.

    When ``line['candidate_name']`` is not a string (for example NaN left by
    an outer merge), the rows of ``not_merged`` that share the row's
    ``INE_ID`` are looked up. If exactly one such row exists, its
    ``candidate_name`` is written into ``merged`` in place, at the row whose
    index label is ``line.name``. Otherwise a diagnostic message is printed
    and ``merged`` is left untouched.

    Parameters
    ----------
    line : pandas.Series
        One row of ``merged``; ``line.name`` must be that row's index label.
    merged : pandas.DataFrame
        Frame that is updated in place.
    not_merged : pandas.DataFrame
        Frame supplying fallback names; must have ``INE_ID`` and
        ``candidate_name`` columns.

    Returns
    -------
    None
    """
    # Resolve the "any text" type at call time so the function runs on both
    # Python 2 (str + unicode) and Python 3 (str only).
    try:
        text_types = basestring
    except NameError:
        text_types = str

    # Rows that already carry a name need no work.
    if isinstance(line['candidate_name'], text_types):
        return

    # .loc / .at replace the deprecated .ix / set_value (removed in pandas 1.0).
    fallback_names = not_merged.loc[not_merged.INE_ID == line.INE_ID, 'candidate_name']
    n_found = len(fallback_names)

    if n_found == 1:
        merged.at[line.name, 'candidate_name'] = fallback_names.iloc[0]
    elif n_found > 1:
        # Ambiguous: several unmatched candidates in the same INE_ID.
        print('More than one citizens party in INE_ID ' + str(line.INE_ID))
    else:
        # Nothing to fall back on for this INE_ID.
        print('Zero with citizens party in INE_ID ' + str(line.INE_ID))

# 2013

In [244]:
# Read the pre-processed 2013 inputs: the election results file and the
# candidates file, both stored as UTF-8 CSV.
local_election_results_2013 = pd.read_csv(
    'data/processed_data/local_elections_2013_results.csv',
    encoding='utf-8',
)
local_election_candidates_2013 = pd.read_csv(
    'data/processed_data/local_elections_candidates_2013.csv',
    encoding='utf-8',
)

In [245]:
# Outer-join the 2013 results with the 2013 candidates. The join keys are
# INE_ID, the party label ('Sigla' in the results, 'party' in the candidates)
# and year. how='outer' keeps rows from either side that found no partner.
merged_df_2013 = pd.merge(
    local_election_results_2013, local_election_candidates_2013,
    left_on=['INE_ID', 'Sigla', 'year'], right_on=['INE_ID', 'party', 'year'],
    how='outer')
# Drop the candidates-side party column. `axis` is passed by keyword because
# the positional form (`drop('party', 1)`) was deprecated and later removed
# from pandas.
merged_df_2013 = merged_df_2013.drop('party', axis=1)

In [246]:
# Rows of the 2013 outer merge with no 'Concelho' value — presumably the
# candidate rows that matched no results row (TODO confirm 'Concelho' only
# comes from the results file).
not_merged_2013 = merged_df_2013[merged_df_2013['Concelho'].isnull()]

In [247]:
# Second pass over the 2013 merge: for every row, let fill_names try to
# recover a missing candidate name from the unmatched rows in not_merged_2013.
# Rows are fetched by position with .iloc; the previous .ix indexer is
# deprecated and was removed in pandas 1.0.
for i in range(len(merged_df_2013)):
    line = merged_df_2013.iloc[i]
    fill_names(line, merged_df_2013, not_merged_2013)

In [248]:
# Keep only the 2013 rows that have a 'Concelho' value, discarding the
# unmatched rows captured earlier in not_merged_2013.
# Boolean-mask selection uses .loc; .ix is deprecated and removed in pandas 1.0.
merged_df_2013 = merged_df_2013.loc[merged_df_2013.Concelho.notnull()]

# 2009

In [249]:
# Read the pre-processed 2009 inputs: the election results file and the
# candidates file, both stored as UTF-8 CSV.
local_election_results_2009 = pd.read_csv(
    'data/processed_data/local_elections_2009_results.csv',
    encoding='utf-8',
)
local_election_candidates_2009 = pd.read_csv(
    'data/processed_data/local_elections_candidates_2009.csv',
    encoding='utf-8',
)

In [250]:
# Outer-join the 2009 results with the 2009 candidates. The join keys are
# INE_ID, the party label ('Sigla' in the results, 'party' in the candidates)
# and year. how='outer' keeps rows from either side that found no partner.
merged_df_2009 = pd.merge(
    local_election_results_2009, local_election_candidates_2009,
    left_on=['INE_ID', 'Sigla', 'year'], right_on=['INE_ID', 'party', 'year'],
    how='outer')
# Drop the candidates-side party column. `axis` is passed by keyword because
# the positional form (`drop('party', 1)`) was deprecated and later removed
# from pandas.
merged_df_2009 = merged_df_2009.drop('party', axis=1)

In [251]:
# Rows of the 2009 outer merge with no 'Concelho' value — presumably the
# candidate rows that matched no results row (TODO confirm 'Concelho' only
# comes from the results file).
not_merged_2009 = merged_df_2009[merged_df_2009['Concelho'].isnull()]

In [252]:
# Second pass over the 2009 merge: for every row, let fill_names try to
# recover a missing candidate name from the unmatched rows in not_merged_2009.
# Rows are fetched by position with .iloc; the previous .ix indexer is
# deprecated and was removed in pandas 1.0.
for i in range(len(merged_df_2009)):
    line = merged_df_2009.iloc[i]
    fill_names(line, merged_df_2009, not_merged_2009)

In [253]:
# Keep only the 2009 rows that have a 'Concelho' value, discarding the
# unmatched rows captured earlier in not_merged_2009.
# Boolean-mask selection uses .loc; .ix is deprecated and removed in pandas 1.0.
merged_df_2009 = merged_df_2009.loc[merged_df_2009.Concelho.notnull()]

# Join data from all the years into just one data frame

In [254]:
# Stack the cleaned 2009 and 2013 frames row-wise (2009 first). Each frame
# keeps its own row labels, so labels may repeat in the result.
elections_df = pd.concat(
    objs=[merged_df_2009, merged_df_2013],
)

In [258]:
# gender_guesser's detector is the fallback classifier for first names.
import gender_guesser.detector as gender
# One shared detector instance; get_gender calls it for every name that is
# not covered by the manual override lists.
gender_detector = gender.Detector()

# Manual overrides checked before falling back to gender_guesser. They are
# built once at definition time instead of on every call. Spellings are kept
# exactly as they appear in the original lists (some look like data-entry
# variants, e.g. u'Agusto'), since they are matched verbatim.
_MALE_FIRST_NAMES = frozenset([
    u'João', u'Acílio', u'Litério', u'Hersílio', u'Agusto', u'Romão', u'Hélder', u'Dulcídio', u'Alano',
    u'Gerónimo', u'Isaltino', u'Avantino', u'Atílio', u'Vitor', u'Beraldino', u'Estevão', u'Hernani',
    u'Fábio', u'Juvenálio', u'Edegar', u'Bráulio', u'Vasques', u'Joviano',
    u'Honório', u'Albérico', u'Tomé', u'Gualter', u'Flamiano', u'Milcíades', u'Cílio', u'Parcidio',
    u'Herlander', u'Lélio', u'Alírio', u'Patrique', u'Dinarte', u'Dírio', u'Jesus', u'Orlindo',
])
_FEMALE_FIRST_NAMES = frozenset([
    u'Sílvia', u'Ercília', u'Urãnia', u'Zuraida', u'Fermelinda', u'Brizelinda', u'Léli', u'Nair',
])


def get_gender(name):
    """Return a gender label for a candidate's full name.

    Only the first space-separated token of ``name`` is considered. Tokens
    found in the manual override sets yield ``u'male'`` or ``u'female'``
    directly; every other token is passed to the module-level
    ``gender_detector`` and its answer is returned unchanged.

    Parameters
    ----------
    name : str or unicode
        Full candidate name; must support ``.split``.

    Returns
    -------
    The override label, or whatever ``gender_detector.get_gender`` returns.
    """
    first_name = name.split(' ')[0]

    if first_name in _MALE_FIRST_NAMES:
        return u'male'

    if first_name in _FEMALE_FIRST_NAMES:
        return u'female'

    return gender_detector.get_gender(first_name)


In [259]:
# Derive a 'gender' column by classifying every candidate's name with get_gender.
elections_df['gender'] = elections_df.candidate_name.apply(get_gender)

In [260]:
# Sanity check: list any candidates whose gender came back as u'unknown'.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
elections_df.loc[elections_df.gender == u'unknown']

Unnamed: 0,Abstention (%),Brancos,Concelho,INE_ID,Inscritos,Mandatos,Nulos,Sigla,Tipo,Votantes,Votos,Votos (%),candidate_name,year,gender


In [261]:
# Show the distinct gender labels present after classification.
elections_df['gender'].unique()

array([u'male', u'female'], dtype=object)