In [1]:
#Importing packages

import numpy as np
import pandas as pd
import requests
import glob
import openpyxl
import os


In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
import helperfunctions as hf 

Creating relative paths

In [4]:
#creating relative paths
votes = os.path.join("VoteWatch-EP-voting-data_2004-2022", "EP6_RCVs_2022_06_13.xlsx")
votations = os.path.join("VoteWatch-EP-voting-data_2004-2022","EP6_Voted docs.xlsx")


Importing data from EU API (for correct names)

In [5]:

url = 'https://data.europarl.europa.eu/api/v2/meps?parliamentary-term=6&format=application%2Fld%2Bjson&offset=0'
response = requests.get(url)
meps_data = response.json()

Creating DataFrames (runtime is quite long, .xlsx files)

In [6]:

meps_df_6 = pd.json_normalize(meps_data['data'])
EP_6_votes = pd.read_excel(votes)
EP_6_votations = pd.read_excel(votations)

Inspecting the data

In [7]:
meps_df_6.head()

Unnamed: 0,id,type,identifier,label,familyName,givenName,sortLabel,officialFamilyName,officialGivenName
0,person/2278,Person,2278,Paul RÜBIG,Rübig,Paul,RUBIG,,
1,person/28196,Person,28196,Irena BELOHORSKÁ,Belohorská,Irena,BELOHORSKA,,
2,person/4436,Person,4436,Gianni PITTELLA,Pittella,Gianni,PITTELLA,,
3,person/39713,Person,39713,Renate WEBER,Weber,Renate,WEBERRENATE,,
4,person/34482,Person,34482,Alexandru-Ioan MORȚUN,Morțun,Alexandru-Ioan,MORTUN,,


Dropping irrelevant columns (to discuss)

In [8]:
meps_df_6.drop(columns=['id','type','sortLabel','officialFamilyName','officialGivenName'],inplace=True)

Renaming variables for pd.merge and correcting datatype

In [9]:
meps_df_6.rename(columns={'identifier': 'MepId'},inplace=True)
EP_6_votes.rename(columns={'WebisteEpID': 'MepId'},inplace=True)
meps_df_6.MepId = meps_df_6.MepId.astype(int)

Merging 

In [10]:
EP_6_votes_fixed_names = pd.merge(EP_6_votes,meps_df_6 , on='MepId', how='left')

Updating names of MPs

In [11]:
EP_6_votes_fixed_names[['Fname','Lname','FullName']] = EP_6_votes_fixed_names[['givenName','familyName','label']]
EP_6_votes_fixed_names.drop(columns=['label','familyName','givenName'],inplace=True)

Encoding missing party names

In [12]:
EP_6_votes_fixed_names.Party = EP_6_votes_fixed_names.Party.replace({".":np.NAN,"--":np.NAN})

Votations dataset inspection

In [13]:
EP_6_votations.head()

Unnamed: 0,euro_act_id,date,title,procedure,reading,type_of_vote,type_of_vote_en,rule,raporteur,url,...,final_vote,author_name,am_no,result_code,yes,no,abstain,did_not_vote,absent,absent_motivated
0,1,2004-09-15 00:00:00,EU-Mauritius fishing agreement,CNS *,Leg,Amended proposal for a Council regulation,Amended proposal for a Council regulation,s,Morillon Philippe,http://www.europarl.europa.eu/sides/getDoc.do?...,...,0,0,,+,405,63.0,73.0,0,0,0
1,2,2004-09-15 00:00:00,EU-Mauritius fishing agreement,CNS *,Leg,Legislative resolution for a Council regulation,Legislative resolution for a Council regulation,s,Morillon Philippe,http://www.europarl.europa.eu/sides/getDoc.do?...,...,1,0,,+,415,56.0,67.0,0,0,0
2,3,2004-09-15 00:00:00,EU-Madagascar Fishing agreement,CNS *,Leg,Amended proposal for a Council regulation,Amended proposal for a Council regulation,s,Morillon Philippe,http://www.europarl.europa.eu/sides/getDoc.do?...,...,0,0,,+,504,79.0,84.0,0,0,0
3,4,2004-09-15 00:00:00,EU-Madagascar Fishing agreement,CNS *,Leg,Legislative resolution for a Council regulation,Legislative resolution for a Council regulation,s,Morillon Philippe,http://www.europarl.europa.eu/sides/getDoc.do?...,...,0,0,,+,507,68.0,86.0,0,0,0
4,5,2004-09-15 00:00:00,EU-Cape Verde Fishing agreement,CNS *,Leg,Amended proposal for a Council regulation,Amended proposal for a Council regulation,s,Morillon Philippe,http://www.europarl.europa.eu/sides/getDoc.do?...,...,0,0,,+,495,74.0,104.0,0,0,0


Dropping irrelevant columns (also to be discussed)

In [14]:
EP_6_votations_columnsdropped = EP_6_votations.drop(columns=['type_of_vote_en', 'did_not_vote','absent','absent_motivated'])

Manually fixing some dates, in order for pd.to_datetime to work

In [15]:
replacements = {
    " 00:00:00": "",
    " ian ":"/01/",
    "03/11/2008":"11/03/2008",
    "03/12/2008":"12/03/2008",
    "13/12/2009":"13/12/2007",
    "13/12/2008":"13/12/2007",
    "3/13/2008":"13/3/2008",
    '2009-12-13':'2007-12-13',
}
EP_6_votations_copy = EP_6_votations_columnsdropped.copy()

EP_6_votations_copy['date'] = EP_6_votations_copy['date'].astype(str).replace(replacements, regex=True)

In [16]:
EP_6_votations_copy.date

0       2004-09-15
1       2004-09-15
2       2004-09-15
3       2004-09-15
4       2004-09-15
           ...    
6194     7.05.2009
6195     7.05.2009
6196     7.05.2009
6197     7.05.2009
6198     7.05.2009
Name: date, Length: 6199, dtype: object

In [17]:
EP_6_votations_date_fixed = EP_6_votations_copy.copy()

In [18]:
EP_6_votations_date_fixed['date'] = pd.to_datetime(EP_6_votations_copy['date'],format="mixed",dayfirst=True,yearfirst=True) 

In [19]:
EP_6_votations_date_fixed.date.max()

Timestamp('2009-05-07 00:00:00')

Recoding columns in line with convention for the app


In [20]:
original_columns = EP_6_votations_date_fixed.columns.tolist()

new_columns = [
    "VoteId",
    "Date",
    "Title",
    "Procedure",
    "Leg/Non-Leg/Bud",
    "TypeOfVote",
    "VotingRule",
    "Rapporteur",
    "Link",
    "CommitteeResponsabile",
    "PolicyArea",
    "Subject",
    "FinalVote",
    "Author",
    "AmNo",
    "Vote",
    "Yes",
    "No",
    "Abs"
]

# Create a dictionary mapping original column names to new names
rename_dict = dict(zip(original_columns, new_columns))

# Assuming 'df' is your DataFrame
EP_6_votations_column_names_good = EP_6_votations_date_fixed.rename(columns=rename_dict)

Recoding missing data

In [21]:
def recode_votations(df):
    df['TypeOfVote'] = df['TypeOfVote'].replace(0,np.NAN)
    df['VotingRule'] = df['VotingRule'].replace(0,np.NAN)
    df['Rapporteur'] = df['Rapporteur'].replace(0,np.NAN)
    df['CommitteeResponsabile'] = df['CommitteeResponsabile'].replace(0,np.NAN)
    df['Author'] = df['Author'].replace(0,np.NAN)
    df['Subject'] = df['Subject'].replace(0,np.NAN)
    df['AmNo'] = df['AmNo'].replace(0,np.NAN)
    df['Vote'] = df['Vote'].replace({'+': 1, '-': 0})
    df['Vote'] = df['Vote'].replace({"-*":0," +":1, "?  replaced by 1043":0})



In [22]:
recode_votations(EP_6_votations_column_names_good)

  df['Vote'] = df['Vote'].replace({"-*":0," +":1, "?  replaced by 1043":0})


Checking and fixing the datatypes

In [23]:
EP_6_votations_column_names_good.dtypes

VoteId                            int64
Date                     datetime64[ns]
Title                            object
Procedure                        object
Leg/Non-Leg/Bud                  object
TypeOfVote                       object
VotingRule                       object
Rapporteur                       object
Link                             object
CommitteeResponsabile            object
PolicyArea                       object
Subject                          object
FinalVote                         int64
Author                           object
AmNo                             object
Vote                            float64
Yes                               int64
No                              float64
Abs                             float64
dtype: object

In [24]:
EP_6_votations_column_names_good.No = EP_6_votations_column_names_good['No'].astype('Int64')
EP_6_votations_column_names_good.Abs = EP_6_votations_column_names_good['Abs'].astype('Int64')
EP_6_votations_column_names_good.Vote = EP_6_votations_column_names_good['Vote'].astype('Int64')
EP_6_votations_column_names_good.Yes = EP_6_votations_column_names_good['Yes'].astype('Int64')

Renaming EPG groups to fit the API convention

In [25]:

EPG_rename_dict = {
    "Group of the European People's Party (Christian Democrats) and European Democrats":"EPP-ED",
    "Socialist Group in the European Parliament":"PES",
    "Group of the Alliance of Liberals and Democrats for Europe":"ALDE",
    "Union for Europe of the Nations Group":"UEN",
    "Confederal Group of the European United Left - Nordic Green Left":"GUE–NGL",
    "Group of the Greens/European Free Alliance":"Greens–EFA",
    "Non-attached Members":"NI",  
    "Independence/Democracy Group":"IND/DEM"
}

In [26]:
EP_6_votes_fixed_names.EPG = EP_6_votes_fixed_names.EPG.replace(EPG_rename_dict, regex=False)


In [27]:
EP_6_votes_fixed_names.EPG.value_counts()
{}

EPG
EPP-ED        341
PES           263
ALDE          126
UEN            51
GUE–NGL        48
Greens–EFA     44
NI             40
IND/DEM        27
Name: count, dtype: int64

Creating a list to filter the non,observation columns from Votes


In [28]:
filtered_list = ['MepId',
 'Fname',
 'Lname',
 'FullName',
 'Activ',
 'Country',
 'Party',
 'EPG',
 'Start',
 'End']

In [30]:
EP_6_votes_fixed_names[~filtered_list]

TypeError: bad operand type for unary ~: 'list'

Melting votes into long format

In [31]:

mep_info = EP_6_votes_fixed_names[filtered_list]
vote_columns = [col for col in EP_6_votes_fixed_names.columns if col not in filtered_list]
melted_votes = pd.melt(EP_6_votes_fixed_names, id_vars='MepId', value_vars=vote_columns, var_name='VoteId', value_name='Vote')


In [35]:
mep_info_for_wnominate = mep_info[['MepId','EPG']]

In [44]:
wnominate_ep6_votes = EP_6_votes_fixed_names[vote_columns]

In [45]:
def recode(x):
    if x in [0, 1, 2]:
        return x
    else:
        return 3

In [47]:
wnominate_ep6_votes = wnominate_ep6_votes.map(recode)

In [28]:
epg_df = hf.get_epgs()
party_df = hf.get_parties()
ep_df = hf.generate_ep_df(6)
org_df = pd.concat([epg_df, party_df, ep_df], ignore_index=True)

In [29]:
org_df = pd.concat([epg_df, party_df, ep_df], ignore_index=True)

In [53]:
mep_info_for_wnominate[mep_info_for_wnominate['EPG']=="EPP-ED"]

Unnamed: 0,MepId,EPG
3,28367,EPP-ED
9,28276,EPP-ED
11,28422,EPP-ED
14,28132,EPP-ED
16,4537,EPP-ED
...,...,...
932,95014,EPP-ED
933,72754,EPP-ED
937,95704,EPP-ED
938,96101,EPP-ED


In [31]:
memberships_df = hf.get_memberships_df(mep_info,org_df)

In [32]:
memberships_df

Unnamed: 0,org_id,membershipClassification,memberDuring.startDate,memberDuring.endDate,citizenship,bday,hasGender,identifier,org_label
0,org/ep-6,,2004-07-20,2009-07-13,http://publications.europa.eu/resource/authori...,1932-10-30,http://publications.europa.eu/resource/authori...,28620,EP6
1,org/2615,def/ep-entities/NATIONAL_CHAMBER,2008-11-04,2009-07-13,http://publications.europa.eu/resource/authori...,1932-10-30,http://publications.europa.eu/resource/authori...,28620,-
2,org/1550,def/ep-entities/EU_POLITICAL_GROUP,2004-07-20,2009-07-13,http://publications.europa.eu/resource/authori...,1932-10-30,http://publications.europa.eu/resource/authori...,28620,ALDE
3,org/1547,def/ep-entities/NATIONAL_CHAMBER,2004-07-20,2008-11-03,http://publications.europa.eu/resource/authori...,1932-10-30,http://publications.europa.eu/resource/authori...,28620,LC
4,org/ep-6,,2004-07-20,2009-07-13,http://publications.europa.eu/resource/authori...,1955-11-16,http://publications.europa.eu/resource/authori...,28213,EP6
...,...,...,...,...,...,...,...,...,...
8406,org/1533,def/ep-entities/EU_POLITICAL_GROUP,2009-03-19,2009-07-13,http://publications.europa.eu/resource/authori...,1957-07-26,http://publications.europa.eu/resource/authori...,23852,PPE-DE
8407,org/1157,def/ep-entities/NATIONAL_CHAMBER,2004-05-01,2004-07-19,http://publications.europa.eu/resource/authori...,1957-07-26,http://publications.europa.eu/resource/authori...,23852,JL
8408,org/994,def/ep-entities/EU_POLITICAL_GROUP,2004-05-01,2004-07-19,http://publications.europa.eu/resource/authori...,1957-07-26,http://publications.europa.eu/resource/authori...,23852,PPE-DE
8409,org/994,def/ep-entities/EU_POLITICAL_GROUP,2003-04-22,2004-04-30,http://publications.europa.eu/resource/authori...,1957-07-26,http://publications.europa.eu/resource/authori...,23852,PPE-DE


AttributeError: module 'helperfunctions' has no attribute 'reload'

In [57]:
mep_info_db = hf.get_mep_database(mep_info,memberships_df)

In [33]:
votings_filtered_list = [
    "VoteId" 
    ,"Date" 
    ,"Title"
    ,"TypeOfVote"
    ,"Rapporteur"
    ,"Link"
    ,"CommitteeResponsabile"
    ,'Subject'
    ,'FinalVote' 
    ,'AmNo'
    ,'Author'
    ,'Vote'   
    ,'Yes'
    ,'No' 
    ,'Abs']

In [34]:
votings = EP_6_votations_column_names_good[votings_filtered_list]

In [36]:
votings.dtypes

VoteId                            int64
Date                     datetime64[ns]
Title                            object
TypeOfVote                       object
Rapporteur                       object
Link                             object
CommitteeResponsabile            object
Subject                          object
FinalVote                         int64
AmNo                             object
Author                           object
Vote                              Int64
Yes                               Int64
No                                Int64
Abs                               Int64
dtype: object

In [29]:
melted_votes.VoteId = melted_votes.VoteId.astype('Int64')

Saving files to .csv

In [37]:
base_directory = os.path.join("Cleaned_data","EP6_clean_data")
os.makedirs(base_directory, exist_ok=True)



In [48]:
wnominate_ep6_votes.to_csv(os.path.join(base_directory,"wnominate_ep6_votes.csv"),index=False)
mep_info_for_wnominate.to_csv(os.path.join(base_directory,"mep_info_for_wnominate.csv"),index=False)

In [59]:
mep_info_db.to_csv(os.path.join(base_directory,"mep_info_EP_6.csv"),index=False)
melted_votes.to_csv(os.path.join(base_directory,"votes_EP_6.csv"),index=False)
votings.to_csv(os.path.join(base_directory,"votings_EP_6.csv"),index=False)
memberships_df.to_csv(os.path.join(base_directory,"memberships_EP_6.csv"),index=False)
