In [1]:
import pandas as pd
import numpy as np
import requests
import glob
import openpyxl
import os
import helperfunctions as hf

Creating relative paths


In [2]:
votes = os.path.join("VoteWatch-EP-voting-data_2004-2022", "EP8_RCVs_2019_06_25.xlsx")
votations = os.path.join("VoteWatch-EP-voting-data_2004-2022","EP8_Voted docs.xlsx")

Importing data from EU API (for correct names)


In [3]:
url = 'https://data.europarl.europa.eu/api/v2/meps?parliamentary-term=8&format=application%2Fld%2Bjson&offset=0'
response = requests.get(url)
meps_data = response.json()

Creating DataFrames (runtime is quite long, .xlsx files)

In [4]:
meps_df_8 = pd.json_normalize(meps_data['data'])
EP_8_votes = pd.read_excel(votes)
EP_8_votations = pd.read_excel(votations)

Dropping irrelevant columns (to discuss)


In [5]:



meps_df_8.drop(columns=['id','type','sortLabel','officialFamilyName','officialGivenName'],inplace=True)

Renaming variables for pd.merge and correcting datatype


In [6]:
meps_df_8.rename(columns={'identifier': 'MepId'},inplace=True)
EP_8_votes.rename(columns={'WebisteEpID': 'MepId'},inplace=True)

In [7]:
meps_df_8.MepId = meps_df_8.MepId.astype(int)



Merging

In [8]:
EP_8_votes_fixed_names = pd.merge(EP_8_votes,meps_df_8 , on='MepId', how='left')


Updating names of MPs


In [9]:
EP_8_votes_fixed_names[['Fname','Lname','FullName']] = EP_8_votes_fixed_names[['givenName','familyName','label']]


In [10]:



EP_8_votes_fixed_names.drop(columns=['label','familyName','givenName'],inplace=True)


Encoding missing party names


In [11]:
filtered_df = EP_8_votes_fixed_names[EP_8_votes_fixed_names.Party.astype(str).apply(lambda x: len(x) < 3)]
filtered_df.Party.value_counts()    

Party
-    13
Name: count, dtype: int64

In [12]:

EP_8_votes_fixed_names.Party = EP_8_votes_fixed_names.Party.replace("-",np.nan)

Votations dataset inspection


In [13]:

EP_8_votes_fixed_names.columns

Index([   'MepId',    'Fname',    'Lname',    'Activ',  'Country',    'Party',
            'EPG',    'Start',      'End',          1,
       ...
            10268,      10269,      10270,      10271,      10272,      10273,
            10274,      10275,      10276, 'FullName'],
      dtype='object', length=10262)

Rearranging 

In [14]:
mepid_index = EP_8_votes_fixed_names.columns.get_loc('Lname')
Full_name_col = EP_8_votes_fixed_names.pop('FullName')
EP_8_votes_fixed_names.insert(loc=mepid_index + 1, column='FullName', value= Full_name_col)

In [15]:
EP_8_votes_fixed_names.columns

Index([   'MepId',    'Fname',    'Lname', 'FullName',    'Activ',  'Country',
          'Party',      'EPG',    'Start',      'End',
       ...
            10267,      10268,      10269,      10270,      10271,      10272,
            10273,      10274,      10275,      10276],
      dtype='object', length=10262)

Dropping irrelevant columns and renaming relevant (also to be discussed)


In [16]:


EP_8_votations.columns.tolist()

['Vote ID',
 'File',
 'Order of vote',
 'Date',
 "O'clock",
 'Title',
 'Title_Ro',
 'Title_Fr',
 'Title_German',
 'Title_Polski',
 'Procedure',
 'Leg/Non-Leg/Bud',
 'Type of Vote',
 'Voting Rule',
 'Rapporteur',
 'Code',
 'interinstitutional file number',
 'Link',
 'Committee responsabile',
 'De/Policy area',
 'Subject',
 'Subject_ro',
 'Subject_fr',
 'Subject_ger',
 'Subject_pl',
 'Final \nvote?',
 'Am No.',
 'Author',
 'RCV',
 'Vote',
 'Yeas',
 'No',
 'Abs']

In [17]:
renaming_dict = {
    'Vote ID': 'VoteId',
    'Date': 'Date',
    'Title': 'Title',
    'Procedure': 'Procedure',
    'Leg/Non-Leg/Bud': 'Leg/Non-Leg/Bud',
    'Type of Vote': 'TypeOfVote',
    'Voting Rule': 'VotingRule',
    'Rapporteur': 'Rapporteur',
    'Link': 'Link',
    'Committee responsabile': 'CommitteeResponsabile',
    'De/Policy area': 'PolicyArea',
    'Subject': 'Subject',
    'Final \nvote?': 'FinalVote',
    'Am No.': 'AmNo',
    'Author': 'Author',
    'Vote': 'Vote',
    'Yeas': 'Yes',
    'No': 'No',
    'Abs': 'Abs'
}

# Select and rename necessary columns
EP_8_votations = EP_8_votations[list(renaming_dict.keys())]  # Select only the columns we need
EP_8_votations.rename(columns=renaming_dict, inplace=True)  # Rename the columns

In [18]:
EP_8_votations.columns

Index(['VoteId', 'Date', 'Title', 'Procedure', 'Leg/Non-Leg/Bud', 'TypeOfVote',
       'VotingRule', 'Rapporteur', 'Link', 'CommitteeResponsabile',
       'PolicyArea', 'Subject', 'FinalVote', 'AmNo', 'Author', 'Vote', 'Yes',
       'No', 'Abs'],
      dtype='object')

Fixing dtype

In [19]:
EP_8_votations['Date'] = pd.to_datetime(EP_8_votations['Date'],format='mixed') 

In [20]:
EP_8_votations.Date.tail()

10247   2019-04-18
10248   2019-04-18
10249   2019-04-18
10250   2019-04-18
10251   2019-04-18
Name: Date, dtype: datetime64[ns]



Recoding missing data


In [21]:
def recode_votations(df):
    df['TypeOfVote'] = df['TypeOfVote'].replace(0,np.nan)
    df['VotingRule'] = df['VotingRule'].replace(0,np.nan)
    df['Rapporteur'] = df['Rapporteur'].replace(0,np.nan)
    df['CommitteeResponsabile'] = df['CommitteeResponsabile'].replace(0,np.nan)
    df['Author'] = df['Author'].replace(0,np.nan)
    df['Subject'] = df['Subject'].replace(0,np.nan)
    df['AmNo'] = df['AmNo'].replace(0,np.nan)
    df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})

In [22]:
recode_votations(EP_8_votations)

  df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})


In [23]:
EP_8_votations.head()


Unnamed: 0,VoteId,Date,Title,Procedure,Leg/Non-Leg/Bud,TypeOfVote,VotingRule,Rapporteur,Link,CommitteeResponsabile,PolicyArea,Subject,FinalVote,AmNo,Author,Vote,Yes,No,Abs
0,1,2014-07-16,Appointment of 4 members of the European Commi...,,Non,Appointment of commissioners,s,,http://www.europarl.europa.eu/sides/getDoc.do?...,,\n\nConstitutional and inter-institutional aff...,,1.0,,,1,421,170,32
1,2,2014-07-16,Adoption by Lithuania of the euro on 1 January...,*,Leg,Draft legislative resolution,s,Langen,http://www.europarl.europa.eu/sides/getDoc.do?...,Committee on Economic and Monetary Affairs,Economic & monetary affairs,Paragraph 2,0.0,4.0,GUE/NGL,0,148,514,10
2,3,2014-07-16,Adoption by Lithuania of the euro on 1 January...,*,Leg,Draft legislative resolution,s,Langen,http://www.europarl.europa.eu/sides/getDoc.do?...,Committee on Economic and Monetary Affairs,Economic & monetary affairs,After paragraph 7,0.0,1.0,Greens/ EFA,0,137,525,19
3,4,2014-07-16,Adoption by Lithuania of the euro on 1 January...,*,Leg,Draft legislative resolution,s,Langen,http://www.europarl.europa.eu/sides/getDoc.do?...,Committee on Economic and Monetary Affairs,Economic & monetary affairs,After paragraph 7,0.0,3.0,Greens/ EFA,0,93,550,40
4,5,2014-07-16,Adoption by Lithuania of the euro on 1 January...,*,Leg,Draft legislative resolution,s,Langen,http://www.europarl.europa.eu/sides/getDoc.do?...,Committee on Economic and Monetary Affairs,Economic & monetary affairs,vote: legislative resolution,1.0,,,1,545,116,34


Checking and fixing the datatypes


In [24]:
EP_8_votations.dtypes


VoteId                            int64
Date                     datetime64[ns]
Title                            object
Procedure                        object
Leg/Non-Leg/Bud                  object
TypeOfVote                       object
VotingRule                       object
Rapporteur                       object
Link                             object
CommitteeResponsabile            object
PolicyArea                       object
Subject                          object
FinalVote                       float64
AmNo                             object
Author                           object
Vote                              int64
Yes                               int64
No                                int64
Abs                               int64
dtype: object

In [25]:
EP_8_votations.FinalVote = EP_8_votations['FinalVote'].astype('Int64')

In [26]:
EP_8_votations.Vote = EP_8_votations['Vote'].astype('Int64')

Renaming EPG groups to fit the API convention


In [27]:
EP_8_votes_fixed_names.EPG.value_counts()

EPG
Group of the European People's Party (Christian Democrats)                                  219
Group of the Progressive Alliance of Socialists and Democrats in the European Parliament    188
European Conservatives and Reformists Group                                                  73
Group of the Alliance of Liberals and Democrats for Europe                                   68
Group of the Greens/European Free Alliance                                                   52
Confederal Group of the European United Left - Nordic Green Left                             51
Europe of Freedom and Direct Democracy Group                                                 42
Europe of Nations and Freedom Group                                                          35
Non-attached Members                                                                         23
Name: count, dtype: int64

In [28]:
EPG_rename_dict = {
    "Group of the European People's Party (Christian Democrats)":"EPP",
    "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament":"S&D",
    "Group of the Alliance of Liberals and Democrats for Europe":"ALDE",
    "European Conservatives and Reformists Group":"ECR",
    "Confederal Group of the European United Left - Nordic Green Left":"GUE–NGL",
    "Group of the Greens/European Free Alliance":"Greens–EFA",
    "Non-attached Members":"NI",  
    "Europe of Freedom and Direct Democracy Group":"EFDD",
    "Europe of Nations and Freedom Group":"ENFF"
}

In [29]:
EP_8_votes_fixed_names.EPG = EP_8_votes_fixed_names.EPG.replace(EPG_rename_dict,regex=False)

In [30]:
EP_8_votes_fixed_names.EPG.value_counts()

EPG
EPP           219
S&D           188
ECR            73
ALDE           68
Greens–EFA     52
GUE–NGL        51
EFDD           42
ENFF           35
NI             23
Name: count, dtype: int64

Creating a list to filter the non,observation columns from Votes


In [31]:
filtered_list = ['MepId',
 'Fname',
 'Lname',
 'FullName',
 'Activ',
 'Country',
 'Party',
 'EPG',
 'Start',
 'End']

Melting votes into long format


In [32]:
mep_info = EP_8_votes_fixed_names[filtered_list]
vote_columns = [col for col in EP_8_votes_fixed_names.columns if col not in filtered_list]
melted_votes = pd.melt(EP_8_votes_fixed_names, id_vars='MepId', value_vars=vote_columns, var_name='VoteId', value_name='Vote')


In [33]:
epg_df = hf.get_epgs()
party_df = hf.get_parties()
ep_df = hf.generate_ep_df(9)
org_df = pd.concat([epg_df, party_df, ep_df], ignore_index=True)

In [34]:
mep_info

Unnamed: 0,MepId,Fname,Lname,FullName,Activ,Country,Party,EPG,Start,End
0,124990,Lars,Adaktusson,Lars ADAKTUSSON,no,Sweden,Kristdemokraterna,,2014-07-01,2018-09-23
1,124831,Isabella,Adinolfi,Isabella ADINOLFI,yes,Italy,Movimento 5 Stelle,EFDD,2014-07-01,2019-12-31
2,124797,Marco,Affronte,Marco AFFRONTE,yes,Italy,Independent,Greens–EFA,2014-07-01,2019-12-31
3,124811,Laura,Agea,Laura AGEA,yes,Italy,Movimento 5 Stelle,EFDD,2014-07-01,2019-12-31
4,96897,John Stuart,Agnew,John Stuart AGNEW,yes,United Kingdom,United Kingdom Independence Party,EFDD,2014-07-01,2019-12-31
...,...,...,...,...,...,...,...,...,...,...
853,196042,Aleksejs,Loskutovs,Aleksejs LOSKUTOVS,yes,Latvia,,,2019-01-24,2019-12-31
854,195454,Ralph,Packet,Ralph PACKET,yes,Belgium,,,2018-11-22,2019-12-31
855,111033,Kārlis,Šadurskis,Kārlis ŠADURSKIS,yes,Latvia,,,2018-11-28,2019-12-31
856,28299,Bogusław,Sonik,Bogusław SONIK,yes,Poland,,,2018-11-20,2019-12-31


In [None]:
mep_info_for_wnominate = mep_info[['FullName','EPG']]
matrix_ep_votes = EP_8_votes_fixed_names[vote_columns]

In [None]:
memberships_df = hf.get_memberships_df(mep_info,org_df)


In [35]:
memebershipsEP8 = pd.read_csv(os.path.join('Cleaned_data','EP8_clean_data','memberships_EP_8.csv'))


In [61]:
memebershipsEP8

Unnamed: 0,org_id,membershipClassification,memberDuring.startDate,memberDuring.endDate,citizenship,bday,hasGender,identifier,org_label
0,org/ep-8,,2014-07-01,2019-07-01,http://publications.europa.eu/resource/authori...,1985-05-23,http://publications.europa.eu/resource/authori...,99650,EP8
1,org/5073,def/ep-entities/NATIONAL_CHAMBER,2019-02-12,2019-07-01,http://publications.europa.eu/resource/authori...,1985-05-23,http://publications.europa.eu/resource/authori...,99650,BREX
2,org/4919,def/ep-entities/NATIONAL_CHAMBER,2018-12-06,2019-02-11,http://publications.europa.eu/resource/authori...,1985-05-23,http://publications.europa.eu/resource/authori...,99650,TI
3,org/4280,def/ep-entities/EU_POLITICAL_GROUP,2014-07-01,2014-10-15,http://publications.europa.eu/resource/authori...,1985-05-23,http://publications.europa.eu/resource/authori...,99650,EFDD
4,org/4880,def/ep-entities/EU_POLITICAL_GROUP,2014-10-20,2019-07-01,http://publications.europa.eu/resource/authori...,1985-05-23,http://publications.europa.eu/resource/authori...,99650,EFDD
...,...,...,...,...,...,...,...,...,...
7373,org/1533,def/ep-entities/EU_POLITICAL_GROUP,2004-07-20,2009-07-13,http://publications.europa.eu/resource/authori...,1953-12-03,http://publications.europa.eu/resource/authori...,28299,PPE-DE
7374,org/4273,def/ep-entities/EU_POLITICAL_GROUP,2018-11-20,2019-07-01,http://publications.europa.eu/resource/authori...,1953-12-03,http://publications.europa.eu/resource/authori...,28299,PPE
7375,org/1418,def/ep-entities/NATIONAL_CHAMBER,2004-07-20,2009-07-13,http://publications.europa.eu/resource/authori...,1953-12-03,http://publications.europa.eu/resource/authori...,28299,PO
7376,org/2816,def/ep-entities/NATIONAL_CHAMBER,2009-07-14,2014-06-30,http://publications.europa.eu/resource/authori...,1953-12-03,http://publications.europa.eu/resource/authori...,28299,PO


In [43]:
memebershipsEP8['memberDuring.startDate'] = pd.to_datetime(memebershipsEP8['memberDuring.startDate'])
memebershipsEP8['memberDuring.endDate'] = pd.to_datetime(memebershipsEP8['memberDuring.endDate'])

In [44]:
import pycountry

In [74]:
def get_epg_rowwise(row, df):
    mep_id = row['MepId']
    start_date = pd.to_datetime(row['Start']) + pd.Timedelta(days=20)
    end_date = pd.to_datetime(row['End'])

    epg_info = df[
        (df['identifier'].astype(str) == str(mep_id)) &
        (df['membershipClassification'] == "def/ep-entities/EU_POLITICAL_GROUP") &
        (df['memberDuring.startDate'] <= start_date) &
        (df['memberDuring.endDate'] >= start_date) &
        ((df['memberDuring.endDate'].isna()) | (df['memberDuring.endDate'] <= end_date))
        ]
    return epg_info['org_label'].iloc[0] if not epg_info.empty else np.nan

def extract_birthday(identifier, df):
    df = df[df['identifier'].astype(str) == str(identifier)]
    bday = df.iloc[0]["bday"]
    return bday


def extract_gender(identifier, df):
    df = df[df['identifier'].astype(str) == str(identifier)]
    hasGender = df.iloc[0]['hasGender']
    parts = hasGender.rsplit('/', 1)
    return parts[-1] if len(parts) > 1 else hasGender

def get_country(mep_id, df):
    country_url = df.loc[df['identifier'].astype(str) == str(mep_id), 'citizenship'].iloc[0]
    country_code = country_url.split('/')[-1]
    return pycountry.countries.get(alpha_3=country_code).name if country_code else np.nan

def get_mep_database2(mep_df, memberships_df):
    temp_df = mep_df.copy()
    mep_df = mep_df.copy()
    mep_df['MepId'] = temp_df['MepId'].astype("Int64")
    mep_df['Fname'] = temp_df['Fname'].astype("str")
    mep_df['Lname'] = temp_df['Lname'].astype("str")
    mep_df['EPG2'] = temp_df.apply(get_epg_rowwise, df=memberships_df, axis=1)
    mep_df['FullName'] = temp_df['FullName'].astype("str")
    mep_df['Birthday'] = temp_df['MepId'].apply(extract_birthday, df=memberships_df).astype("datetime64[ns]")
    mep_df['Gender'] = temp_df['MepId'].apply(extract_gender, df=memberships_df).astype("str")
    mep_df['Country'] = temp_df['MepId'].apply(get_country, df=memberships_df).astype("str")
    return mep_df


In [75]:
mep_info_db = get_mep_database2(mep_info, memebershipsEP8)


In [77]:
mep_info_db['EPG2'].replace("PPE", "EPP", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mep_info_db['EPG2'].replace("PPE", "EPP", inplace=True)


In [83]:
mep_info_db['EPG2'].replace("GUE-NGL","GUE–NGL",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mep_info_db['EPG2'].replace("GUE-NGL","GUE–NGL",inplace=True)


In [84]:
mep_info_db[mep_info_db['EPG'] != mep_info_db['EPG2']]

Unnamed: 0,MepId,Fname,Lname,FullName,Activ,Country,Party,EPG,Start,End,EPG2,Birthday,Gender
0,124990,Lars,Adaktusson,Lars ADAKTUSSON,no,Sweden,Kristdemokraterna,,2014-07-01,2018-09-23,EPP,1955-08-06,MALE
2,124797,Marco,Affronte,Marco AFFRONTE,yes,Italy,Independent,Greens–EFA,2014-07-01,2019-12-31,EFDD,1965-05-06,MALE
9,96736,Jan Philipp,Albrecht,Jan Philipp ALBRECHT,no,Germany,Bündnis 90/Die Grünen,,2014-07-01,2018-07-02,Verts/ALE,1982-12-20,MALE
11,30190,Louis,Aliot,Louis ALIOT,no,France,Front national,,2014-07-01,2017-07-20,NI,1969-09-04,MALE
15,124994,Max,Andersson,Max ANDERSSON,yes,Sweden,Miljöpartiet de gröna,Greens–EFA,2014-07-01,2019-12-31,Verts/ALE,1973-10-26,MALE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,196042,Aleksejs,Loskutovs,Aleksejs LOSKUTOVS,yes,Latvia,,,2019-01-24,2019-12-31,EPP,1962-08-22,MALE
854,195454,Ralph,Packet,Ralph PACKET,yes,Belgium,,,2018-11-22,2019-12-31,ECR,1990-07-17,MALE
855,111033,Kārlis,Šadurskis,Kārlis ŠADURSKIS,yes,Latvia,,,2018-11-28,2019-12-31,EPP,1959-10-11,MALE
856,28299,Bogusław,Sonik,Bogusław SONIK,yes,Poland,,,2018-11-20,2019-12-31,EPP,1953-12-03,MALE


In [None]:
votings_filtered_list = [
    "VoteId" 
    ,"Date" 
    ,"Title"
    ,"TypeOfVote"
    ,"Rapporteur"
    ,"Link"
    ,"CommitteeResponsabile"
    ,'Subject'
    ,'FinalVote' 
    ,'AmNo'
    ,'Author'
    ,'Vote'   
    ,'Yes'
    ,'No' 
    ,'Abs']

In [None]:
votings = EP_8_votations[votings_filtered_list]


In [None]:
melted_votes.VoteId = melted_votes.VoteId.astype('Int64')


Saving files to .csv


In [None]:
base_directory = os.path.join("Cleaned_data","EP8_clean_data")
os.makedirs(base_directory, exist_ok=True)

In [None]:
matrix_ep_votes.to_csv(os.path.join(base_directory,"matrix_ep8_votes.csv"),index=False)
mep_info_for_wnominate.to_csv(os.path.join(base_directory,"mep_info_for_wnominate.csv"),index=False)

In [None]:
mep_info_db.to_csv(os.path.join(base_directory,"mep_info_EP_8.csv"),index=False)
melted_votes.to_csv(os.path.join(base_directory,"votes_EP_8.csv"),index=False)
votings.to_csv(os.path.join(base_directory,"votings_EP_8.csv"),index=False)
memberships_df.to_csv(os.path.join(base_directory,"memberships_EP_8.csv"),index=False)