In [1]:
import numpy as np
import pandas as pd
import requests
import glob
import openpyxl
import os

Creating relative paths


In [2]:
votes = os.path.join("VoteWatch-EP-voting-data_2004-2022", "EP9_RCVs_2022_06_22.xlsx")
votations = os.path.join("VoteWatch-EP-voting-data_2004-2022","EP9_Voted docs.xlsx")

Importing data from EU API (for correct names)


In [3]:
url = 'https://data.europarl.europa.eu/api/v2/meps?parliamentary-term=9&format=application%2Fld%2Bjson&offset=0'
response = requests.get(url)
meps_data = response.json()

Creating DataFrames (runtime is quite long, .xlsx files)


In [4]:
meps_df_9 = pd.json_normalize(meps_data['data'])
EP_9_votes = pd.read_excel(votes)
EP_9_votations = pd.read_excel(votations)

  warn(msg)


Dropping irrelevant columns (to discuss)
Renaming variables for pd.merge and correcting datatype
Merging
Updating names of MPs


In [5]:
meps_df_9.drop(columns=['id','type','sortLabel','officialFamilyName','officialGivenName'],inplace=True)
meps_df_9.rename(columns={'identifier': 'MepId'},inplace=True)
EP_9_votes.rename(columns={'WebisteEpID': 'MepId'},inplace=True)
meps_df_9.MepId = meps_df_9.MepId.astype(int)
EP_9_votes_fixed_names = pd.merge(EP_9_votes,meps_df_9 , on='MepId', how='left')
EP_9_votes_fixed_names[['Fname','Lname','FullName']] = EP_9_votes_fixed_names[['givenName','familyName','label']]
EP_9_votes_fixed_names.drop(columns=['label','familyName','givenName'],inplace=True)


Encoding missing party names


In [6]:
filtered_df = EP_9_votes_fixed_names[EP_9_votes_fixed_names.Party.astype(str).apply(lambda x: len(x) < 3)]
filtered_df.Party.value_counts()

Party
-    1
Name: count, dtype: int64

In [7]:
EP_9_votes_fixed_names.Party = EP_9_votes_fixed_names.Party.replace("-",np.NAN)

In [8]:
EP_9_votes_fixed_names.columns

Index([   'MepId',    'Fname',    'Lname', 'FullName',    'Activ',  'Country',
          'Party',      'EPG',    'Start',      'End',
       ...
            13450,      13451,      13452,      13453,      13454,      13455,
            13456,      13457,      13458,      13459],
      dtype='object', length=13469)

Dropping irrelevant columns and renaming relevant (also to be discussed)


In [9]:
EP_9_votations.columns.tolist()


['Vote ID',
 'File',
 'Order of vote',
 'Date',
 'Title',
 'Procedure',
 'Leg/Non-Leg/Bud',
 'Type of Vote',
 'Voting Rule',
 'Rapporteur',
 'Code',
 'Interinstitutional file number',
 'Link',
 'Committee responsabile',
 'Policy area',
 'Subject',
 'Final vote?',
 'Am No.',
 'Author',
 'RCV',
 'Vote',
 'Yes',
 'No',
 'Abs']

In [10]:

column_mapping = {
    'Vote ID': 'VoteId',
    'Date': 'Date',
    'Title': 'Title',
    'Procedure': 'Procedure',
    'Leg/Non-Leg/Bud': 'Leg/Non-Leg/Bud',
    'Type of Vote': 'TypeOfVote',
    'Voting Rule': 'VotingRule',
    'Rapporteur': 'Rapporteur',
    'Link': 'Link',
    'Committee responsabile': 'CommitteeResponsabile',
    'Policy area': 'PolicyArea',
    'Subject': 'Subject',
    'Final vote?': 'FinalVote',
    'Am No.': 'AmNo',
    'Author': 'Author',
    'Vote': 'Vote',
    'Yes': 'Yes',
    'No': 'No',
    'Abs': 'Abs'
}

filtered_df = EP_9_votations[list(column_mapping.keys())]
final_df = filtered_df.rename(columns=column_mapping)

In [11]:
EP_9_votations = final_df

Fixing dtype


In [12]:
EP_9_votations['Date'] = pd.to_datetime(EP_9_votations['Date'], format='mixed') 


In [13]:
EP_9_votations.tail()


Unnamed: 0,VoteId,Date,Title,Procedure,Leg/Non-Leg/Bud,TypeOfVote,VotingRule,Rapporteur,Link,CommitteeResponsabile,PolicyArea,Subject,FinalVote,AmNo,Author,Vote,Yes,No,Abs
13454,13455,2022-09-06,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,Paragraph 6,0.0,36.0,The Left,-,170,358,32
13455,13456,2022-09-06,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,Paragraph 6,0.0,39.0,"EPP, S&D, Renew",+,391,110,56
13456,13457,2022-09-06,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,After recital D,0.0,8.0,The Left,-,69,444,43
13457,13458,2022-09-06,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,vote: resolution (as a whole),0.0,,0,+,355,154,48
13458,13459,2022-09-06,A new trade instrument to ban products made by...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Employment & Social Affairs,vote: resolution (as a whole),0.0,,0,+,503,6,4


Recoding missing data


In [14]:
def recode_votations(df):
    df['TypeOfVote'] = df['TypeOfVote'].replace(0,np.NAN)
    df['VotingRule'] = df['VotingRule'].replace(0,np.NAN)
    df['Rapporteur'] = df['Rapporteur'].replace(0,np.NAN)
    df['CommitteeResponsabile'] = df['CommitteeResponsabile'].replace(0,np.NAN)
    df['Author'] = df['Author'].replace(0,np.NAN)
    df['Subject'] = df['Subject'].replace(0,np.NAN)
    df['AmNo'] = df['AmNo'].replace(0,np.NAN)
    df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})

In [15]:
recode_votations(EP_9_votations)

  df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})


Checking and fixing the datatypes


In [16]:
EP_9_votations.dtypes

VoteId                            int64
Date                     datetime64[ns]
Title                            object
Procedure                        object
Leg/Non-Leg/Bud                  object
TypeOfVote                       object
VotingRule                       object
Rapporteur                       object
Link                             object
CommitteeResponsabile            object
PolicyArea                       object
Subject                          object
FinalVote                       float64
AmNo                             object
Author                           object
Vote                              int64
Yes                               int64
No                                int64
Abs                               int64
dtype: object

In [17]:
EP_9_votations.FinalVote = EP_9_votations['FinalVote'].astype('Int64')
EP_9_votations.Vote = EP_9_votations['Vote'].astype('Int64')


Renaming EPG groups to fit the API convention


In [18]:
EP_9_votes_fixed_names.EPG.value_counts()

EPG
EPP           183
S&D           165
REG           124
Greens/EFA     86
ECR            75
NI             69
IDG            65
The Left       40
Name: count, dtype: int64

In [19]:
EPG_rename_dict = {
    "REG":"RE",
    "The Left":"GUE–NGL",
}

In [20]:
EP_9_votes_fixed_names.EPG = EP_9_votes_fixed_names.EPG.replace(EPG_rename_dict,regex=False)

In [21]:
EP_9_votes_fixed_names.EPG.value_counts()

EPG
EPP           183
S&D           165
RE            124
Greens/EFA     86
ECR            75
NI             69
IDG            65
GUE–NGL        40
Name: count, dtype: int64

Creating a list to filter the non observation columns from Votes


In [22]:
filtered_list = ['MepId',
 'Fname',
 'Lname',
 'FullName',
 'Activ',
 'Country',
 'Party',
 'EPG',
 'Start',
 'End']

Melting votes into long format


In [23]:
mep_info = EP_9_votes_fixed_names[filtered_list]
vote_columns = [col for col in EP_9_votes_fixed_names.columns if col not in filtered_list]
melted_votes = pd.melt(EP_9_votes_fixed_names, id_vars='MepId', value_vars=vote_columns, var_name='VoteId', value_name='Vote')

Adding EP_ID


In [24]:
melted_votes['EP_ID'] = 9
EP_9_votations['EP_ID'] = 9
mep_info['EP_ID'] = 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mep_info['EP_ID'] = 9


Saving files to .csv


In [25]:
base_directory = os.path.join("Cleaned_data","EP9_clean_data")
os.makedirs(base_directory, exist_ok=True)

In [26]:
mep_info.to_csv(os.path.join(base_directory, "mep_info_EP_9.csv"), index=False)
melted_votes.to_csv(os.path.join(base_directory, "votes_EP_9.csv"), index=False)
EP_9_votations.to_csv(os.path.join(base_directory, "votations_EP_9.csv"), index=False)