In [1]:
import numpy as np
import pandas as pd
import requests
import glob
import openpyxl
import os
import helperfunctions as hf

Creating relative paths


In [2]:
votes = os.path.join("VoteWatch-EP-voting-data_2004-2022", "EP9_RCVs_2022_06_22.xlsx")
votations = os.path.join("VoteWatch-EP-voting-data_2004-2022","EP9_Voted docs.xlsx")

Importing data from EU API (for correct names)


In [3]:
url = 'https://data.europarl.europa.eu/api/v2/meps?parliamentary-term=9&format=application%2Fld%2Bjson&offset=0'
response = requests.get(url)
meps_data = response.json()

Creating DataFrames (runtime is quite long, .xlsx files)


In [4]:
meps_df_9 = pd.json_normalize(meps_data['data'])
EP_9_votes = pd.read_excel(votes)
EP_9_votations = pd.read_excel(votations)

  warn(msg)


Dropping irrelevant columns (to discuss)
Renaming variables for pd.merge and correcting datatype
Merging
Updating names of MPs


In [5]:
meps_df_9.drop(columns=['id','type','sortLabel','officialFamilyName','officialGivenName'],inplace=True)
meps_df_9.rename(columns={'identifier': 'MepId'},inplace=True)
EP_9_votes.rename(columns={'WebisteEpID': 'MepId'},inplace=True)
meps_df_9.MepId = meps_df_9.MepId.astype(int)
EP_9_votes_fixed_names = pd.merge(EP_9_votes,meps_df_9 , on='MepId', how='left')
EP_9_votes_fixed_names[['Fname','Lname','FullName']] = EP_9_votes_fixed_names[['givenName','familyName','label']]
EP_9_votes_fixed_names.drop(columns=['label','familyName','givenName'],inplace=True)


Encoding missing party names


In [6]:
filtered_df = EP_9_votes_fixed_names[EP_9_votes_fixed_names.Party.astype(str).apply(lambda x: len(x) < 3)]
filtered_df.Party.value_counts()

Party
-    1
Name: count, dtype: int64

In [7]:
EP_9_votes_fixed_names.Party = EP_9_votes_fixed_names.Party.replace("-",np.NAN)

In [8]:
EP_9_votes_fixed_names.columns

Index([   'MepId',    'Fname',    'Lname', 'FullName',    'Activ',  'Country',
          'Party',      'EPG',    'Start',      'End',
       ...
            13450,      13451,      13452,      13453,      13454,      13455,
            13456,      13457,      13458,      13459],
      dtype='object', length=13469)

Dropping irrelevant columns and renaming relevant (also to be discussed)


In [9]:
EP_9_votations.columns.tolist()


['Vote ID',
 'File',
 'Order of vote',
 'Date',
 'Title',
 'Procedure',
 'Leg/Non-Leg/Bud',
 'Type of Vote',
 'Voting Rule',
 'Rapporteur',
 'Code',
 'Interinstitutional file number',
 'Link',
 'Committee responsabile',
 'Policy area',
 'Subject',
 'Final vote?',
 'Am No.',
 'Author',
 'RCV',
 'Vote',
 'Yes',
 'No',
 'Abs']

In [10]:

column_mapping = {
    'Vote ID': 'VoteId',
    'Date': 'Date',
    'Title': 'Title',
    'Procedure': 'Procedure',
    'Leg/Non-Leg/Bud': 'Leg/Non-Leg/Bud',
    'Type of Vote': 'TypeOfVote',
    'Voting Rule': 'VotingRule',
    'Rapporteur': 'Rapporteur',
    'Link': 'Link',
    'Committee responsabile': 'CommitteeResponsabile',
    'Policy area': 'PolicyArea',
    'Subject': 'Subject',
    'Final vote?': 'FinalVote',
    'Am No.': 'AmNo',
    'Author': 'Author',
    'Vote': 'Vote',
    'Yes': 'Yes',
    'No': 'No',
    'Abs': 'Abs'
}

filtered_df = EP_9_votations[list(column_mapping.keys())]
final_df = filtered_df.rename(columns=column_mapping)

In [11]:
EP_9_votations = final_df

Fixing dtype


In [12]:
EP_9_votations['Date'] = pd.to_datetime(EP_9_votations['Date'],format='%d.%m.%Y') 


In [13]:
EP_9_votations.tail()


Unnamed: 0,VoteId,Date,Title,Procedure,Leg/Non-Leg/Bud,TypeOfVote,VotingRule,Rapporteur,Link,CommitteeResponsabile,PolicyArea,Subject,FinalVote,AmNo,Author,Vote,Yes,No,Abs
13454,13455,2022-06-09,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,Paragraph 6,0.0,36.0,The Left,-,170,358,32
13455,13456,2022-06-09,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,Paragraph 6,0.0,39.0,"EPP, S&D, Renew",+,391,110,56
13456,13457,2022-06-09,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,After recital D,0.0,8.0,The Left,-,69,444,43
13457,13458,2022-06-09,The call for a Convention for the revision of ...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Constitutional and inter-institutional affairs,vote: resolution (as a whole),0.0,,0,+,355,154,48
13458,13459,2022-06-09,A new trade instrument to ban products made by...,,Non,Motion for resolution,s,0,https://www.europarl.europa.eu/doceo/document/...,0,Employment & Social Affairs,vote: resolution (as a whole),0.0,,0,+,503,6,4


Recoding missing data


In [14]:
def recode_votations(df):
    df['TypeOfVote'] = df['TypeOfVote'].replace(0,np.NAN)
    df['VotingRule'] = df['VotingRule'].replace(0,np.NAN)
    df['Rapporteur'] = df['Rapporteur'].replace(0,np.NAN)
    df['CommitteeResponsabile'] = df['CommitteeResponsabile'].replace(0,np.NAN)
    df['Author'] = df['Author'].replace(0,np.NAN)
    df['Subject'] = df['Subject'].replace(0,np.NAN)
    df['AmNo'] = df['AmNo'].replace(0,np.NAN)
    df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})

In [15]:
recode_votations(EP_9_votations)

  df['Vote'] = df['Vote'].replace({'+': 1, '-': 0,"_":0})


Checking and fixing the datatypes


In [16]:
EP_9_votations.dtypes

VoteId                            int64
Date                     datetime64[ns]
Title                            object
Procedure                        object
Leg/Non-Leg/Bud                  object
TypeOfVote                       object
VotingRule                       object
Rapporteur                       object
Link                             object
CommitteeResponsabile            object
PolicyArea                       object
Subject                          object
FinalVote                       float64
AmNo                             object
Author                           object
Vote                              int64
Yes                               int64
No                                int64
Abs                               int64
dtype: object

In [17]:
EP_9_votations.FinalVote = EP_9_votations['FinalVote'].astype('Int64')
EP_9_votations.Vote = EP_9_votations['Vote'].astype('Int64')


Renaming EPG groups to fit the API convention


In [18]:
EP_9_votes_fixed_names.EPG.value_counts()

EPG
EPP           183
S&D           165
REG           124
Greens/EFA     86
ECR            75
NI             69
IDG            65
The Left       40
Name: count, dtype: int64

In [19]:
EPG_rename_dict = {
    "REG":"RE",
    "The Left":"GUE–NGL",
}

In [20]:
EP_9_votes_fixed_names.EPG = EP_9_votes_fixed_names.EPG.replace(EPG_rename_dict,regex=False)

In [21]:
EP_9_votes_fixed_names.EPG.value_counts()

EPG
EPP           183
S&D           165
RE            124
Greens/EFA     86
ECR            75
NI             69
IDG            65
GUE–NGL        40
Name: count, dtype: int64

Creating a list to filter the non observation columns from Votes


In [22]:
filtered_list = ['MepId',
 'Fname',
 'Lname',
 'FullName',
 'Activ',
 'Country',
 'Party',
 'EPG',
 'Start',
 'End']
filtered_list2 = [
 'Fname',
 'Lname',
 'FullName',
 'Activ',
 'Country',
 'Party',
 'EPG',
 'Start',
 'End']

Melting votes into long format


In [23]:
voted_docs_csv = pd.read_csv(os.path.join("VoteWatch-EP-voting-data_2004-2022","Voted_doc_15-03-2024.csv"),encoding='latin_1')
votes_b = pd.read_excel(os.path.join("VoteWatch-EP-voting-data_2004-2022","RCV9B_140324.xlsx"))



In [24]:
votings_csv = pd.DataFrame()

def add_final_vote_column(df):
    # Sort the dataframe by the file URL and `V_order0` to ensure the order is correct
    df = df.sort_values(by=['File', 'V_order0'])

    # Create a column to identify the last V_order0 within each file URL
    df['FinalVote'] = df.groupby('File')['V_order0'].transform(lambda x: x == x.max()).astype(int)
    df['Vote'] = (df['Yes'] > df['No']).astype(int)

    return df
voted_csv_transformed = add_final_vote_column(voted_docs_csv)

In [25]:
def extract_voteid(vote_committee):
    # Substrings to remove
    substring_to_remove = "V"
    # Safely handle None values
    safe_vote_committee = vote_committee or ""

    if substring_to_remove in safe_vote_committee:
        result_string = safe_vote_committee.replace(substring_to_remove, "")
        try:
            # Convert the result to an integer, add 3, and convert back to string
            result_number = int(result_string) + 13412
            return str(result_number)
        except ValueError:
            # If conversion fails, return the original safe_vote_committee
            return safe_vote_committee
    else:
        return safe_vote_committee  # Return the input or an empty string if None was input


def extract_leg_csv(row):
    substring = "budget"
    # Check if 'Procedure' is not None
    if row['Legislative'] is not None:
        return "Leg"

    # Safely check for 'substring' in 'vote_title' and 'vote_committee'
    title_contains_substring = substring in (row['Title'] or "").lower()
    committee_contains_substring = substring in (row['Committee'] or "").lower()

    # Determine the return value based on the conditions
    if title_contains_substring or committee_contains_substring:
        return "Bud"
    else:
        return "Non-Leg"


replacements = {
    "â€™": "'",
    "â": "'",
    "Ã¼": "ü",
    "Ã©": "é",
    "Ã¨": "è",
    "Ã¶": "ö",
    "Ã": "à",
    "Ð": "-",
    "Õ": "'",
    "â": "'",
    "": "é",
    "": "é",
    "": "-",
    "": "ß",
    "": "-"
}
mapping_dict2 = {
    "Environment, Public Health and Food Safety": "Environment & public health",
    "Constitutional Affairs": "Constitutional & Inter-Institutional Affairs",
    "Foreign Affairs": "Foreign & security policy",
    "Budgets": "Budget",
    "Employment and Social Affairs": "Employment & Social Affairs",
    "Industry, Research and Energy": "Industry, Research & Energy",
    "Budgetary Control": "Budgetary Control",
    "Transport and Tourism": "Transport and Tourism",
    "Civil Liberties, Justice and Home Affairs": "Civil liberties, justice & home affairs",
    "Internal Market and Consumer Protection": "Internal market & consumer protection",
    "Economic and Monetary Affairs": "Economic & Monetary Affairs",
    "Legal Affairs": "Legal affairs",
    "Special the COVID-19 pandemic: lessons learned and recommendations for the futureÊ": "Special the COVID-19 pandemic: lessons learned and recommendations for the future",
    "Culture and Education": "Culture & education",
    "Agriculture and Rural Development": "Agriculture",
    "Fisheries": "Fisheries",
    "Special foreign interference in all democratic processes in the European Union, including disinformation, and the strengthening of integrity, transparency and accountability in the European ParliamentÊ": "Special foreign interference in all democratic processes in the European Union, including disinformation, and the strengthening of integrity, transparency and accountability in the European Parliament",
    "International Trade": "International trade",
    "Women's Rights and Gender Equality": "Women’s Rights and Gender Equality",
    "Regional Development": "Regional Development",
    "Petitions": "Petitions",
    "Development": "Development"
}


def replace_bad_chars(text, replacements):
    for old_char, new_char in replacements.items():
        text = text.replace(old_char, new_char)
    return text

In [26]:
votings_csv['VoteId'] = voted_csv_transformed['Vote.ID'].apply(extract_voteid)
votings_csv['Date'] = pd.to_datetime(voted_csv_transformed['Date'], format='%d/%m/%Y')
votings_csv['Title'] = voted_csv_transformed['Title']
oj_mask = votings_csv['Title'] == 'OJ'
voted_csv_transformed['desc'] = voted_csv_transformed['desc'].apply(lambda x: replace_bad_chars(x, replacements))
votings_csv.loc[oj_mask, 'Title'] = voted_docs_csv.loc[voted_csv_transformed['Title'] == 'OJ', 'desc'].values
votings_csv['Procedure'] = voted_csv_transformed['Legislative']
voted_csv_transformed["Leg/Non-Leg/Bud"] = voted_csv_transformed.apply(extract_leg_csv, axis=1)
votings_csv["Leg/Non-Leg/Bud"] = voted_csv_transformed["Leg/Non-Leg/Bud"]
votings_csv["TypeOfVote"] = voted_csv_transformed['Reolutions']
votings_csv["VotingRule"] = "s"
votings_csv["Rapporteur"] = voted_csv_transformed["Rapporteur"].replace("No Rapporteur", "")
votings_csv["Link"] = voted_csv_transformed["urls"]
votings_csv["CommitteeResponsabile"] = voted_csv_transformed["Committee"].replace("0", "")
votings_csv["PolicyArea"] = voted_csv_transformed["Committee"].replace("0", "")
votings_csv["Subject"] = voted_csv_transformed["Reolutions"]
votings_csv["FinalVote"] = voted_csv_transformed["FinalVote"]
votings_csv["AmNo"] = voted_csv_transformed["Amendment"].replace("-", "")
votings_csv['Author'] = ""
votings_csv["Vote"] = voted_csv_transformed["Vote"]
votings_csv["Yes"] = voted_csv_transformed["Yes"]
votings_csv["No"] = voted_csv_transformed["No"]
votings_csv["Abs"] = voted_csv_transformed["Abstain"]

votings_csv['PolicyArea'].replace(mapping_dict2, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  votings_csv['PolicyArea'].replace(mapping_dict2, inplace=True)


In [27]:
EP_9_votations = EP_9_votations[:13412]

In [28]:
EP_9_votations

Unnamed: 0,VoteId,Date,Title,Procedure,Leg/Non-Leg/Bud,TypeOfVote,VotingRule,Rapporteur,Link,CommitteeResponsabile,PolicyArea,Subject,FinalVote,AmNo,Author,Vote,Yes,No,Abs
0,1,2019-07-15,Tuesday - request by the GUE/NGL group,,Non,Proposal for a decision,s,,http://www.europarl.europa.eu/doceo/document/P...,,Foreign & security policy,Procedural vote,1,,,0,83,142,72
1,2,2019-07-18,Situation at the USA-Mexican border,,Non,Joint motion for a resolution,s,,http://www.europarl.europa.eu/doceo/document/R...,,Foreign & security policy,Paragraph 13,0,,original text,1,311,269,33
2,3,2019-07-18,Situation at the USA-Mexican border,,Non,Joint motion for a resolution,s,,http://www.europarl.europa.eu/doceo/document/R...,,Foreign & security policy,Paragraph 14,0,,original text,1,437,155,33
3,4,2019-07-18,Situation at the USA-Mexican border,,Non,Joint motion for a resolution,s,,http://www.europarl.europa.eu/doceo/document/R...,,Foreign & security policy,Paragraph 17,0,,original text,1,441,158,27
4,5,2019-07-18,Situation at the USA-Mexican border,,Non,Joint motion for a resolution,s,,http://www.europarl.europa.eu/doceo/document/R...,,Foreign & security policy,Paragraph 19,0,,original text,1,466,146,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13407,13408,2022-06-08,"The EU’s Foreign, Security and Defence Policy ...",,Non,Draft recommendation,s,McAllister/Loiseau,https://www.europarl.europa.eu/doceo/document/...,Committee on Foreign Affairs,Foreign & security policy,"Paragraph 1, point o",0,,original text,1,511,64,15
13408,13409,2022-06-08,"The EU’s Foreign, Security and Defence Policy ...",,Non,Draft recommendation,s,McAllister/Loiseau,https://www.europarl.europa.eu/doceo/document/...,Committee on Foreign Affairs,Foreign & security policy,"Paragraph 1, point av",0,39,"EPP, Renew",1,452,121,22
13409,13410,2022-06-08,"The EU’s Foreign, Security and Defence Policy ...",,Non,Draft recommendation,s,McAllister/Loiseau,https://www.europarl.europa.eu/doceo/document/...,Committee on Foreign Affairs,Foreign & security policy,"Paragraph 1, point ay",0,,original text,1,508,56,19
13410,13411,2022-06-08,"The EU’s Foreign, Security and Defence Policy ...",,Non,Draft recommendation,s,McAllister/Loiseau,https://www.europarl.europa.eu/doceo/document/...,Committee on Foreign Affairs,Foreign & security policy,vote: resolution (as a whole),1,,,1,438,65,94


In [29]:
votings_csv.tail(5)

Unnamed: 0,VoteId,Date,Title,Procedure,Leg/Non-Leg/Bud,TypeOfVote,VotingRule,Rapporteur,Link,CommitteeResponsabile,PolicyArea,Subject,FinalVote,AmNo,Author,Vote,Yes,No,Abs
5257,18670,2024-03-14,Creation of a European initiative for an annua...,166704,Leg,,s,,https://www.europarl.europa.eu/doceo/document/...,,,,0,B9-0174/2024 - Â§ 7/2,,1,309,191,24
5258,18671,2024-03-14,Creation of a European initiative for an annua...,166705,Leg,,s,,https://www.europarl.europa.eu/doceo/document/...,,,,0,B9-0174/2024 - Â§ 11,,1,393,117,13
5259,18672,2024-03-14,Creation of a European initiative for an annua...,166087,Leg,mise en Åuvre et rÃ©sultats dans les Ãtats m...,s,Andrey Novakov,https://www.europarl.europa.eu/doceo/document/...,Regional Development,Regional Development,mise en Åuvre et rÃ©sultats dans les Ãtats m...,0,Politique de cohÃ©sion 2014-2020 â mise en Å...,,1,433,36,51
5260,18673,2024-03-14,Creation of a European initiative for an annua...,166692,Leg,,s,,https://www.europarl.europa.eu/doceo/document/...,,,,0,Adoption de la mesure spÃ©ciale en faveur de l...,,0,237,259,13
5261,18674,2024-03-14,Creation of a European initiative for an annua...,166693,Leg,,s,,https://www.europarl.europa.eu/doceo/document/...,,,,1,B9-0173/2024 - AprÃ¨s le considÃ©rant N - Am 1,,0,230,249,13


In [30]:
EP_9_votes_fixed_names.shape

(811, 13469)

In [31]:
EP_9_votes_fixed_names.iloc[:, :-47].shape

(811, 13422)

In [32]:
votes_b.shape

(866, 5267)

In [33]:
vote_columns = [col for col in votes_b.columns if 'Vote_' in col]

# Create a dictionary to map old column names to new column names (add 3 to the number part)
start_number = 13413
rename_dict = {col: str(start_number + i) for i, col in enumerate(vote_columns)}

# Filter the DataFrame to include only the 'Vote_' columns and rename them

b = votes_b[vote_columns].rename(columns=rename_dict)

In [34]:
b=b.astype("Int64")

In [35]:
mep_info_wnominate = votes_b[['PersID','EPG','Country','F.Name','L.Name']]

In [36]:
mep_info_wnominate.shape

(866, 5)

In [37]:
b.insert(0, 'MepId',mep_info_wnominate.PersID)

In [38]:
b.columns

Index(['MepId', '13413', '13414', '13415', '13416', '13417', '13418', '13419',
       '13420', '13421',
       ...
       '18665', '18666', '18667', '18668', '18669', '18670', '18671', '18672',
       '18673', '18674'],
      dtype='object', length=5263)

In [39]:
votes_andID = EP_9_votes_fixed_names.drop(columns=[col for col in EP_9_votes_fixed_names.columns if col in filtered_list2])

In [40]:
votes_andID = votes_andID.iloc[:, :-47]

In [41]:
test = pd.merge(votes_andID, b, on='MepId', how= 'outer')

In [42]:
votes_for_wnominate = test.astype("Int64")

In [43]:
mep_info_wnominate.rename(columns= {"PersID":"MepId"},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mep_info_wnominate.rename(columns= {"PersID":"MepId"},inplace=True)


In [44]:
mep_info_wnominate["FullName"] = mep_info_wnominate["F.Name"] + " " + mep_info_wnominate["L.Name"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mep_info_wnominate["FullName"] = mep_info_wnominate["F.Name"] + " " + mep_info_wnominate["L.Name"]


In [45]:
mep_info_wnominate

Unnamed: 0,MepId,EPG,Country,F.Name,L.Name,FullName
0,840,RE,Luxembourg,Charles,GOERENS,Charles GOERENS
1,1394,RE,United Kingdom,Bill,NEWTON DUNN,Bill NEWTON DUNN
2,1854,S&D,Germany,Constanze,KREHL,Constanze KREHL
3,1892,EPP,Portugal,Coelho,COELHO,Coelho COELHO
4,1909,S&D,Germany,Bernd,LANGE,Bernd LANGE
...,...,...,...,...,...,...
861,251324,EPP,Poland,Krzysztof,Brejza,Krzysztof Brejza
862,251859,RE,France,Guy,Lavocat,Guy Lavocat
863,251874,EPP,Germany,Niels,Geuking,Niels Geuking
864,253008,S&D,Sweden,Linus,Glanzelius,Linus Glanzelius


In [46]:
mep_info = EP_9_votes_fixed_names[filtered_list]
vote_columns = [col for col in EP_9_votes_fixed_names.columns if col not in filtered_list]
melted_votes = pd.melt(EP_9_votes_fixed_names, id_vars='MepId', value_vars=vote_columns, var_name='VoteId', value_name='Vote')

Saving files to .csv


In [47]:
base_directory = os.path.join("Cleaned_data","EP9_clean_data")
os.makedirs(base_directory, exist_ok=True)

In [48]:
votes_for_wnominate.to_csv(os.path.join(base_directory,"matrix_ep9_votes.csv"),index=False)
mep_info_wnominate.to_csv(os.path.join(base_directory,"mep_info_for_wnominate.csv"),index=False)

In [49]:
# mep_info.to_csv(os.path.join(base_directory, "mep_info_EP_9.csv"), index=False)
# melted_votes.to_csv(os.path.join(base_directory, "votes_EP_9.csv"), index=False)
# EP_9_votations.to_csv(os.path.join(base_directory, "votations_EP_9.csv"), index=False)