In [76]:
import boto3
import pandas as pd
import numpy as np


In [77]:
def download_file_from_s3(bucket, object_name, local_file_name):
    """
    Fetch a single object from an S3 bucket and save it on the local file system.

    :param bucket: Name of the S3 bucket that holds the object
    :param object_name: Name (key) of the object inside the bucket
    :param local_file_name: Local file name the downloaded object is written to
    """
    # The client is created from a default boto3 Session, which is the form used
    # when credentials are set up locally. When running under an IAM role,
    # boto3.client('s3') can be used directly instead.
    boto3.Session().client('s3').download_file(bucket, object_name, local_file_name)

# S3 bucket, object key and local file name for the all-stages bills file.
bucket_name = 'myukdata'
s3_file_name = 'Bills/BillAllStages/BillsAllStages.csv'
local_file = 'BillAllStages.csv'

# Download the all-stages file first, then the combined latest-stage file.
download_file_from_s3(
    bucket=bucket_name,
    object_name=s3_file_name,
    local_file_name=local_file,
)
download_file_from_s3(
    bucket=bucket_name,
    object_name='Bills/BillsLatestStage_Combined/BillsLatestStage_Combined.csv',
    local_file_name='BillsLatestStage_Combined.csv',
)


In [78]:
# Load the two downloaded CSV files into pandas DataFrames.
df_latest = pd.read_csv('BillsLatestStage_Combined.csv')
df_all = pd.read_csv('BillAllStages.csv')

# Report "<row count> and <column count>" for each frame, latest-stage frame first.
print("{} and {}".format(*df_latest.shape))
print("{} and {}".format(*df_all.shape))

4038 and 29
13604 and 9


In [79]:
# Left-join df_all onto df_latest on the bill identifier. The key is named
# 'billId' in df_latest and 'BillID' in df_all, so after the join the right-hand
# key column is redundant and is dropped to mimic a natural join.
df_merged = (
    df_latest
    .merge(df_all, left_on='billId', right_on='BillID', how='left')
    .drop(columns=['BillID'])
)
print(f"{len(df_merged)} and {len(df_merged.columns)}")
print(df_merged.columns)
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df_merged.info()
# Set the display options so that every column is shown when a frame is displayed.
pd.set_option('display.max_columns', None)
df_merged.head()

21375 and 37
Index(['billId', 'shortTitle', 'currentHouse', 'originatingHouse',
       'lastUpdate', 'billWithdrawn', 'isDefeated', 'billTypeId',
       'introducedSessionId', 'includedSessionIds', 'isAct', 'currentStage.id',
       'currentStage.stageId', 'currentStage.sessionId',
       'currentStage.description', 'currentStage.abbreviation',
       'currentStage.house', 'currentStage.stageSittings',
       'currentStage.sortOrder', 'member.memberId', 'longTitle', 'summary',
       'petitioningPeriod', 'petitionInformation', 'agent', 'member',
       'sortOrder_x', 'organisation.name', 'organisation.url', 'id', 'stageId',
       'sessionId', 'description', 'abbreviation', 'house', 'stageSittings',
       'sortOrder_y'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21375 entries, 0 to 21374
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   billId            

Unnamed: 0,billId,shortTitle,currentHouse,originatingHouse,lastUpdate,billWithdrawn,isDefeated,billTypeId,introducedSessionId,includedSessionIds,isAct,currentStage.id,currentStage.stageId,currentStage.sessionId,currentStage.description,currentStage.abbreviation,currentStage.house,currentStage.stageSittings,currentStage.sortOrder,member.memberId,longTitle,summary,petitioningPeriod,petitionInformation,agent,member,sortOrder_x,organisation.name,organisation.url,id,stageId,sessionId,description,abbreviation,house,stageSittings,sortOrder_y
0,29.0,Alcohol Labelling Bill [HL],Lords,Lords,2007-10-10T09:11:00,,False,2.0,20.0,[20],False,146.0,2.0,20.0,2nd reading,2R,Lords,"[{'id': 5, 'stageId': 2, 'billStageId': 146, '...",2.0,2570.0,To make provision for the labelling of alcohol...,,,,,,,,,145.0,1.0,20.0,1st reading,1R,Lords,"[{'id': 4, 'stageId': 1, 'billStageId': 145, '...",1.0
1,29.0,Alcohol Labelling Bill [HL],Lords,Lords,2007-10-10T09:11:00,,False,2.0,20.0,[20],False,146.0,2.0,20.0,2nd reading,2R,Lords,"[{'id': 5, 'stageId': 2, 'billStageId': 146, '...",2.0,2570.0,To make provision for the labelling of alcohol...,,,,,,,,,146.0,2.0,20.0,2nd reading,2R,Lords,"[{'id': 5, 'stageId': 2, 'billStageId': 146, '...",2.0
2,35.0,Development Orders (Microgeneration) (formerly...,Lords,Lords,2007-10-13T16:36:00,,False,2.0,20.0,[20],False,170.0,3.0,20.0,Committee stage,CS,Lords,"[{'id': 25, 'stageId': 3, 'billStageId': 170, ...",3.0,3271.0,Make provision for a review of permitted devel...,,,,,,,,,168.0,1.0,20.0,1st reading,1R,Lords,"[{'id': 23, 'stageId': 1, 'billStageId': 168, ...",1.0
3,35.0,Development Orders (Microgeneration) (formerly...,Lords,Lords,2007-10-13T16:36:00,,False,2.0,20.0,[20],False,170.0,3.0,20.0,Committee stage,CS,Lords,"[{'id': 25, 'stageId': 3, 'billStageId': 170, ...",3.0,3271.0,Make provision for a review of permitted devel...,,,,,,,,,169.0,2.0,20.0,2nd reading,2R,Lords,"[{'id': 24, 'stageId': 2, 'billStageId': 169, ...",2.0
4,35.0,Development Orders (Microgeneration) (formerly...,Lords,Lords,2007-10-13T16:36:00,,False,2.0,20.0,[20],False,170.0,3.0,20.0,Committee stage,CS,Lords,"[{'id': 25, 'stageId': 3, 'billStageId': 170, ...",3.0,3271.0,Make provision for a review of permitted devel...,,,,,,,,,170.0,3.0,20.0,Committee stage,CS,Lords,"[{'id': 25, 'stageId': 3, 'billStageId': 170, ...",3.0


In [80]:
# Select the columns that are needed. They are picked by name rather than by
# position so the selection cannot silently shift to other columns if the
# column order of the merged frame changes.
df_subset = df_merged[[
    'billId',
    'shortTitle',
    'originatingHouse',
    'isDefeated',
    'billTypeId',
    'introducedSessionId',
    'includedSessionIds',
    'isAct',
    'member.memberId',
    'longTitle',
]]
# Keep only the first occurrence of each 'billId'; the result is a new frame,
# df_subset itself is left unchanged.
df_unique = df_subset.drop_duplicates(subset='billId', keep='first')
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
df_unique.info()
df_unique.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3431 entries, 0 to 21373
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   billId               3431 non-null   float64
 1   shortTitle           3431 non-null   object 
 2   originatingHouse     3431 non-null   object 
 3   isDefeated           3431 non-null   bool   
 4   billTypeId           3431 non-null   float64
 5   introducedSessionId  3431 non-null   float64
 6   includedSessionIds   3431 non-null   object 
 7   isAct                3431 non-null   bool   
 8   member.memberId      3380 non-null   float64
 9   longTitle            3265 non-null   object 
dtypes: bool(2), float64(4), object(4)
memory usage: 247.9+ KB
None


Unnamed: 0,billId,shortTitle,originatingHouse,isDefeated,billTypeId,introducedSessionId,includedSessionIds,isAct,member.memberId,longTitle
0,29.0,Alcohol Labelling Bill [HL],Lords,False,2.0,20.0,[20],False,2570.0,To make provision for the labelling of alcohol...
2,35.0,Development Orders (Microgeneration) (formerly...,Lords,False,2.0,20.0,[20],False,3271.0,Make provision for a review of permitted devel...
5,63.0,Royal Commission (Slavery) Bill [HL],Lords,False,2.0,20.0,[20],False,3364.0,Make provision for the establishment of a Roya...
7,37.0,European Union (Implications of Withdrawal) Bi...,Lords,False,2.0,20.0,[20],False,3153.0,Establish a Committee of Inquiry into the impl...
9,54.0,Light Bulb (Regulation) Bill,Commons,False,8.0,20.0,[20],False,1412.0,


In [81]:
# Rename the dotted 'member.memberId' column to plain 'memberId'.
# Note carried over from the original cell: currentStageId is a unique
# identifier for the stage of the bill as organized on the website, while
# currentStageStageId is the stage of the bill as defined in the schema.
df_unique = df_unique.rename({'member.memberId': 'memberId'}, axis='columns')
df_unique.info()
# Check that each billId appears only once.
print(df_unique.loc[:, 'billId'].is_unique)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3431 entries, 0 to 21373
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   billId               3431 non-null   float64
 1   shortTitle           3431 non-null   object 
 2   originatingHouse     3431 non-null   object 
 3   isDefeated           3431 non-null   bool   
 4   billTypeId           3431 non-null   float64
 5   introducedSessionId  3431 non-null   float64
 6   includedSessionIds   3431 non-null   object 
 7   isAct                3431 non-null   bool   
 8   memberId             3380 non-null   float64
 9   longTitle            3265 non-null   object 
dtypes: bool(2), float64(4), object(4)
memory usage: 247.9+ KB
True


In [85]:
# Rebuild df_all from the raw all-stages CSV, using billId as the foreign key.
# The file is re-read here so this cell gives the same result however many times
# it is run. Only columns whose name actually changes are listed in the rename;
# stageId, sessionId, house and stageSittings keep their names.
df_all = (
    pd.read_csv('BillAllStages.csv')
    .drop(columns=['abbreviation', 'sortOrder'])
    .rename(columns={
        'BillID': 'billId',          # match the key name used in BillsLatestStage_Combined
        'id': 'urlStageId',
        'description': 'stageName',
    })
)
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
df_all.info()
df_all.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13604 entries, 0 to 13603
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   billId         13604 non-null  int64  
 1   urlStageId     13604 non-null  float64
 2   stageId        13604 non-null  float64
 3   sessionId      13604 non-null  float64
 4   stageName      13604 non-null  object 
 5   house          13604 non-null  object 
 6   stageSittings  13604 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 744.1+ KB
None


Unnamed: 0,billId,urlStageId,stageId,sessionId,stageName,house,stageSittings
0,26,137.0,6.0,20.0,1st reading,Commons,"[{'id': 1, 'stageId': 6, 'billStageId': 137, '..."
1,26,12944.0,7.0,20.0,2nd reading,Commons,[]
2,27,139.0,6.0,20.0,1st reading,Commons,"[{'id': 2, 'stageId': 6, 'billStageId': 139, '..."
3,27,12945.0,7.0,20.0,2nd reading,Commons,[]
4,28,141.0,6.0,20.0,1st reading,Commons,"[{'id': 3, 'stageId': 6, 'billStageId': 141, '..."


In [86]:
# Print the summary of df_all (index range, column names, non-null counts, dtypes).
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13604 entries, 0 to 13603
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   billId         13604 non-null  int64  
 1   urlStageId     13604 non-null  float64
 2   stageId        13604 non-null  float64
 3   sessionId      13604 non-null  float64
 4   stageName      13604 non-null  object 
 5   house          13604 non-null  object 
 6   stageSittings  13604 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 744.1+ KB


In [84]:
#Let us compare the stageID and other stuff with the reference table