In [27]:
'''
Author: Conny Zhou
Email: junyi.zhou@emory.edu
Last Updated: 03/14/2024
'''

'\nAuthor: Conny Zhou\nEmail: junyi.zhou@emory.edu\nLast Updated: 03/14/2024\n'

In [28]:
import boto3
import pandas as pd
import numpy as np
import re
import boto3
import logging
from botocore.exceptions import ClientError
import io


In [29]:
def download_file_from_s3(bucket, object_name, local_file_name):
    """
    Download a file from S3 to the local file system.

    :param bucket: Name of the S3 bucket
    :param object_name: S3 object name
    :param local_file_name: Local file name to save the downloaded file
    """
    # # When using IAM roles, boto3 retrieves credentials from the instance metadata
    # s3_client = boto3.client('s3')

    #When setting up credentials locally, use the following code
    session = boto3.Session()
    s3_client = session.client('s3')

    s3_client.download_file(bucket, object_name, local_file_name)

bucket_name = 'myukdata'
s3_file_name = 'Original/Publication/Publication.csv'
local_file = 'Publication.csv'

download_file_from_s3(bucket_name, s3_file_name, local_file)



In [30]:
df = pd.read_csv('Publication.csv')
df.info()
pd.set_option('display.max_rows', None)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56330 entries, 0 to 56329
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   billId                       56330 non-null  int64  
 1   house                        56330 non-null  object 
 2   id                           56330 non-null  float64
 3   title                        56330 non-null  object 
 4   displayDate                  56330 non-null  object 
 5   links                        56330 non-null  object 
 6   files                        56330 non-null  object 
 7   publicationType.id           56330 non-null  float64
 8   publicationType.name         56330 non-null  object 
 9   publicationType.description  56248 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 4.3+ MB


Unnamed: 0,billId,house,id,title,displayDate,links,files,publicationType.id,publicationType.name,publicationType.description
0,170,Commons,99.0,Bill as introduced - html version,2007-10-18T00:00:00,"[{'id': 103, 'title': 'Bill as introduced - ht...",[],5.0,Bill,Full text of the Bill as introduced and furthe...
1,170,Commons,100.0,Bill as introduced - pdf version,2007-10-18T00:00:00,"[{'id': 104, 'title': 'Bill as introduced - pd...",[],5.0,Bill,Full text of the Bill as introduced and furthe...
2,104,Commons,1.0,Lords Amendments - html version,2007-10-10T00:00:00,"[{'id': 1, 'title': 'Lords Amendments - html v...",[],5.0,Bill,Full text of the Bill as introduced and furthe...
3,104,Commons,2.0,Lords Amendments - pdf version,2007-10-10T00:00:00,"[{'id': 2, 'title': 'Lords Amendments - pdf ve...",[],5.0,Bill,Full text of the Bill as introduced and furthe...
4,104,Commons,1.0,Lords Amendments - html version,2007-10-10T00:00:00,"[{'id': 1, 'title': 'Lords Amendments - html v...",[],5.0,Bill,Full text of the Bill as introduced and furthe...


In [31]:
df['publicationType.name'].unique()

array(['Bill', 'Amendment Paper', 'Explanatory Notes', 'Briefing papers',
       'Bill proceedings: Commons', 'Membership', 'Act of Parliament',
       'Select Committee report', 'Written evidence',
       'Petition against the Bill', 'Selection of amendments: Commons',
       'Tracked changes versions of Bills: Commons', 'Impact Assessments',
       'Press notices', "Examiners' Certificate (regarding hybridity)",
       'Report of proceedings', 'Committee Debate', 'Relevant documents',
       'Standard Notes', 'Will write letters',
       'Legislative Consent Motions-devolved legislatures',
       'Library Notes', 'Keeling schedules',
       'Tracked changes versions of Bills: Lords',
       'English votes for English laws', 'Delegated Powers Memorandum',
       'Delegated Powers Note', 'Human rights memorandum',
       'Tracked changes version of EU Bill: Commons       ',
       'Additional Provision to the Bill',
       'Selection of amendments for Consideration', 'Other documents',

In [32]:
# Extracting the links from the 'links' column
df_modified = df.copy() # Create a copy of the dataframe
no_link_bill = []
file_type = []

# Regular expression to match URLs
url_pattern = r'https?://[^\s,}\']+'

for i in range(len(df_modified['links'])):
    link = re.findall(url_pattern, df_modified['links'][i])
    # be aware that this link is a list of strings
    if link:  # Check if link is not empty
        df_modified['links'][i] = link[0]
        # we only want the first url
        print(f"Link found for {df_modified['billId'][i]}: {link[0]}")
        if '.pdf' in link[0]:
            file_type.append('pdf')
        elif '.html' in link[0]:
            file_type.append('html')
        else:
            file_type.append('htm')
    else:
        df_modified['links'][i] = np.nan
        print(f"Link not found for {df_modified['billId'][i]}")
        no_link_bill.append(df_modified['billId'][i])
        file_type.append('no link')

# Add the new column to the dataframe
df_modified['file_type'] = file_type

Link found for 170: https://www.publications.parliament.uk/pa/cm200607/cmbills/162/07162.i-i.html
Link found for 170: https://www.publications.parliament.uk/pa/cm200607/cmbills/162/2007162.pdf
Link found for 104: https://www.publications.parliament.uk/pa/cm200607/cmbills/158/07158.1-4.html
Link found for 104: https://www.publications.parliament.uk/pa/cm200607/cmbills/158/2007158.pdf
Link found for 104: https://www.publications.parliament.uk/pa/cm200607/cmbills/158/07158.1-4.html
Link found for 104: https://www.publications.parliament.uk/pa/cm200607/cmbills/158/2007158.pdf
Link found for 62: https://publications.parliament.uk/pa/ld200607/ldbills/109/amend/ml109-i.htm
Link found for 28: https://publications.parliament.uk/pa/ld200607/ldbills/109/ml109-i.htm
Link found for 232: https://www.publications.parliament.uk/pa/cm200708/cmbills/021/08021.i-i.html
Link found for 232: https://www.publications.parliament.uk/pa/cm200708/cmbills/021/2008021.pdf
Link found for 225: https://www.publicatio

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_modified['links'][i] = link[0]


Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/007/amend/ml007-ir.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/007/amend/ml007-ii.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/007/amend/ml007-iii.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/007/amend/ml007-iv.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/007/amend/ml007-v.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/031/08031.i-iv.html
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/031/2008031.pdf
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/031/amend/ml031-i.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/031/amend/su031-ia.htm
Link found for 192: https://www.publications.parliament.uk/pa/ld200708/ldbills/031/amend/ml03

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_modified['links'][i] = np.nan


Link found for 3030: https://www.publications.parliament.uk/pa/bills/cbill/58-02/0147/210147.pdf
Link found for 3100: https://publications.parliament.uk/pa/bills/cbill/58-02/0233/210233.pdf
Link found for 3005: https://publications.parliament.uk/pa/bills/cbill/58-02/0123/210123.pdf
Link found for 3104: https://publications.parliament.uk/pa/bills/cbill/58-02/0243/210243v2.pdf
Link found for 3106: https://publications.parliament.uk/pa/bills/cbill/58-02/0245/210245.pdf
Link found for 3114: https://publications.parliament.uk/pa/bills/cbill/58-02/0254/210254.pdf
Link found for 2929: https://publications.parliament.uk/pa/bills/cbill/58-02/0047/21047.pdf
Link not found for 2876
Link found for 2876: https://lordslibrary.parliament.uk/status-of-workers-bill-hl/
Link found for 2876: https://publications.parliament.uk/pa/ld5802/ldselect/lddelreg/101/10105.htm
Link not found for 2876
Link found for 2876: https://publications.parliament.uk/pa/bills/cbill/58-02/0242/210242.pdf
Link not found for 287

In [33]:
df_modified.head() 

Unnamed: 0,billId,house,id,title,displayDate,links,files,publicationType.id,publicationType.name,publicationType.description,file_type
0,170,Commons,99.0,Bill as introduced - html version,2007-10-18T00:00:00,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html
1,170,Commons,100.0,Bill as introduced - pdf version,2007-10-18T00:00:00,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,pdf
2,104,Commons,1.0,Lords Amendments - html version,2007-10-10T00:00:00,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html
3,104,Commons,2.0,Lords Amendments - pdf version,2007-10-10T00:00:00,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,pdf
4,104,Commons,1.0,Lords Amendments - html version,2007-10-10T00:00:00,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html


In [34]:
# Convert the 'displayDate' column to datetime
df_modified['displayDate'] = pd.to_datetime(df['displayDate'])
df_modified.head() 

Unnamed: 0,billId,house,id,title,displayDate,links,files,publicationType.id,publicationType.name,publicationType.description,file_type
0,170,Commons,99.0,Bill as introduced - html version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html
1,170,Commons,100.0,Bill as introduced - pdf version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,pdf
2,104,Commons,1.0,Lords Amendments - html version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html
3,104,Commons,2.0,Lords Amendments - pdf version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,pdf
4,104,Commons,1.0,Lords Amendments - html version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,[],5.0,Bill,Full text of the Bill as introduced and furthe...,html


In [35]:
# Drop the 'links' column, as it contains a lot of missing values
# Drop the 'id' column as it is not useful and can be misleading compared with the 'billId' and 'publicationTypeId'column
df_modified['files']
df_modified.drop(columns=['files'], inplace=True)

In [36]:
#Change id and publicationType.id tp int64
df_modified['publicationType.id'] = df_modified['publicationType.id'].astype('int64')
df_modified['id'] = df_modified['id'].astype('int64')
#Rename some of the columns to make them more readable
df_modified.rename(columns={'publicationType.id': 'publicationTypeId',
                     'publicationType.name': 'publicationTypeName',
                     'publicationType.description':'publicationTypeDescription',
                     'file_type':'fileType'}, inplace=True)
df_modified.head()

Unnamed: 0,billId,house,id,title,displayDate,links,publicationTypeId,publicationTypeName,publicationTypeDescription,fileType
0,170,Commons,99,Bill as introduced - html version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,html
1,170,Commons,100,Bill as introduced - pdf version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,pdf
2,104,Commons,1,Lords Amendments - html version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,html
3,104,Commons,2,Lords Amendments - pdf version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,pdf
4,104,Commons,1,Lords Amendments - html version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,html


In [40]:
def upload_df_to_s3(df, bucket, object_name):
    """
    Upload a DataFrame to an S3 bucket as CSV.

    :param df: DataFrame to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name
    :return: True if the DataFrame was uploaded, else False
    """
    # Create a buffer
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    # Move to the start of the buffer
    csv_buffer.seek(0)

    #When setting up credentials locally, use the following code
    session = boto3.Session()
    s3_client = session.client('s3')
    # # # When using IAM roles, boto3 retrieves credentials from the instance metadata
    # s3_client = boto3.client('s3')


    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_buffer.getvalue())
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [44]:
print(df_modified['id'].nunique())
print(len(df_modified))

29824
56330


In [47]:
# I realized that there are some duplicate rows in the dataframe, so I will drop them
# They are named as 'perfect' but when uploaded, they will still get the name 'Publication_modified.csv'
df_perfect = df_modified.drop_duplicates()
print(df_perfect['id'].nunique())
print(len(df_perfect))

29824
29824


In [48]:
# Upload the modified dataframe to S3
bucket_name = 'myukdata'
object_name = 'Original/Publication/Publication_Modified.csv'
upload_df_to_s3(df_perfect, bucket_name, object_name)

True

In [50]:
# Save the modified dataframe to a new csv file
df_perfect.to_csv('Publication_modified.csv', index=False)