In [6]:
'''
Author: Conny Zhou
Email: junyi.zhou@emory.edu
Last Updated: 03/14/2024
'''

'\nAuthor: Conny Zhou\nEmail: junyi.zhou@emory.edu\nLast Updated: 03/14/2024\n'

In [7]:
from curl_cffi import requests
from bs4 import BeautifulSoup
import pandas as pd
from botocore.exceptions import ClientError
import PyPDF2

In [8]:
import boto3
import logging
from botocore.exceptions import ClientError

def upload_pdf_html_to_s3(binary, bucket, object_name):
    """
    Upload a PDF to an S3 bucket.

    :param pdf_binary: Binary stream of the PDF file
    :param bucket: Bucket to upload to
    :param object_name: S3 object name
    :return: True if the PDF was uploaded, else False
    """
    # When setting up credentials locally, use the following code
    session = boto3.Session()
    s3_client = session.client('s3')
    # # When using IAM roles, boto3 retrieves credentials from the instance metadata
    # s3_client = boto3.client('s3')

    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=binary)
    except ClientError as e:
        logging.error(e)
        return False
    return True


In [9]:
def upload_df_to_s3(df, bucket, object_name):
    """
    Upload a DataFrame to an S3 bucket as CSV.

    :param df: DataFrame to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name
    :return: True if the DataFrame was uploaded, else False
    """
    # Create a buffer
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    # Move to the start of the buffer
    csv_buffer.seek(0)

    #When setting up credentials locally, use the following code
    session = boto3.Session()
    s3_client = session.client('s3')
    # # # When using IAM roles, boto3 retrieves credentials from the instance metadata
    # s3_client = boto3.client('s3')


    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_buffer.getvalue())
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [10]:
# Import Publication_modified.csv
import pandas as pd
df = pd.read_csv('Publication_modified.csv')
df['objectNames'] = df['billId'].astype(str) + '_' + df['publicationTypeId'].astype(str) + '_' + df['id'].astype(str) + '_' + df['fileType']
df.head()

Unnamed: 0,billId,house,id,title,displayDate,links,publicationTypeId,publicationTypeName,publicationTypeDescription,fileType,objectNames
0,170,Commons,99,Bill as introduced - html version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,html,170_5_99_html
1,170,Commons,100,Bill as introduced - pdf version,2007-10-18,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,pdf,170_5_100_pdf
2,104,Commons,1,Lords Amendments - html version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,html,104_5_1_html
3,104,Commons,2,Lords Amendments - pdf version,2007-10-10,https://www.publications.parliament.uk/pa/cm20...,5,Bill,Full text of the Bill as introduced and furthe...,pdf,104_5_2_pdf
4,62,Lords,4,Marshalled List of Amendments as at 29 October...,2007-10-30,https://publications.parliament.uk/pa/ld200607...,7,Amendment Paper,Full list of amendment papers relating to the ...,htm,62_7_4_htm


In [11]:
for i in range(2):
    print(df.loc[i, 'links'], df.loc[i, 'objectNames'])

https://www.publications.parliament.uk/pa/cm200607/cmbills/162/07162.i-i.html 170_5_99_html
https://www.publications.parliament.uk/pa/cm200607/cmbills/162/2007162.pdf 170_5_100_pdf


In [12]:
# s3://myukdata/Original/Publication/Publication_PDF/
bucket_name = 'myukdata'
folder_path_pdf = 'Original/Publication/Publication_PDF'
folder_path_html = 'Original/Publication/Publication_HTML'

failed_object = []
for i in range(len(df)):
    link = df.loc[i, 'links']
    objectName = df.loc[i, 'objectNames']
    print(link, objectName)

    # Check if the link exists
    if link:
        # Check if the link is a PDF or HTML, upload them into 2 separate folders
        if '.pdf' in link:
            response_pdf = requests.get(link, impersonate='chrome110')
            if response_pdf.status_code == 200:
                print(f"PDF downloaded for {objectName}")
                response_pdf = requests.get(link, impersonate='chrome110')
                # Extract binary content from response
                pdf_binary_content = response_pdf.content
                # Upload the DataFrame to S3
                upload_pdf_html_to_s3(pdf_binary_content, bucket_name, f"{folder_path_pdf}/{objectName}.pdf")
            else:
                print(f"PDF download failed for {objectName}")
                failed_object.append(objectName)
        elif '.html' in link:
            response_html = requests.get(link, impersonate='chrome110')
            if response_html.status_code == 200:
                print(f"HTML downloaded for {objectName}")
                response_html = requests.get(link, impersonate='chrome110')
                # Extract binary content from response
                html_binary_content = response_html.content
                # Upload the DataFrame to S3
                upload_pdf_html_to_s3(html_binary_content, bucket_name, f"{folder_path_html}/{objectName}.html")
            else:
                print(f"HTML download failed for {objectName}")
                failed_object.append(objectName)



# Save failed_object to S3
failed_object = pd.DataFrame(failed_object)
if failed_object.empty == False:
    failed_object.columns = ['objectName']
    upload_df_to_s3(failed_object, bucket_name, f"{folder_path}/failed_object.csv")









https://www.publications.parliament.uk/pa/cm200607/cmbills/162/07162.i-i.html 170_5_99_html
HTML downloaded for 170_5_99_html
https://www.publications.parliament.uk/pa/cm200607/cmbills/162/2007162.pdf 170_5_100_pdf
PDF downloaded for 170_5_100_pdf
https://www.publications.parliament.uk/pa/cm200607/cmbills/158/07158.1-4.html 104_5_1_html
HTML downloaded for 104_5_1_html
https://www.publications.parliament.uk/pa/cm200607/cmbills/158/2007158.pdf 104_5_2_pdf
PDF downloaded for 104_5_2_pdf
https://publications.parliament.uk/pa/ld200607/ldbills/109/amend/ml109-i.htm 62_7_4_htm
https://publications.parliament.uk/pa/ld200607/ldbills/109/ml109-i.htm 28_7_3_htm
https://www.publications.parliament.uk/pa/cm200708/cmbills/021/08021.i-i.html 232_5_1209_html
HTML downloaded for 232_5_1209_html
https://www.publications.parliament.uk/pa/cm200708/cmbills/021/2008021.pdf 232_5_1210_pdf
PDF downloaded for 232_5_1210_pdf
https://www.publications.parliament.uk/pa/cm200708/cmbills/016/08016.i-i.html 225_5_11

KeyboardInterrupt: 