In [4]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, BatchNormalization, GlobalAveragePooling1D
from sklearn.metrics import classification_report
from scipy.stats import skew, kurtosis
import requests

In [53]:
import pandas as pd
import requests

def extract_bill_pdf_url(bill_id):
    try:
        url = f"https://bills-api.parliament.uk/api/v1/Bills/{bill_id}/Publications"
        response = requests.get(url, headers={"accept": "application/json"})
        if response.status_code != 200:
            print(f" Failed for bill {bill_id} â€” status code: {response.status_code}")
            return None
        
        data = response.json()
        for pub in data.get("publications", []):
            # Must be publicationType id 5 ("Bill")
             if pub.get("publicationType", {}).get("id") == 5 or pub.get("publicationType", {}).get("id") == 19 or pub.get("publicationType", {}).get("id") == 6:
                for link in pub.get("links", []):
                    if link.get("contentType") == "application/pdf":
                        print(f"Found link for  {bill_id}")
                        return {
                            "id": bill_id,
                            "url": link.get("url")
                        }
                for file in pub.get("files", []):
                    print(f"Construct URl for  {bill_id}")
                    if file.get("contentType") == "application/pdf":
                        publication_id = pub["id"]
                        file_id = file["id"]
                        constructed_url = f"https://bills.parliament.uk/publications/{publication_id}/documents/{file_id}"
                        return {"id": bill_id, "url": constructed_url}

        print(f"No valid PDF found for bill {bill_id}")
        return None
    except Exception as e:
        print(f"Error for bill {bill_id}: {e}")
        return None

# === 1. Load CSV ===
csv_path = r"C:\Users\ander\Downloads\MLP\29_17 to 39.csv"
df = pd.read_csv(csv_path)
bill_ids = df.iloc[:, 0].dropna().astype(int).tolist()  # assumes bill ID is in first column

# === 2. Process all bill IDs ===
bill_data = []
for bill_id in bill_ids:
    result = extract_bill_pdf_url(bill_id)
    if result:
        bill_data.append(result)

# === 3. Output result ===
print(f"\n Found {len(bill_data)} downloadable PDFs out of {len(bill_ids)} bills.")
for bill in bill_data:
    print(bill)

    

Found link for  3733
Found link for  3735
Found link for  3744
Construct URl for  3751
Construct URl for  3506
Found link for  3380
Construct URl for  3508
Found link for  3507
Found link for  3509
Found link for  3172
Found link for  3380
Found link for  3366
Construct URl for  3344
Construct URl for  3311
Found link for  3168
Construct URl for  3159
Construct URl for  3310
Found link for  3177
Found link for  3157
Found link for  3158
Construct URl for  2867
Found link for  2877
Found link for  3045
Construct URl for  2866
Construct URl for  2518
Construct URl for  2864
Construct URl for  2865
Construct URl for  3032
Construct URl for  2868
Construct URl for  2533
Found link for  2523
Found link for  2524
Found link for  2525
Found link for  2594
Construct URl for  2518
Found link for  2526
Found link for  2707
Found link for  2717
Found link for  2572
Found link for  2468
Found link for  2002
Found link for  2396
Found link for  2020
Found link for  2231
Found link for  2252
Found l

In [54]:
output_file = "29_17 to 39 url dictionary.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for bill in bill_data:
        f.write(str(bill) + "\n")

In [55]:

import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options

def setup_chrome_driver(chromedriver_path, download_dir):
    chrome_options = Options()
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,
        "profile.default_content_setting_values.automatic_downloads": 1
    })
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")


    service = ChromeService(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=chrome_options)

def wait_for_new_pdf(download_dir, timeout=20, min_size_kb=30):

    start_time = time.time()
    seen = set(os.listdir(download_dir))

    while time.time() - start_time < timeout:
        current = set(os.listdir(download_dir))
        new_files = current - seen

        for file in new_files:
            if file.endswith(".pdf"):
                full_path = os.path.join(download_dir, file)
                if not os.path.exists(full_path + ".crdownload"):
                    size_kb = os.path.getsize(full_path) // 1024
                    if size_kb >= min_size_kb:
                        return full_path
        time.sleep(0.1)
    return None

def download_multiple_pdfs(driver, bill_data, download_dir):
    for bill in bill_data:
        bill_id = bill['id']
        url = bill['url']
        final_filename = f"bill_{bill_id}.pdf"
        final_path = os.path.join(download_dir, final_filename)

        if os.path.exists(final_path):
            print(f"[{bill_id}]  Already exists. Skipping.")
            continue

        print(f"[{bill_id}]  Downloading from: {url}")

        try:
       
            driver.get(url)

            downloaded_path = wait_for_new_pdf(download_dir)
            if downloaded_path:
                shutil.move(downloaded_path, final_path)
                print(f"[{bill_id}]  Downloaded to: {final_path}")
            else:
                print(f"[{bill_id}]  Timed out: {final_filename}")
        except Exception as e:
            print(f"[{bill_id}]  Error: {e}")


if __name__ == "__main__":
    bill_list = []

    with open("29_17 to 39 url dictionary.txt", "r") as f:
        for line in f:
            line = line.strip()
            if line:
                entry = eval(line)  # Safe here since it's trusted input
                bill_list.append(entry)


    bill_data_sample = bill_list[:10]

    print (bill_data_sample)
    
    CHROMEDRIVER_PATH = r"C:\Users\ander\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    DOWNLOAD_DIR = r"C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39"

  
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    driver = setup_chrome_driver(CHROMEDRIVER_PATH, DOWNLOAD_DIR)
    try:
        download_multiple_pdfs(driver, bill_list, DOWNLOAD_DIR)
    finally:
        driver.quit()
        print(" Chrome closed.")


[{'id': 3733, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0057/240057.pdf'}, {'id': 3735, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0130/240130.pdf'}, {'id': 3744, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0110/240110.pdf'}, {'id': 3751, 'url': 'https://bills.parliament.uk/publications/58650/documents/5923'}, {'id': 3506, 'url': 'https://bills.parliament.uk/publications/55379/documents/4804'}, {'id': 3380, 'url': 'https://www.legislation.gov.uk/ukla/2024/1/pdfs/ukla_20240001_en.pdf'}, {'id': 3508, 'url': 'https://bills.parliament.uk/publications/54911/documents/4630'}, {'id': 3507, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-04/0163/230163.pdf'}, {'id': 3509, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-04/0153/230153.pdf'}, {'id': 3172, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-03/0208/220208.pdf'}]
[3733]  Downloading from: https://publications.parliament.uk/pa/bills/cb

[2002]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2002.pdf
[2396]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0412/19412.pdf
[2396]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2396.pdf
[2020]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0294/18294.pdf
[2020]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2020.pdf
[2231]  Downloading from: https://publications.parliament.uk/pa/bills/lbill/2017-2019/0135/18135.pdf
[2231]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2231.pdf
[2252]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0301/18301.pdf
[2252]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2252.pdf
[2273]  Downloading from: https://publications.parliament.uk/pa/bills/lbill/2017-2019/0157/18157.pdf
[2273]  Downloaded t

[1165]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1165.pdf
[798]  Downloading from: https://www.publications.parliament.uk/pa/privbill/201314/cityoflondon/001.pdf
[798]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_798.pdf
[1325]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0181/181table.pdf
[1325]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1325.pdf
[1280]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0119/14119.pdf
[1280]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1280.pdf
[1111]  Downloading from: https://www.publications.parliament.uk/pa/privbill/201314/hertsfilm/029563/029563.pdf
[1111]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1111.pdf
[1287]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0172/141

[550]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_550.pdf
[547]  Downloading from: http://www.opsi.gov.uk/acts/acts2010/pdf/ukpga_20100016_en.pdf
[547]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_547.pdf
[560]  Downloading from: http://www.opsi.gov.uk/acts/acts2010/pdf/ukpga_20100010_en.pdf
[560]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_560.pdf
[356]  Already exists. Skipping.
[395]  Downloading from: http://www.opsi.gov.uk/acts/acts2009/pdf/ukpga_20090011_en.pdf
[395]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_395.pdf
[206]  Already exists. Skipping.
[383]  Downloading from: http://www.opsi.gov.uk/acts/acts2009/pdf/ukpga_20090006_en.pdf
[383]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_383.pdf
[398]  Downloading from: https://www.publications.parliament.uk/pa/cm200809/cmbills/162/2009162.pdf
[398]  Downloaded

In [56]:
import os
import re
import fitz  

def extract_clean_flat_text(pdf_path):
    doc = fitz.open(pdf_path)

    pages = [doc[i].get_text() for i in range(1, len(doc)-1)] if len(doc) > 2 else []
    doc.close()

    full_text = " ".join(pages)
    full_text = full_text.replace("\n", " ").strip()
    full_text = re.sub(r"\([^)]*\)", "", full_text)
    full_text = re.sub(r"\b\d+\b", "", full_text)
    full_text = re.sub(r"[^a-zA-Z.,; ]", " ", full_text)
    full_text = re.sub(r"\s+", " ", full_text)

    return full_text.lower().strip()



input_folder = r"C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39"
output_folder = r"C:\Users\ander\Downloads\MLP\cleanedTextFull"
output_file = os.path.join(output_folder, "29_17 to 39 fullText.txt")


os.makedirs(output_folder, exist_ok=True)


all_cleaned_texts = []
for filename in os.listdir(input_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(input_folder, filename)
        print(f" Processing {filename}...")
        try:
            cleaned = extract_clean_flat_text(pdf_path)
            if cleaned:
                all_cleaned_texts.append(cleaned)
        except Exception as e:
            print(f" Error reading {filename}: {e}")


final_text = "\n\n".join(all_cleaned_texts)


with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_text)

print(f"\n text saved to:\n{output_file}")


 Processing 18294.pdf...
 Error reading 18294.pdf: cannot open empty document
 Processing 200011.pdf...
 Error reading 200011.pdf: cannot open empty document
 Processing bill_1016.pdf...
 Processing bill_1086.pdf...
 Processing bill_1098.pdf...
 Processing bill_1107.pdf...
 Processing bill_1111.pdf...
 Processing bill_1165.pdf...
 Processing bill_1166.pdf...
 Processing bill_1168.pdf...
 Processing bill_1170.pdf...
 Processing bill_1171.pdf...
 Processing bill_1280.pdf...
 Processing bill_1287.pdf...
 Processing bill_1325.pdf...
 Processing bill_1376.pdf...
 Processing bill_1377.pdf...
 Processing bill_1378.pdf...
 Processing bill_1380.pdf...
 Processing bill_1382.pdf...
 Processing bill_1483.pdf...
 Processing bill_1574.pdf...
 Processing bill_1584.pdf...
 Processing bill_1585.pdf...
 Processing bill_1592.pdf...
 Processing bill_1651.pdf...
 Processing bill_1677.pdf...
 Processing bill_1696.pdf...
 Processing bill_1703.pdf...
 Processing bill_1719.pdf...
 Processing bill_1720.pdf...
 