In [30]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, BatchNormalization, GlobalAveragePooling1D
from sklearn.metrics import classification_report
from scipy.stats import skew, kurtosis
import requests
from tqdm import tqdm

In [53]:
import pandas as pd
import requests

# Module-level list, initialised empty.
# NOTE(review): nothing in the visible cells appends to or reads this name — confirm it is unused before removing.
bills_no_pub =[]

def extract_bill_pdf_url(bill_id, timeout=30):
    """Find a downloadable PDF URL for a bill via the UK Parliament Bills API.

    Requests ``/api/v1/Bills/{bill_id}/Publications`` and walks the returned
    publications in order. Only publications whose ``publicationType.id`` is
    5, 19 or 6 are considered (NOTE(review): the meaning of these IDs is not
    visible here — confirm against the API's publication-type list). Inside
    such a publication, a PDF entry under ``links`` takes precedence over a
    PDF entry under ``files``; for a file the URL is assembled from the
    publication ID and the file ID.

    Args:
        bill_id: Bill identifier interpolated into the API path.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the whole extraction loop.

    Returns:
        ``{"id": bill_id, "url": <str>}`` for the first match, or ``None``
        when the request fails, returns a non-200 status, or no PDF is found.
        Errors are reported with ``print`` rather than raised.
    """
    try:
        url = f"https://bills-api.parliament.uk/api/v1/Bills/{bill_id}/Publications"
        response = requests.get(
            url, headers={"accept": "application/json"}, timeout=timeout
        )
        if response.status_code != 200:
            print(f" Failed for bill {bill_id} — status code: {response.status_code}")
            return None

        data = response.json()
        for pub in data.get("publications") or []:
            # The key can be present with a null value; `or {}` stops one such
            # publication from aborting the search through the remaining ones.
            pub_type_id = (pub.get("publicationType") or {}).get("id")
            if pub_type_id not in (5, 19, 6):
                continue

            for link in pub.get("links") or []:
                link_url = link.get("url")
                # Skip PDF links without a URL: returning {"url": None} would
                # be truthy and counted by the caller as a successful lookup.
                if link.get("contentType") == "application/pdf" and link_url:
                    return {"id": bill_id, "url": link_url}

            for file in pub.get("files") or []:
                if file.get("contentType") == "application/pdf":
                    publication_id = pub["id"]
                    file_id = file["id"]
                    constructed_url = f"https://bills.parliament.uk/publications/{publication_id}/documents/{file_id}"
                    return {"id": bill_id, "url": constructed_url}

        print(f"No valid PDF found for bill {bill_id}")
        return None
    except Exception as e:
        # Deliberately best-effort: one bad bill must not stop the batch.
        print(f"Error for bill {bill_id}: {e}")
        return None


# Bill IDs are taken from the first CSV column; rows with a missing value are
# dropped and the rest are cast to int.
csv_path = r"C:\Users\ander\Downloads\MLP\19_17 to 39.csv"
df = pd.read_csv(csv_path)
bill_ids = df.iloc[:, 0].dropna().astype(int).tolist()

# bill_data collects {"id", "url"} dicts for bills with a PDF; failed_bills
# collects the IDs for which the lookup returned nothing.
bill_data = []
failed_bills = []

for bill_id in tqdm(bill_ids, desc="Extracting URLS"):
    result = extract_bill_pdf_url(bill_id)
    if result:
        bill_data.append(result)
    else:
        failed_bills.append(bill_id)

print(f"\n Found {len(bill_data)} downloadable PDFs out of {len(bill_ids)} bills.")
for bill in bill_data:
    print(bill)

if failed_bills:
    # The header used to be followed by the count only; show the count and
    # the IDs themselves so the report matches its label.
    print(f"\nFailed bill IDs ({len(failed_bills)}):")
    print(failed_bills)
    

Extracting URLS:  80%|██████████████████████████████████████████████████▌            | 547/682 [04:49<01:02,  2.17it/s]

No valid PDF found for bill 1078


Extracting URLS:  81%|██████████████████████████████████████████████████▊            | 550/682 [04:51<01:19,  1.67it/s]

No valid PDF found for bill 1006


Extracting URLS:  85%|█████████████████████████████████████████████████████▋         | 581/682 [05:08<00:45,  2.23it/s]

No valid PDF found for bill 914


Extracting URLS:  86%|██████████████████████████████████████████████████████▏        | 587/682 [05:11<00:46,  2.05it/s]

No valid PDF found for bill 810


Extracting URLS:  97%|████████████████████████████████████████████████████████████▉  | 660/682 [05:48<00:13,  1.67it/s]

No valid PDF found for bill 29


Extracting URLS:  97%|█████████████████████████████████████████████████████████████  | 661/682 [05:49<00:14,  1.42it/s]

No valid PDF found for bill 30


Extracting URLS:  97%|█████████████████████████████████████████████████████████████▏ | 662/682 [05:50<00:17,  1.15it/s]

No valid PDF found for bill 31


Extracting URLS:  97%|█████████████████████████████████████████████████████████████▏ | 663/682 [05:51<00:13,  1.38it/s]

No valid PDF found for bill 34


Extracting URLS:  97%|█████████████████████████████████████████████████████████████▎ | 664/682 [05:52<00:12,  1.44it/s]

No valid PDF found for bill 35


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▍ | 665/682 [05:52<00:10,  1.66it/s]

No valid PDF found for bill 36


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▌ | 666/682 [05:53<00:10,  1.59it/s]

No valid PDF found for bill 37


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▌ | 667/682 [05:53<00:08,  1.87it/s]

No valid PDF found for bill 38


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▋ | 668/682 [05:53<00:07,  1.95it/s]

No valid PDF found for bill 41


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▊ | 669/682 [05:54<00:06,  2.16it/s]

No valid PDF found for bill 42


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▉ | 670/682 [05:54<00:05,  2.24it/s]

No valid PDF found for bill 43


Extracting URLS:  98%|█████████████████████████████████████████████████████████████▉ | 671/682 [05:55<00:05,  1.88it/s]

No valid PDF found for bill 44


Extracting URLS:  99%|██████████████████████████████████████████████████████████████ | 672/682 [05:55<00:05,  1.82it/s]

No valid PDF found for bill 46


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▏| 673/682 [05:57<00:06,  1.29it/s]

No valid PDF found for bill 48


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▎| 674/682 [05:57<00:05,  1.34it/s]

No valid PDF found for bill 81


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▎| 675/682 [05:58<00:04,  1.50it/s]

No valid PDF found for bill 88


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▍| 676/682 [05:58<00:03,  1.67it/s]

No valid PDF found for bill 148


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▌| 677/682 [05:59<00:02,  1.82it/s]

No valid PDF found for bill 159


Extracting URLS:  99%|██████████████████████████████████████████████████████████████▋| 678/682 [05:59<00:02,  1.97it/s]

No valid PDF found for bill 63


Extracting URLS: 100%|██████████████████████████████████████████████████████████████▋| 679/682 [06:00<00:01,  2.05it/s]

No valid PDF found for bill 100


Extracting URLS: 100%|██████████████████████████████████████████████████████████████▉| 681/682 [06:00<00:00,  2.43it/s]

No valid PDF found for bill 105


Extracting URLS: 100%|███████████████████████████████████████████████████████████████| 682/682 [06:01<00:00,  1.89it/s]

No valid PDF found for bill 127

 Found 656 downloadable PDFs out of 682 bills.
{'id': 3942, 'url': 'https://bills.parliament.uk/publications/59353/documents/6094'}
{'id': 3741, 'url': 'https://bills.parliament.uk/publications/55997/documents/4978'}
{'id': 3758, 'url': 'https://bills.parliament.uk/publications/56154/documents/5058'}
{'id': 3734, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0180/240180.pdf'}
{'id': 3910, 'url': 'https://bills.parliament.uk/publications/59064/documents/5996'}
{'id': 3750, 'url': 'https://bills.parliament.uk/publications/56088/documents/5017'}
{'id': 3767, 'url': 'https://bills.parliament.uk/publications/56209/documents/5090'}
{'id': 3739, 'url': 'https://bills.parliament.uk/publications/59167/documents/6043'}
{'id': 3762, 'url': 'https://bills.parliament.uk/publications/56187/documents/5073'}
{'id': 3825, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0199/240199.pdf'}
{'id': 3769, 'url': 'https://bills.parliament.uk/p




In [55]:
def check_pub_empty(bill_id, timeout=30):
    """Report whether the Bills API lists no publications for a bill.

    Args:
        bill_id: Bill identifier interpolated into the API path.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``True`` when the ``publications`` list is missing or empty,
        ``False`` when at least one publication is listed, and ``None`` when
        the request failed or returned a non-200 status. ``False`` and
        ``None`` are both falsy, so truthiness checks behave as before, while
        callers that care can now tell "has publications" from "lookup failed".
    """
    try:
        url = f"https://bills-api.parliament.uk/api/v1/Bills/{bill_id}/Publications"
        response = requests.get(
            url, headers={"accept": "application/json"}, timeout=timeout
        )
        if response.status_code != 200:
            print(f" Failed for bill {bill_id} — status code: {response.status_code}")
            return None

        data = response.json()
        # Explicit boolean instead of falling off the end with an implicit None.
        return not data.get("publications")
    except Exception as e:
        print(f"Error for bill {bill_id}: {e}")
        return None
          


# Split the bills that yielded no PDF into two buckets based on
# check_pub_empty(): truthy -> noPubBills, anything falsy -> billsWithPublication.
# NOTE(review): check_pub_empty returns None when the request itself fails, so
# such bills also end up in billsWithPublication — confirm that is acceptable.
noPubBills = []
billsWithPublication = []

for bill in tqdm(failed_bills, desc="Checking bills", total=len(failed_bills)):
    (noPubBills if check_pub_empty(bill) else billsWithPublication).append(bill)

print(billsWithPublication)




Checking bills: 100%|██████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00,  6.14it/s]

[1078, 1006, 914, 810]





In [49]:
# Dump the IDs of every bill for which no PDF URL was found.
print (failed_bills)

[3800, 3814, 3798, 3890, 3848, 3844, 3806, 3805, 3903, 3835, 3905, 3807, 3861, 3904, 3845, 3812, 3840, 3918, 3794, 3925, 3818, 3799, 3828, 3902, 3915, 3820, 3926, 3898, 3847, 3856, 3849, 3877, 3789, 3871, 3846, 3886, 3797, 3943, 3784, 3923, 3843, 3878, 3804, 3869, 3838, 3830, 3935, 3914, 3912, 3919, 3831, 3930, 3822, 3937, 3901, 3870, 3836, 3876, 3866, 3939, 3932, 3880, 3829, 3867, 3917, 3810, 3788, 3837, 3931, 3908, 3827, 3940, 3777, 3872, 3933, 3920, 3655, 3666, 3633, 3722, 3615, 3627, 3723, 3617, 3689, 3686, 3575, 3725, 3726, 3673, 3671, 3614, 3637, 3632, 3613, 3724, 3623, 3619, 3709, 3626, 3684, 3711, 3612, 3620, 3576, 3616, 3714, 3688, 3710, 3618, 3704, 3727, 3721, 3681, 3635, 3693, 3720, 3713, 3657, 3674, 3670, 3596, 3624, 3696, 3679, 3728, 3705, 3716, 3697, 3718, 3677, 3717, 3664, 3729, 3665, 3691, 3700, 3597, 3682, 3622, 3730, 3583, 3708, 3715, 3712, 3582, 3625, 3621, 3585, 3594, 3661, 3239, 3420, 3424, 3466, 3464, 3497, 3257, 3495, 3468, 3479, 3243, 3259, 3501, 3242, 3488, 345

In [45]:

# Report every row whose 'Short Title' occurs more than once in the CSV.
df = pd.read_csv(r"C:\Users\ander\Downloads\MLP\18_17 to 39.csv")

# keep=False flags all members of a duplicate group, not only the repeats;
# sorting by title places the members of each group next to one another.
duplicates = df[df.duplicated('Short Title', keep=False)].sort_values('Short Title')

# [Bill Id, Short Title] pairs in the sorted order.
duplicate_bills = duplicates[['Bill Id', 'Short Title']].values.tolist()

print(len(duplicate_bills))
for bill in duplicate_bills:
    print(bill)

798
[2194, 'Access to Radiotherapy Bill']
[1931, 'Access to Radiotherapy Bill']
[3800, 'Access to Telecommunications Networks Bill']
[3569, 'Access to Telecommunications Networks Bill']
[3420, 'Affordable Housing (Conversion of Commercial Property) Bill']
[3655, 'Affordable Housing (Conversion of Commercial Property) Bill']
[3424, 'Air Pollution (Local Authority Audits) Bill']
[3105, 'Air Pollution (Local Authority Audits) Bill']
[1747, 'Air Quality (Diesel Emissions in Urban Centres) Bill']
[1902, 'Air Quality (Diesel Emissions in Urban Centres) Bill']
[3546, 'Animal Welfare (Import of Dogs, Cats and Ferrets) Bill']
[3790, 'Animal Welfare (Import of Dogs, Cats and Ferrets) Bill']
[2880, 'Animal Welfare (Kept Animals) Bill']
[2880, 'Animal Welfare (Kept Animals) Bill']
[3584, 'Animal Welfare (Responsibility for Dog Attacks) Bill']
[3466, 'Animal Welfare (Responsibility for Dog Attacks) Bill']
[2430, 'Animal Welfare (Sentencing) Bill']
[2460, 'Animal Welfare (Sentencing) Bill']
[3858, '

In [56]:
# Print, one per line, the bills that check_pub_empty() did not report as empty.
for bill in billsWithPublication:
    print(bill)

1078
1006
914
810


In [9]:
# Persist the lookup results: one str(dict) per line for bills with a PDF URL,
# and one bill ID per line for bills that have no publications at all.
output_file = "URLS/18_17 to 39 url dictionary.txt"
failedBillFile = "URLS/18_17 to 39 empty bills.txt"

# open(..., "w") does not create missing parent directories, so make sure the
# target folders exist before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
os.makedirs(os.path.dirname(failedBillFile), exist_ok=True)

with open(output_file, "w", encoding="utf-8") as f:
    for bill in bill_data:
        f.write(str(bill) + "\n")

with open(failedBillFile, "w", encoding="utf-8") as f:
    for bill in noPubBills:
        f.write(str(bill) + "\n")

In [55]:

import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options

def setup_chrome_driver(chromedriver_path, download_dir):
    """Build a Chrome WebDriver configured to save downloads into ``download_dir``.

    Args:
        chromedriver_path: Path to the chromedriver executable handed to the
            Selenium service.
        download_dir: Directory set as Chrome's default download location.

    Returns:
        The ``webdriver.Chrome`` instance created with these options.
    """
    # Download-related preferences: no save prompt, automatic downloads allowed.
    # NOTE(review): "plugins.always_open_pdf_externally" presumably makes Chrome
    # save PDFs instead of opening its built-in viewer — confirm.
    download_prefs = {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,
        "profile.default_content_setting_values.automatic_downloads": 1
    }
    launch_flags = ("--no-sandbox", "--disable-gpu")

    options = Options()
    options.add_experimental_option("prefs", download_prefs)
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(
        service=ChromeService(executable_path=chromedriver_path),
        options=options,
    )

def wait_for_new_pdf(download_dir, timeout=20, min_size_kb=30, seen=None):
    """Poll ``download_dir`` until a new, finished ``.pdf`` file appears.

    A file counts as finished when its name ends in ``.pdf``, no sibling
    ``<name>.crdownload`` exists, and its size is at least ``min_size_kb``.

    Args:
        download_dir: Directory to watch.
        timeout: Maximum number of seconds to keep polling.
        min_size_kb: Minimum size in whole kilobytes for a file to be accepted.
        seen: Optional collection of file names that existed before the
            download was triggered. When omitted, the directory is
            snapshotted at call time, as before. Passing a snapshot taken
            before navigation avoids missing a download that completes before
            this function starts.

    Returns:
        Full path of the first qualifying new file, or ``None`` on timeout.
    """
    seen = set(os.listdir(download_dir)) if seen is None else set(seen)
    # A monotonic clock keeps the timeout immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        for name in set(os.listdir(download_dir)) - seen:
            if not name.endswith(".pdf"):
                continue
            full_path = os.path.join(download_dir, name)
            if os.path.exists(full_path + ".crdownload"):
                continue
            try:
                size_kb = os.path.getsize(full_path) // 1024
            except OSError:
                # The file disappeared or was renamed between listing and
                # stat; look again on the next poll instead of crashing.
                continue
            if size_kb >= min_size_kb:
                return full_path
        time.sleep(0.1)
    return None

def download_multiple_pdfs(driver, bill_data, download_dir):
    """Download each bill's PDF through the browser and save it as ``bill_<id>.pdf``.

    Bills whose target file already exists are skipped. Failures (timeouts or
    exceptions) are printed and do not stop the remaining downloads.

    Args:
        driver: Browser driver object; only its ``get(url)`` method is used.
        bill_data: Iterable of dicts with ``'id'`` and ``'url'`` keys.
        download_dir: Directory the browser downloads into; also the
            destination of the renamed files.

    Returns:
        List of bill IDs skipped because their file already existed.
    """
    skipped_files = []
    for bill in bill_data:
        bill_id = bill['id']
        url = bill['url']
        final_filename = f"bill_{bill_id}.pdf"
        final_path = os.path.join(download_dir, final_filename)

        if os.path.exists(final_path):
            print(f"[{bill_id}]  Already exists. Skipping.")
            skipped_files.append(bill_id)
            continue

        print(f"[{bill_id}]  Downloading from: {url}")

        try:
            # Snapshot BEFORE navigating. Taking it afterwards would put a
            # download that finished during driver.get() into the baseline,
            # so it would be reported as a timeout and left under its
            # original file name.
            files_before = set(os.listdir(download_dir))
            driver.get(url)

            # NOTE(review): a late-finishing file from an earlier timed-out
            # bill would still be picked up here as this bill's download.
            downloaded_path = wait_for_new_pdf(download_dir, seen=files_before)
            if downloaded_path:
                shutil.move(downloaded_path, final_path)
                print(f"[{bill_id}]  Downloaded to: {final_path}")
            else:
                print(f"[{bill_id}]  Timed out: {final_filename}")
        except Exception as e:
            print(f"[{bill_id}]  Error: {e}")
    return skipped_files

if __name__ == "__main__":
    # Local import: only this cell needs it.
    import ast

    bill_list = []

    # Each non-empty line is the str() of a {'id': ..., 'url': ...} dict.
    # ast.literal_eval parses such literals without executing arbitrary code,
    # unlike eval, so a corrupted or tampered file cannot run anything.
    # NOTE(review): the export cell writes its files as UTF-8; the same
    # encoding is assumed for this file — confirm it comes from that cell.
    with open("29_17 to 39 url dictionary.txt", "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entry = ast.literal_eval(line)
                bill_list.append(entry)

    # Preview only: the download below still runs over the full bill_list.
    bill_data_sample = bill_list[:10]

    print (bill_data_sample)

    CHROMEDRIVER_PATH = r"C:\Users\ander\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    DOWNLOAD_DIR = r"C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39"

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    driver = setup_chrome_driver(CHROMEDRIVER_PATH, DOWNLOAD_DIR)
    try:
        download_multiple_pdfs(driver, bill_list, DOWNLOAD_DIR)
    finally:
        # Always shut the browser down, even if a download raised.
        driver.quit()
        print(" Chrome closed.")


[{'id': 3733, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0057/240057.pdf'}, {'id': 3735, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0130/240130.pdf'}, {'id': 3744, 'url': 'https://publications.parliament.uk/pa/bills/cbill/59-01/0110/240110.pdf'}, {'id': 3751, 'url': 'https://bills.parliament.uk/publications/58650/documents/5923'}, {'id': 3506, 'url': 'https://bills.parliament.uk/publications/55379/documents/4804'}, {'id': 3380, 'url': 'https://www.legislation.gov.uk/ukla/2024/1/pdfs/ukla_20240001_en.pdf'}, {'id': 3508, 'url': 'https://bills.parliament.uk/publications/54911/documents/4630'}, {'id': 3507, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-04/0163/230163.pdf'}, {'id': 3509, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-04/0153/230153.pdf'}, {'id': 3172, 'url': 'https://publications.parliament.uk/pa/bills/cbill/58-03/0208/220208.pdf'}]
[3733]  Downloading from: https://publications.parliament.uk/pa/bills/cb

[2002]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2002.pdf
[2396]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0412/19412.pdf
[2396]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2396.pdf
[2020]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0294/18294.pdf
[2020]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2020.pdf
[2231]  Downloading from: https://publications.parliament.uk/pa/bills/lbill/2017-2019/0135/18135.pdf
[2231]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2231.pdf
[2252]  Downloading from: https://publications.parliament.uk/pa/bills/cbill/2017-2019/0301/18301.pdf
[2252]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_2252.pdf
[2273]  Downloading from: https://publications.parliament.uk/pa/bills/lbill/2017-2019/0157/18157.pdf
[2273]  Downloaded t

[1165]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1165.pdf
[798]  Downloading from: https://www.publications.parliament.uk/pa/privbill/201314/cityoflondon/001.pdf
[798]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_798.pdf
[1325]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0181/181table.pdf
[1325]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1325.pdf
[1280]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0119/14119.pdf
[1280]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1280.pdf
[1111]  Downloading from: https://www.publications.parliament.uk/pa/privbill/201314/hertsfilm/029563/029563.pdf
[1111]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_1111.pdf
[1287]  Downloading from: https://www.publications.parliament.uk/pa/bills/cbill/2013-2014/0172/141

[550]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_550.pdf
[547]  Downloading from: http://www.opsi.gov.uk/acts/acts2010/pdf/ukpga_20100016_en.pdf
[547]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_547.pdf
[560]  Downloading from: http://www.opsi.gov.uk/acts/acts2010/pdf/ukpga_20100010_en.pdf
[560]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_560.pdf
[356]  Already exists. Skipping.
[395]  Downloading from: http://www.opsi.gov.uk/acts/acts2009/pdf/ukpga_20090011_en.pdf
[395]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_395.pdf
[206]  Already exists. Skipping.
[383]  Downloading from: http://www.opsi.gov.uk/acts/acts2009/pdf/ukpga_20090006_en.pdf
[383]  Downloaded to: C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39\bill_383.pdf
[398]  Downloading from: https://www.publications.parliament.uk/pa/cm200809/cmbills/162/2009162.pdf
[398]  Downloaded

In [56]:
import os
import re
import fitz  

def extract_clean_flat_text(pdf_path):
    """Return a PDF's inner pages as one lowercase, flattened string.

    The first and last pages are skipped (NOTE(review): presumably cover and
    back pages — confirm), so a document with two pages or fewer yields ``""``.

    Cleaning steps, in order: newlines become spaces; ``(...)`` spans are
    removed; standalone digit runs are removed; every character other than
    ASCII letters, ``.``, ``,``, ``;`` and space becomes a space; whitespace
    runs collapse to one space; the result is lowercased and stripped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text, possibly empty.

    Raises:
        Whatever ``fitz.open`` or page extraction raises; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # range(1, len(doc) - 1) is already empty for documents of two pages
        # or fewer, so no separate length check is needed.
        pages = [doc[i].get_text() for i in range(1, len(doc) - 1)]
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()

    full_text = " ".join(pages)
    full_text = full_text.replace("\n", " ").strip()
    full_text = re.sub(r"\([^)]*\)", "", full_text)
    full_text = re.sub(r"\b\d+\b", "", full_text)
    full_text = re.sub(r"[^a-zA-Z.,; ]", " ", full_text)
    full_text = re.sub(r"\s+", " ", full_text)

    return full_text.lower().strip()



# Clean every PDF in input_folder and write all cleaned texts, separated by
# blank lines, into a single output file.
input_folder = r"C:\Users\ander\Downloads\MLP\billTextDownload\29_17 to 39"
output_folder = r"C:\Users\ander\Downloads\MLP\cleanedTextFull"
output_file = os.path.join(output_folder, "29_17 to 39 fullText.txt")


os.makedirs(output_folder, exist_ok=True)


all_cleaned_texts = []
# os.listdir returns names in arbitrary order; sorting keeps the order of the
# texts in the output file the same from run to run.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(input_folder, filename)
        print(f" Processing {filename}...")
        try:
            cleaned = extract_clean_flat_text(pdf_path)
            # Documents that clean down to an empty string are left out.
            if cleaned:
                all_cleaned_texts.append(cleaned)
        except Exception as e:
            # Best-effort: report unreadable files and carry on with the rest.
            print(f" Error reading {filename}: {e}")


final_text = "\n\n".join(all_cleaned_texts)


with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_text)

print(f"\n text saved to:\n{output_file}")


 Processing 18294.pdf...
 Error reading 18294.pdf: cannot open empty document
 Processing 200011.pdf...
 Error reading 200011.pdf: cannot open empty document
 Processing bill_1016.pdf...
 Processing bill_1086.pdf...
 Processing bill_1098.pdf...
 Processing bill_1107.pdf...
 Processing bill_1111.pdf...
 Processing bill_1165.pdf...
 Processing bill_1166.pdf...
 Processing bill_1168.pdf...
 Processing bill_1170.pdf...
 Processing bill_1171.pdf...
 Processing bill_1280.pdf...
 Processing bill_1287.pdf...
 Processing bill_1325.pdf...
 Processing bill_1376.pdf...
 Processing bill_1377.pdf...
 Processing bill_1378.pdf...
 Processing bill_1380.pdf...
 Processing bill_1382.pdf...
 Processing bill_1483.pdf...
 Processing bill_1574.pdf...
 Processing bill_1584.pdf...
 Processing bill_1585.pdf...
 Processing bill_1592.pdf...
 Processing bill_1651.pdf...
 Processing bill_1677.pdf...
 Processing bill_1696.pdf...
 Processing bill_1703.pdf...
 Processing bill_1719.pdf...
 Processing bill_1720.pdf...
 

In [None]:
# TODO: count how many unique bill IDs the CSV file contains

