In [5]:
from bs4 import BeautifulSoup
import os
import csv

def remove_html_tags(text):
    """Return the text content of an HTML string with the markup removed.

    The input is parsed with BeautifulSoup's built-in 'html.parser' backend
    and the text of the resulting tree is returned as a single string.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    plain_text = parsed.get_text()
    return plain_text

def read_html_file(file_path):
    """Read a UTF-8 encoded text file and return its lines as a list.

    Every element keeps its trailing newline, if the line had one.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines as readlines().
        return [line for line in handle]

folder_path = 'raw_html'  # Folder containing the HTML files to convert
output_folder = 'clean_text'  # Folder that receives one CSV of clean text per HTML file
os.makedirs(output_folder, exist_ok=True)  # Safe to re-run: no error if it already exists

# os.listdir also returns sub-directories (for example Jupyter's
# .ipynb_checkpoints), which cannot be opened as files, so keep regular files
# only. Sorting makes the processing order independent of the file system.
html_files = sorted(
    entry for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

for html_file in html_files:
    html_file_path = os.path.join(folder_path, html_file)
    lines = read_html_file(html_file_path)
    html_text = ''.join(lines)  # The parser wants the document as one string
    clean_text = remove_html_tags(html_text)

    # Name the output after the source file, minus its extension.
    filename = os.path.splitext(html_file)[0]
    output_file_path = os.path.join(output_folder, f'clean_text_{filename}.csv')

    # newline='' lets the csv module control line endings itself.
    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        # One single-column row per line of extracted text.
        writer.writerows([line] for line in clean_text.splitlines())

    print(f"Clean text saved in {output_file_path}.")


Clean text saved in clean_text\clean_text_Criminal Justice and Police Act 2001 (c. 16).csv.
Clean text saved in clean_text\clean_text_Human Rights Act 1998 (c. 42).csv.
Clean text saved in clean_text\clean_text_Public Order Act 1986 (c. 64).csv.
Clean text saved in clean_text\clean_text_Public Order Act 2023 (c. 15).csv.
Clean text saved in clean_text\clean_text_Serious Organised Crime and Police Act 2005 (c. 15).csv.


In [20]:
import csv
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models

# NLTK's English stopword list, held as a set so membership checks are fast
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Tokenize text and return its lower-cased, alphabetic, non-stopword tokens.

    The text is lower-cased before tokenization. Tokens that are not purely
    alphabetic, or that appear in the module-level stop_words set, are dropped.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if not word.isalpha():
            continue  # numbers, punctuation and mixed tokens are discarded
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def perform_topic_modeling(tokens, num_topics=1, num_words=5, random_state=42):
    """Fit an LDA model on one tokenized text and return its topics.

    Args:
        tokens: List of string tokens making up a single document.
        num_topics: Number of topics to fit. Defaults to 1.
        num_words: Number of top words reported per topic. Defaults to 5.
        random_state: Seed handed to gensim's LdaModel so repeated runs
            produce the same topics. Pass None for unseeded behaviour.

    Returns:
        A list with one formatted topic string per topic, as produced by
        LdaModel.print_topics, or an empty list when tokens is empty.
    """
    if not tokens:
        return []  # Nothing to model: a dictionary cannot be built from no tokens

    # The whole corpus is this single document.
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )
    # print_topics yields (topic_id, formatted_string) pairs; keep the strings.
    return [formatted for _, formatted in lda_model.print_topics(num_words=num_words)]

# Clean-text CSV to analyse. Forward slashes are accepted by Python on Windows
# as well as on POSIX systems, unlike a hard-coded backslash separator.
input_csv_path = 'clean_text/clean_text_Human Rights Act 1998 (c. 42).csv'

# Build one [text, topics] record per CSV row.
data = []
# newline='' is what the csv module expects for files it reads.
with open(input_csv_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        # The text is taken from the first column. A blank line is parsed as
        # an empty row, so treat it as empty text instead of failing with
        # IndexError on row[0].
        text = row[0] if row else ''
        tokens = preprocess_text(text)
        topics = perform_topic_modeling(tokens)
        data.append([text, topics])

# Create DataFrame
df = pd.DataFrame(data, columns=['Text', 'Topics'])

# Save DataFrame to CSV file
df.to_csv('output.csv', index=False)


Unnamed: 0,Text,Topics
0,,[]
1,"Human Rights Act 1998 (c. 42) xmlns:atom=""ht...","[0.065*""rights"" + 0.054*""human"" + 0.043*""act"" ..."
2,Extent InformationE1For the extent of this Act...,"[0.051*""sch"" + 0.042*""act"" + 0.034*""secretary""..."
3,"(a)Articles 2 to 12 and 14 of the Convention,","[0.500*""convention"" + 0.500*""articles""]"
4,"(b)Articles 1 to 3 of the First Protocol, and","[0.250*""articles"" + 0.250*""b"" + 0.250*""protoco..."
...,...,...
351,(c)the M19Judicial Pensions Act 1981; or,"[0.333*""act"" + 0.333*""c"" + 0.333*""pensions""]"
352,(d)the M20Judicial Pensions and Retirement Act...,"[0.333*""act"" + 0.333*""pensions"" + 0.333*""retir..."
353,(e)[F54the Public Service Pensions Act 2013;] and,"[0.200*""act"" + 0.200*""pensions"" + 0.200*""publi..."
354,“pensions order” means an order made under par...,"[0.273*""order"" + 0.182*""made"" + 0.182*""means"" ..."
