In [5]:
from bs4 import BeautifulSoup
import os
import csv

def remove_html_tags(text):
    """Return the plain-text content of an HTML string.

    The markup is parsed with BeautifulSoup's 'html.parser' backend and the
    text of the resulting document is returned as a single string.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

def read_html_file(file_path):
    """Read a UTF-8 encoded text file and return its lines as a list.

    Each element keeps its trailing newline (if any), exactly as
    ``file.readlines()`` would return it.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return list(handle)

folder_path = 'raw_html'  # Folder containing the HTML files to clean

output_folder = 'clean_text'  # Folder that receives one CSV of clean text per HTML file
os.makedirs(output_folder, exist_ok=True)  # Create the output folder if it doesn't exist

# os.listdir returns every directory entry in arbitrary order. Keep regular
# files only (open() would fail on a sub-directory) and sort them so the
# processing order is the same on every run.
html_files = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

for html_file in html_files:
    html_file_path = os.path.join(folder_path, html_file)
    lines = read_html_file(html_file_path)
    html_text = ''.join(lines)  # Re-assemble the document before parsing it
    clean_text = remove_html_tags(html_text)

    # Name the output after the input file, minus its extension
    filename = os.path.splitext(html_file)[0]
    output_file_path = os.path.join(output_folder, f'clean_text_{filename}.csv')

    # One CSV row (single column) per line of extracted text
    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        writer.writerows([line] for line in clean_text.splitlines())

    print(f"Clean text saved in {output_file_path}.")


Clean text saved in clean_text\clean_text_Criminal Justice and Police Act 2001 (c. 16).csv.
Clean text saved in clean_text\clean_text_Human Rights Act 1998 (c. 42).csv.
Clean text saved in clean_text\clean_text_Public Order Act 1986 (c. 64).csv.
Clean text saved in clean_text\clean_text_Public Order Act 2023 (c. 15).csv.
Clean text saved in clean_text\clean_text_Serious Organised Crime and Police Act 2005 (c. 15).csv.


In [21]:
import csv
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models

# English stopwords from NLTK, held in a set for fast membership tests.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed locally.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Tokenise ``text`` and keep only the informative words.

    The text is lower-cased and split with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are dropped.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    lowered = text.lower()
    return [
        word
        for word in word_tokenize(lowered)
        if word.isalpha() and word not in stop_words
    ]

def perform_topic_modeling(tokens, num_topics=1, num_words=5, random_state=42):
    """Fit an LDA model on a single tokenised document and return its topics.

    Args:
        tokens: list of str tokens making up one document.
        num_topics: number of topics to fit (default 1, as before).
        num_words: number of words shown per topic (default 5, as before).
        random_state: seed handed to ``LdaModel``. LDA training is stochastic,
            so a fixed seed keeps the output identical between runs.

    Returns:
        list of str: one formatted topic string per topic, as produced by
        ``LdaModel.print_topics``; an empty list when ``tokens`` is empty.
    """
    if not tokens:
        return []  # Nothing to model

    # The "corpus" is just this one document, so the dictionary only holds its words.
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )
    # print_topics yields (topic_id, formatted_string) pairs; keep the strings.
    topics = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    return [topic_repr for _topic_id, topic_repr in topics]

# Model topics for every line of one cleaned Act.
# The path uses forward slashes so it resolves on Windows as well as on
# Linux/macOS (a backslash would be treated as part of the file name there).
data = []
with open('clean_text/clean_text_Human Rights Act 1998 (c. 42).csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for a blank record; row[0] would raise IndexError
        text = row[0]  # The clean text lives in the first (only) column
        tokens = preprocess_text(text)
        topics = perform_topic_modeling(tokens)
        data.append([text, topics])

# One row per input line: the original text and its list of topic strings
df = pd.DataFrame(data, columns=['Text', 'Topics'])

# Save DataFrame to CSV file
df.to_csv('output.csv', index=False)


In [4]:
import os
import csv
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models

# English stopwords from NLTK, held in a set for fast membership tests.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed locally.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case and tokenise ``text``, discarding noise tokens.

    A token survives only if it is purely alphabetic and is not listed in
    ``stop_words``.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalpha():
            continue
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

def perform_topic_modeling(tokens, num_topics=1, num_words=5, random_state=42):
    """Fit an LDA model on a single tokenised document and return its topics.

    Args:
        tokens: list of str tokens making up one document.
        num_topics: number of topics to fit (default 1, as before).
        num_words: number of words shown per topic (default 5, as before).
        random_state: seed handed to ``LdaModel``. LDA training is stochastic,
            so a fixed seed keeps the output identical between runs.

    Returns:
        list of str: one formatted topic string per topic, as produced by
        ``LdaModel.print_topics``; an empty list when ``tokens`` is empty.
    """
    if not tokens:
        return []  # Nothing to model

    # The "corpus" is just this one document, so the dictionary only holds its words.
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )
    # print_topics yields (topic_id, formatted_string) pairs; keep the strings.
    topics = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    return [topic_repr for _topic_id, topic_repr in topics]

# Model topics for every cleaned CSV in clean_text/ and write one result file
# per input into topic_output/.
input_folder = 'clean_text/'
output_folder = 'topic_output/'
output_files = []

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before the first write.
os.makedirs(output_folder, exist_ok=True)

# Sorted so files are processed and reported in a stable order.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith('.csv'):
        continue

    input_file = os.path.join(input_folder, file_name)
    # Swap only the leading 'clean_text_' marker; later occurrences in the name are kept.
    output_file = os.path.join(output_folder, file_name.replace('clean_text_', 'topic_', 1))
    output_files.append(output_file)

    data = []
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for a blank record; row[0] would raise IndexError
            text = row[0]  # The clean text lives in the first (only) column
            tokens = preprocess_text(text)
            topics = perform_topic_modeling(tokens)
            data.append([text, topics])

    # One row per input line: the original text and its list of topic strings
    df = pd.DataFrame(data, columns=['Text', 'Topics'])

    # Save DataFrame to CSV file
    df.to_csv(output_file, index=False)
    print(f"Saved output as {output_file}")


Saved output as topic_output/topic_Criminal Justice and Police Act 2001 (c. 16).csv
Saved output as topic_output/topic_Human Rights Act 1998 (c. 42).csv
Saved output as topic_output/topic_Public Order Act 1986 (c. 64).csv
Saved output as topic_output/topic_Public Order Act 2023 (c. 15).csv
Saved output as topic_output/topic_Serious Organised Crime and Police Act 2005 (c. 15).csv


In [6]:
import os
import glob
import pandas as pd

# Collect one DataFrame per topic file; they are concatenated once at the end.
dfs = []

# glob returns matches in arbitrary order, so sort them to keep the row order
# of the master file stable between runs.
folder_path = 'topic_output'
file_names = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

for file_name in file_names:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_name)

    # Label every row with the file it came from: the base name with the
    # leading "topic_" marker removed (only the prefix, not later occurrences).
    base_name = os.path.basename(file_name)
    if base_name.startswith('topic_'):
        base_name = base_name[len('topic_'):]
    df['Source'] = base_name

    dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what is actually missing.
if not dfs:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}; nothing to merge.")

# Concatenate all the DataFrames into a single master DataFrame
master_df = pd.concat(dfs, ignore_index=True)

# Save the master DataFrame to a CSV file
master_df.to_csv('master_output.csv', index=False)


Below is the keyword search, which matches the query as typed without using word vectorization.

In [9]:
import pandas as pd

# Display option: do not truncate long cell values when a frame is shown
pd.options.display.max_colwidth = None

# Load master_output.csv (the merged topics table) into the DataFrame searched below
df = pd.read_csv('master_output.csv')

def search_dataframe(query, frame=None):
    """Return the rows whose 'Topics' text contains ``query``.

    The match is a case-insensitive *literal* substring test, so characters
    such as ``(`` or ``*`` typed by the user are matched as-is instead of
    being interpreted as a regular expression. Rows with a missing 'Topics'
    value never match.

    Args:
        query: text to look for.
        frame: DataFrame to search; defaults to the module-level ``df``.

    Returns:
        DataFrame: the matching rows of ``frame`` (all columns kept).
    """
    if frame is None:
        frame = df
    mask = frame['Topics'].str.contains(query, case=False, regex=False, na=False)
    return frame[mask]

# Ask for a search term and look it up in the Topics column
query = input("Enter your query: ")
results = search_dataframe(query)

# Echo the query back, followed by a blank line
print(f"Query: {query}\n")

# List the hits; Topics is only used for matching, so it is left out of the listing
print(
    "Matching Results:",
    results.drop(columns=['Topics']).to_string(index=False),
    sep="\n",
)


Query: fight

Matching Results:
Empty DataFrame
Columns: [Text, Source]
Index: []


Below is the search using word vectorization with the pre-trained Google News word2vec vectors.

In [10]:
import pandas as pd
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess

# Load pre-trained word vectors from a local file in word2vec binary format
# (hence binary=True). The path is relative, so the file has to be present in
# the notebook's working directory for this cell to run.
# NOTE(review): the file name suggests the Google News 300-dimension vectors —
# confirm the file's provenance and how it should be obtained.
model_path = 'GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [33]:
import os
import glob
import pandas as pd

# Show full column contents instead of truncating long values
pd.options.display.max_colwidth = None

# The merged topics table written earlier is the frame the search runs against
df = pd.read_csv('master_output.csv')

def expand_query(query, topn=10):
    """Expand a query with the nearest word2vec neighbours of each of its words.

    The query is tokenised with gensim's ``simple_preprocess``. Every token is
    kept and followed by up to ``topn`` similar words from ``word2vec_model``.
    A token the model does not know is kept as-is without neighbours, rather
    than letting the ``KeyError`` from ``most_similar`` abort the whole search.

    Args:
        query: raw query text typed by the user.
        topn: number of neighbours to add per word (10 matches the
            ``most_similar`` default used previously).

    Returns:
        str: the original tokens and their neighbours joined by single spaces.
    """
    expanded_query = []
    for word in simple_preprocess(query):
        expanded_query.append(word)
        try:
            similar_words = word2vec_model.most_similar(word, topn=topn)
        except KeyError:
            # Out-of-vocabulary word: nothing to expand with
            continue
        # most_similar yields (word, similarity) pairs; only the words are needed
        expanded_query.extend(similar_word for similar_word, _score in similar_words)
    return ' '.join(expanded_query)

def search_dataframe(query, frame=None):
    """Return the rows whose 'Topics' text contains any word of ``query``.

    ``query`` is split on whitespace and each word is matched as a
    case-insensitive literal substring. The per-word matches are OR-ed into a
    single mask, so a row that matches several words is returned once, and an
    empty query yields an empty result instead of making ``pd.concat`` raise.
    Rows with a missing 'Topics' value never match.

    Args:
        query: whitespace-separated search words (e.g. an expanded query).
        frame: DataFrame to search; defaults to the module-level ``df``.

    Returns:
        DataFrame: the matching rows in their original order, with a fresh
        0..n-1 index.
    """
    if frame is None:
        frame = df

    # Start with "no row matches" and switch rows on word by word
    mask = pd.Series(False, index=frame.index)
    for word in query.split():
        mask = mask | frame['Topics'].str.contains(word, case=False, regex=False, na=False)

    return frame[mask].reset_index(drop=True)

# Ask for a search term, widen it with similar words, then search with the widened query
query = input("Enter your query: ")
expanded_query = expand_query(query)
results = search_dataframe(expanded_query)

# Show what was typed and what was actually searched for, followed by a blank line
print(
    f"Original Query: {query.capitalize()}",
    f"Expanded Query: {expanded_query.capitalize()}\n",
    sep="\n",
)

# Render the hits as a rich table; Topics is only used for matching, so it is hidden
print("Matching Results:")
display(results.drop(columns=['Topics']))


Original Query: Bike
Expanded Query: Bike bicycle bikes mountain_bike scooter motorcycle bike biking bmx_bike bicycles moped

Matching Results:


Unnamed: 0,Text,Source
