In [1]:
import csv

# Regulations list (with exact wording)
regulations = [
    [1, "The minimum plot size of a lot shall be not less than 150 m2 in extent, unless otherwise specified by the Development Plan in effect for the particular Urban Development Area.", "minimum plot size, development plan"],
    [2, "The location of the existing buildings, if any, shall be indicated in the plan.", "existing buildings, location"],
    [3, "The scale of the plan, north line and the assessment numbers of adjoining lots or buildings ought to be clearly indicated.", "scale, north line, assessment numbers"],
    [4, "The means of access to the site and the width of the access roads shall be indicated.", "access roads, site entry"],
    [5, "All existing and proposed drains and water courses shall be indicated with the directions of the water flow.", "drainage, watercourses, flow direction"],
    [6, "A drainage system shall be indicated as a scheme to drain off natural water and rainwater and such drainage systems shall be connected to a common drain or other common waterways. Where the levels of the existing drains are not a receptacle of the outflow of the proposed drainage system, the space to accommodate an alternative drainage system shall be indicated in the plan.", "drainage system, rainwater, common waterways"]
]

# Save to CSV file
with open("regulations.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Regulation_ID", "Regulation", "Keywords"])  # Column headers
    writer.writerows(regulations)

print("CSV file 'regulations.csv' has been created successfully.")


CSV file 'regulations.csv' has been created successfully.


In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("regulations.csv")

# Display the first few rows
df.head()

Unnamed: 0,Regulation_ID,Regulation,Keywords
0,1,The minimum plot size of a lot shall be not le...,"minimum plot size, development plan"
1,2,"The location of the existing buildings, if any...","existing buildings, location"
2,3,"The scale of the plan, north line and the asse...","scale, north line, assessment numbers"
3,4,The means of access to the site and the width ...,"access roads, site entry"
4,5,All existing and proposed drains and water cou...,"drainage, watercourses, flow direction"


In [8]:
import pandas as pd
import re

# List of stop words to exclude
stop_words = {'the', 'and', 'or', 'for', 'in', 'of', 'shall', 'must', 'to', 'by', 'a', 'an', 'on', 'with', 'at', 'it', 'be','otherwise','particular'}

# Load the regulations CSV
df = pd.read_csv("regulations.csv")

# Function to extract important words, excluding stop words
def extract_keywords(text):
    keywords = re.findall(r'\b[a-zA-Z]+\b', text.lower())  # Extract words
    important_words = [word for word in keywords if word not in stop_words and len(word) > 4]  # Filter out stop words and short words
    return ", ".join(set(important_words))  # Convert to string for storage

# Apply keyword extraction to each regulation
df["Extracted_Keywords"] = df["Regulation"].apply(extract_keywords)

# Save the updated CSV
df.to_csv("processed_regulations_cleaned.csv", index=False)

# Display results
print(df[["Regulation", "Extracted_Keywords"]])

                                          Regulation  \
0  The minimum plot size of a lot shall be not le...   
1  The location of the existing buildings, if any...   
2  The scale of the plan, north line and the asse...   
3  The means of access to the site and the width ...   
4  All existing and proposed drains and water cou...   
5  A drainage system shall be indicated as a sche...   

                                  Extracted_Keywords  
0  effect, development, minimum, specified, exten...  
1           indicated, existing, buildings, location  
2  north, clearly, indicated, scale, ought, build...  
3             indicated, means, roads, width, access  
4  indicated, courses, directions, water, propose...  
5  scheme, drain, outflow, alternative, levels, i...  


In [9]:
# If you want to view the entire DataFrame without truncation
print(df.to_string())

   Regulation_ID                                                                                                                                                                                                                                                                                                                                                                                Regulation                                      Keywords                                                                                                                                                                                                    Extracted_Keywords
0              1                                                                                                                                                                                                           The minimum plot size of a lot shall be not less than 150 m2 in extent, unless otherwise specified by the Development Plan in 