In [1]:
import os
import re
import fitz  # PyMuPDF

# Define the path to your documents
docs_path = r'D:\Data Analysis ML AI\irish_law'

# Function to extract text from each PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page_num in range(pdf.page_count):
            page = pdf.load_page(page_num)
            text += page.get_text("text")
    return text

# Function to clean up each document
def clean_text(text):
    # Remove common non-legal content like headers, footers, and page numbers
    text = re.sub(r'\n\d+\n', ' ', text)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text)       # Remove extra whitespace
    text = re.sub(r'HEADER_TEXT|FOOTER_TEXT', '', text)  # Replace with any common header/footer text patterns

    # Remove other unwanted content if there are any known phrases or symbols (adjust as needed)
    non_legal_phrases = ["Confidential", "Draft Copy"]
    for phrase in non_legal_phrases:
        text = text.replace(phrase, '')

    return text.strip()

# Apply extraction and cleaning function to all PDF documents in the folder
cleaned_docs = {}
for filename in os.listdir(docs_path):
    if filename.endswith('.pdf'):  # Check for PDF files
        pdf_path = os.path.join(docs_path, filename)
        print(f"Processing file: {filename}")
        
        # Extract and clean text
        raw_text = extract_text_from_pdf(pdf_path)
        cleaned_text = clean_text(raw_text)
        cleaned_docs[filename] = cleaned_text

# Check the output
if cleaned_docs:
    for filename, text in cleaned_docs.items():
        print(f"Document: {filename}")
        print("Sample cleaned text:", text[:200], "\n")  # Preview first 200 characters
else:
    print("No documents were processed or the documents are empty.")


Processing file: Policing, Security and Community Safety Act 2024.pdf
Processing file: Digital Services Act 2024.pdf
Processing file: Finance (State Guarantees, International Financial Institution Funds and Miscellaneous Provisions) Act 2024.pdf
Processing file: Coroners (Amendment) Act 2024.pdf
Processing file: Human Tissue (Transplantation, Post-Mortem, Anatomical Examination and Public Display) Act 2024.pdf
Processing file: Social Welfare and Civil Law (Miscellaneous Provisions) Act 2024.pdf
Processing file: Local Government (Mayor of Limerick) and Miscellaneous Provisions Act 2024.pdf
Processing file: Criminal Justice (Engagement of Children in Criminal Activity) Act 2024.pdf
Processing file: European Arrest Warrant (Amendment) Act 2024.pdf
Processing file: Road Traffic Act 2024.pdf
Processing file: Gas (Amendment) and Miscellaneous Provisions Act 2024.pdf
Processing file: Court Proceedings (Delays) Act 2024.pdf
Processing file: Health (Termination of Pregnancy Services) (Safe Acce

In [2]:
# Function to split text into chunks
def chunk_text(text, max_length=500):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(current_chunk) >= max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Apply chunking function to each cleaned document
chunked_docs = {}
for filename, cleaned_text in cleaned_docs.items():
    chunked_docs[filename] = chunk_text(cleaned_text)

# Optional: Preview chunked output
for doc, chunks in chunked_docs.items():
    print(f"Document: {doc} - Total Chunks: {len(chunks)}")
    print("Sample chunk:", chunks[0][:200], "\n")


Document: Policing, Security and Community Safety Act 2024.pdf - Total Chunks: 217
Sample chunk: Number 1 of 2024 Policing, Security and Community Safety Act 2024 Number 1 of 2024 POLICING, SECURITY AND COMMUNITY SAFETY ACT 2024 CONTENTS PART 1 PRELIMINARY AND GENERAL Section 1. Short title and c 

Document: Digital Services Act 2024.pdf - Total Chunks: 61
Sample chunk: Number 2 of 2024 Digital Services Act 2024 Number 2 of 2024 DIGITAL SERVICES ACT 2024 CONTENTS PART 1 PRELIMINARY AND GENERAL Section 1. Short title and commencement 2. Definitions 3. Regulations 4. S 

Document: Finance (State Guarantees, International Financial Institution Funds and Miscellaneous Provisions) Act 2024.pdf - Total Chunks: 67
Sample chunk: Number 3 of 2024 Finance (State Guarantees, International Financial Institution Funds and Miscellaneous Provisions) Act 2024 Number 3 of 2024 FINANCE (STATE GUARANTEES, INTERNATIONAL FINANCIAL INSTITU 

Document: Coroners (Amendment) Act 2024.pdf - Total Chunks: 9
Samp

In [20]:
# Define keywords for each category
category_keywords = {
    "Employment Law": [ "Leave," "Maternity," "Paternity," "Sick Leave," "Work Hours," "Minimum Wage," "Pension," "Retirement," "Overtime," "Annual Leave," "Holiday Pay," "Compensation," "Severance," "Termination," "Dismissal","Employment", "Redundancies", "Companies", "Labour", "Workplace", "Wages", "Union", "Employee", "Contract", "Dismissal", "Pensions", "Rights", "Protection", "Compensation", "Work Hours", "Officer", "Board", "Agreement"],
    "Employee Rights and Benefits": ["Leave", "Maternity", "Paternity", "Sick Leave", "Work Hours", "Minimum Wage", "Pension", "Retirement", "Overtime", "Annual Leave", "Holiday Pay", "Compensation", "Severance", "Termination", "Dismissal"],
    "Criminal Law": ["Criminal", "Justice", "Warrant", "Court", "Prison", "Prosecution", "Offence", "Sentencing", "Appeal", "Defendant", "Evidence", "Crime", "Probation", "Rehabilitation", "Victim", "Garda"],
    "Labour Relations and Unions": ["Collective Bargaining", "Strike", "Industrial Action", "Union Membership", "Labor Dispute", "Union Representation", "Employee Representation"],
    "Workplace Policies": ["Harassment", "Discrimination", "Equal Opportunity", "Health and Safety", "Workplace Conduct", "Professional Standards", "Ethics in Employment", "Grievance Procedures"],
    "Employment Contracts and Agreements": ["Fixed-Term", "Permanent", "Temporary", "Probationary Period", "Notice Period", "Employment Contract", "Part-Time", "Full-Time", "Independent Contractor"],
    "Employment Law Compliance": ["Statutory Requirements", "Regulatory Compliance", "Employment Tribunal", "Disciplinary Action", "Legal Protection", "Welfare", "Fair Treatment"],
    "Health Law": ["Health", "Pregnancy", "Medical", "Welfare", "Healthcare", "Services", "Patient", "Medicine", "Access", "Mental", "Treatment", "Care", "Public Health", "Well-being"],
    "Public Safety and Community Law": ["Policing", "Safety", "Security", "Digital", "Community", "Government", "Civil", "Public", "Law Enforcement", "Rights", "Compliance", "Local", "Infrastructure", "Regulation", "Social", "Services"],
    "Financial and Economic Law": ["Finance", "Tax", "Economic", "Guarantees", "Funds", "Investment", "Banking", "Money", "Budget", "Revenue", "Assets", "Financial", "Capital", "Insurance", "Fiscal"],
    "Technology and Digital Law": ["Digital", "Data", "Privacy", "Cybersecurity", "Online", "Internet", "Information", "Communications", "Telecommunications"],
    "Social Welfare Law": ["Social Welfare", "Assistance", "Housing", "Disability", "Unemployment", "Pension", "Family Support", "Public Assistance", "Benefits", "Community"],
    "Environmental and Energy Law": ["Environment", "Climate", "Energy", "Conservation", "Waste", "Pollution", "Natural Resources", "Biodiversity", "Sustainability"],
    "Family and Civil Law": ["Family", "Civil", "Marriage", "Divorce", "Custody", "Children", "Support", "Domestic", "Guardianship", "Adoption"],
    "Judicial and Legal Procedures": ["Court", "Judicial", "Coroner", "Hearing", "Proceedings", "Appeal", "Magistrate", "Tribunal", "Sentence"],
    "Administrative and Regulatory Law": ["Regulation", "Administrative", "Amendment", "Statutory", "Provision", "Order", "Compliance", "Notice", "Permit", "Regulations"],
    "Transport and Road Law": ["Road", "Traffic", "Vehicle", "Transport", "Driving", "Speed", "Penalty Points", "Motor"],
    "Research and Innovation Law": ["Research", "Innovation", "Agency", "Science", "Development", "Superannuation", "Board", "Funding", "Minister", "Establishment"],
    "Retirement and Pensions Law": ["Retirement", "Pension", "Savings", "Enrolment", "Superannuation"],
    "Education and Training Law": ["Education", "Training", "Schools", "Teaching", "Apprenticeship", "Learning"],
    "Corporate and Business Law": ["Corporate", "Business", "Trade", "Commerce", "Industry", "Company", "Commercial"],
    "European Union and International Law": ["European", "Commission", "Union", "Council", "Member State", "Article", "Paragraph", "Implementing", "Application", "Specified"],
    "Public Service and Governmental Bodies": ["Statutory", "Notice", "Provisions", "Required," "Permit," "Regulations," "Compliance," "Scheme," "Application," "License," "Authorised","Service", "Public", "Government", "Local", "Office", "Body", "Functions", "Statutory", "Government", "Appropriate"],
    "Legal and Statutory Notices": ["Notice", "Provisions", "Permit", "Required", "Statutory", "Apply", "Regulations"],
    "European Union Law and Regulations": ["European", "Commission", "Union", "Council", "Member State", "Article", "Directive", "Regulation", "Statutory", "Implementing"],
    "Public Service and Local Government Law": [ "Service," "Government," "Public," "Office," "National," "Functions," "Authority," "Local," "Provisions," "State," "Body," "Council","Service", "Public", "Government", "Local", "Office", "Functions", "Authority", "Council", "Statutory Body"],
    "Public Notice and Administrative Orders": ["Notice", "Order", "Publication", "Compliance", "Announcement", "Procedure", "Requirements", "Directive", "Statutory"],
    "Environmental Law and Conservation": ["Environment", "Conservation", "Climate", "Pollution", "Biodiversity", "Waste", "Natural Resources", "Sustainability"],
    "Permits and Licensing Law": ["Permit", "License", "Scheme", "Application", "Approval", "Required", "Statutory", "Authorized", "Conditions"],
    "Policing and Security Law": ["Policing," "Security," "Garda," "Public Safety," "Safety," "Offence," "Court," "Officer," "Enforcement," "Crime," "Justice," "Community",  "Policing", "Security", "Community Safety", "Garda", "Police", "Ombudsman","Commissioner", "Inspectorate", "Law Enforcement", "Investigation","Accountability", "An Garda Síochána", "Fiosrú", "Oifig","Ombudsman Póilíneachta", "Údarás Póilíneachta", "Sábháilteacht Pobail"],
    "Employment and Labour Law": ["Employment", "Labour", "Officer", "Board", "Union", "Agreement", "Rights", "Contract", "Compensation", "Workplace", "Employee", "Pension"],
    "Criminal Law": ["Criminal", "Justice", "Warrant", "Court", "Prison", "Prosecution", "Offence", "Sentencing", "Appeal", "Defendant", "Evidence", "Crime", "Probation", "Rehabilitation", "Victim", "Garda"],
    "European Union Law and Regulations": ["European," "Commission," "Council," "Member," "State," "Implementing," "Regulations," "Article," "Paragraph," "Directive," "Union," "International","European", "Commission", "Union", "Council", "Member State", "Article", "Directive", "Regulation", "Statutory", "Implementing"],
    "Health and Safety Regulations": ["Health", "Safety", "Welfare", "Well-being", "Healthcare", "Treatment", "Standards", "Compliance", "Public Health"],
    "Public Service and Local Government Law": ["Service", "Public", "Government", "Local", "Office", "Functions", "Authority", "Council", "Statutory Body"],
    "Administrative and Regulatory Law": ["Regulation", "Administrative", "Amendment", "Statutory", "Provision", "Order", "Compliance", "Notice", "Permit", "Regulations"],
    "Public Safety and Policing Law": ["Officer","Garda","Service","Public","National","Commission","Authority","Policing", "Security", "Community Safety", "Ombudsman", "Police Forces", "Prisons", "Garda", "Accountability," "Governance," "Safety Services," "Public Order",  ],
    "Criminal Justice and Law Enforcement": ["Offence", "Court", "Police", "Provisions", "Authority", "Penalties", "Judiciary", "Prosecution", "Law Enforcement", "Sentencing Guidelines", "Investigation","Criminal Justice", "Criminal Law", "Offenses", "Sentencing", "Sex Offenders", "Drug Abuse", "Public Office"],
    "Data and Privacy Law": ["Information", "Publications", "Notice", "Statutory", "Scheme", "Regulations", "Directive", "Confidentiality", "Data Retention", "Information Security", "Disclosure","Data Protection", "Freedom of Information", "Protected Disclosures", "Privacy", "Official Secrets"],
    "Employment and Labour Law": ["Employment," "Union," "Agreement," "Officer," "Publications," "Board," "Member," "Wages," "Labour," "Contract," "Workforce," "Employee","Employment", "Union", "Agreement", "Wages", "Labour", "Contract", "Statutory", "Collective Bargaining", "Workplace", "Employee Benefits", "Labour Standards","Industrial Relations", "Working Time", "Paternity Leave", "Employee Rights", "Standards in Public Office"],
    "Human Rights and Equality Law": ["Public", "Rights", "Office", "Body", "Union", "Provision", "Concerned", "Anti-Discrimination", "Accessibility", "Equal Treatment", "Social Justice","Disability", "Human Rights", "Equality", "Ethics in Public Office"],
    "Public Administration and Local Government": ["Local Government", "Ombudsman", "Shared Services", "Interpretation", "Official Secrets"],
    "Healthcare and Social Welfare Law": ["Scheme", "Benefits", "Services", "Health", "Social", "Statutory", "Medical Assistance", "Welfare Benefits", "Healthcare Services", "Social Security","Healthcare", "Social Welfare", "Public Health", "Welfare of Greyhounds"],
    "Education and Research Law": ["Board", "National", "Commission", "Authority", "Statutory", "Higher Education", "Academic Standards", "Research Funding", "Training Programs","Education", "Training", "Law Reform", "Research and Innovation"],
    "Environmental and Animal Welfare Law": ["Permit", "Scheme", "Regulations", "Environment", "Natural Resources", "Conservation", "Environmental Protection", "Wildlife Management", "Resource Allocation","Animal Welfare", "Greyhound Welfare", "Inland Fisheries"],
    "Corporate and Business Law": ["Board", "Member", "Union", "Agreement", "Notice", "Corporate Governance", "Commerce", "Commercial Regulations", "Shareholder Rights","Companies Act", "Corporate", "Business", "Industrial Relations"]
}
# Function to categorize based on keywords in the document title
def categorize_by_keywords(doc_title):
    for category, keywords in category_keywords.items():
        if any(keyword in doc_title for keyword in keywords):
            return category
    return "Miscellaneous"  # Default category for unmatched

# Apply categorization
categorized_docs = {}
for doc_title, chunks in chunked_docs.items():
    category = categorize_by_keywords(doc_title)
    if category not in categorized_docs:
        categorized_docs[category] = []
    categorized_docs[category].extend(chunks)

# Review the categorized chunks
for category, chunks in categorized_docs.items():
    print(f"\nCategory: {category} - Total Chunks: {len(chunks)}")
    print("Sample chunk:", chunks[0][:200], "\n")



Category: Public Safety and Community Law - Total Chunks: 477
Sample chunk: Number 1 of 2024 Policing, Security and Community Safety Act 2024 Number 1 of 2024 POLICING, SECURITY AND COMMUNITY SAFETY ACT 2024 CONTENTS PART 1 PRELIMINARY AND GENERAL Section 1. Short title and c 


Category: Health Law - Total Chunks: 261
Sample chunk: Number 2 of 2024 Digital Services Act 2024 Number 2 of 2024 DIGITAL SERVICES ACT 2024 CONTENTS PART 1 PRELIMINARY AND GENERAL Section 1. Short title and commencement 2. Definitions 3. Regulations 4. S 


Category: Financial and Economic Law - Total Chunks: 67
Sample chunk: Number 3 of 2024 Finance (State Guarantees, International Financial Institution Funds and Miscellaneous Provisions) Act 2024 Number 3 of 2024 FINANCE (STATE GUARANTEES, INTERNATIONAL FINANCIAL INSTITU 


Category: Judicial and Legal Procedures - Total Chunks: 9
Sample chunk: Number 4 of 2024 Coroners (Amendment) Act 2024 Number 4 of 2024 CORONERS (AMENDMENT) ACT 2024 CONTENTS Section 1. 

In [4]:
# Sample a subset of Miscellaneous chunks for analysis
sample_size =15
misc_chunks = categorized_docs["Miscellaneous"][:sample_size]

for i, chunk in enumerate(misc_chunks):
    print(f"\nSample {i+1}:\n{chunk[:500]}\n")  # Show the first 500 characters of each sample



Sample 1:
Number 1 of 2024 Policing, Security and Community Safety Act 2024 Number 1 of 2024 POLICING, SECURITY AND COMMUNITY SAFETY ACT 2024 CONTENTS PART 1 PRELIMINARY AND GENERAL Section 1. Short title and commencement 2. Interpretation (General) 3. Security services 4. Policing principles 5. Repeals 6. Expenses PART 2 AN GARDA SÍOCHÁNA CHAPTER 1 Preliminary and General (Part 2) 7. Definitions (Part 2) 8. Continuation of An Garda Síochána 9. Function of An Garda Síochána 10. Prosecution of offences by 


Sample 2:
Garda Síochána 57. Duty of members of garda personnel to account 58. Undertaking by members of garda staff 59. Representative associations 60. Proof of membership, rank or grade in An Garda Síochána CHAPTER 6 Governance and accountability of An Garda Síochána 61. Setting of priorities by Authority for policing services 62. Determination of priorities by Minister for security services 63. Strategic plan 64. Publication and implementation of strategic plan 65. Annual servic

In [5]:
import re

# Initialize a set to store unique act titles
act_titles = set()

# Regular expression to match act titles
act_pattern = re.compile(r'([A-Z][a-zA-Z\s,]+ Act \d{4})')

# Loop through each chunk in Miscellaneous and search for act titles
for chunk in misc_chunks:  # Use 'misc_chunks' here, which is defined above
    matches = act_pattern.findall(chunk)
    for match in matches:
        act_titles.add(match.strip())

# Convert the set to a sorted list for easier readability
sorted_act_titles = sorted(act_titles)

# Display the list of unique act titles
for title in sorted_act_titles:
    print(title)


Amendment of Coroners Act 1962
Amendment of Criminal Justice Act 1984
Amendment of Freedom of Information Act 2014
Amendment of Industrial Relations Act 1990
Amendment of Schedule to National Archives Act 1986
Amendment of Second Schedule to Electoral Act 1992
Amendment of Second Schedule to Ombudsman Act 1980
Animal Health and Welfare Act 2013
Application of Freedom of Information Act 2014
C of Children Act 2001
Children Act 2001
Civil Partnership and Certain Rights and Obligations of Cohabitants Act 2010
Commissions of Investigation Act 2004
Community Safety Act 2024
Companies Act 2014
Coroners Act 1962
Courts of Justice Act 1947
Criminal Justice Act 1984
Criminal Justice Act 2007
Criminal Law Act 1976
Criminal Law Act 1997
Criminal Procedure Act 1967
Data Protection Act 1988
Data Protection Act 2018
Disability Act 2005
Education and Training Boards Act 2013
Electoral Act 1992
Ethics in Public Office Act 1995
European Parliament Elections Act 1997
Europol Act 2012
Freedom of Informat

In [6]:
#Futher analysing Miscellaneous chunks to catergorise them

In [7]:
# Expanded stop words for legal and procedural text analysis
stop_words = set([
    "the", "and", "of", "to", "in", "for", "with", "by", "as", "on", "that", "is", "or", "be", "this", 
    "are", "at", "it", "from", "an", "may", "which", "has", "such", "not", "was", "have", "shall", "act", 
    "section", "subsection", "part", "regulation", "provision", "law", "title", "contents", "general", 
    "amendment", "thereof", "pursuant", "hereby", "herein", "therein", "order", "minister", "prescribed", 
    "includes", "provided", "means", "applies", "further", "within", "under", "applicable", "amended", 
    "inserted", "following", "effective", "authority", "schedule", "accordance", "respect", "relevant", 
    "any", "every", "each", "other", "been", "where", "but", "all", "upon", "into", "make", "period", 
    "certain", "only", "deemed", "case", "established", "whereas", "subject", "etc", "including", "among", 
    "through", "same", "time", "person", "persons", "those", "these", "out", "given", "new", "being"
])


In [8]:
from collections import Counter
import re

# Gather all Miscellaneous chunks into a single text body
misc_text = " ".join(categorized_docs["Miscellaneous"])

# Tokenize the text and remove expanded stop words
words = re.findall(r'\b\w+\b', misc_text)
filtered_words = [word.lower() for word in words if word.lower() not in stop_words and len(word) > 3]

# Count word frequencies
word_counts = Counter(filtered_words)

# Display the 50 most common words
common_words = word_counts.most_common(50)
for word, freq in common_words:
    print(f"{word}: {freq}")


2024: 9441
regulations: 6921
council: 4807
paragraph: 3845
article: 3173
commission: 3129
european: 3011
member: 2986
state: 2975
referred: 2834
notice: 2819
implementing: 2816
information: 2773
made: 2746
than: 2688
application: 2666
specified: 2623
employment: 2561
service: 2351
date: 2345
2022: 2270
officer: 2192
after: 2190
2023: 2118
offence: 2110
court: 2104
relation: 2054
garda: 1985
board: 1959
provisions: 1801
government: 1776
purposes: 1757
public: 1733
body: 1645
local: 1537
union: 1527
publications: 1513
national: 1510
office: 1501
before: 1462
functions: 1455
statutory: 1449
2014: 1404
appropriate: 1364
agreement: 1338
concerned: 1328
required: 1321
scheme: 1308
apply: 1279
permit: 1276


In [None]:
#Using LegalBert to further classify Misc terms