# 1. Define the Chatbot’s Purpose

Identify Goals: Decide what questions the chatbot should answer. For example, it might provide information about rights, duties, or specific articles of the constitution.
Target Audience: Consider who will use the chatbot (students, legal professionals, etc.) to tailor the language and complexity of responses.

# 2. Extract Text from the PDF

# 3. Text Preprocessing
Cleaning the Text:
Remove unnecessary whitespace, special characters, and irrelevant sections.
Normalize the text (e.g., lowercasing).
Tokenization: Split the text into sentences or phrases for easier searching.

# 4. Design the Question-Answering Mechanism
Keyword Matching: Use keywords or phrases to match user questions with relevant sections of the constitution.
NLP Techniques: For a more advanced approach, implement Natural Language Processing (NLP) techniques using libraries like nltk, spaCy, or transformers to understand user queries.

# 5. Build the Chatbot Interface
Web Application: Use Flask or Django to create a web interface for your chatbot.
Command-Line Interface: A simpler option where users can interact with the chatbot via the terminal.


# 6. Testing and Refinement
Test: Interact with the chatbot using various questions to evaluate its performance.
Feedback Loop: Gather user feedback to identify areas for improvement.

# 7.Enhancements (Optional)
Machine Learning Model: Consider implementing a more sophisticated model for understanding user queries.
Context Handling: Improve the chatbot's ability to handle context and follow-up questions.

# 8. Deployment
Choose a Hosting Platform: Consider deploying your chatbot on platforms like Heroku, AWS, or DigitalOcean.
Prepare for Scaling: Ensure the chatbot can handle multiple users and interactions concurrently.

# 9. Documentation and User Guidance
Create documentation for users to understand how to interact with the chatbot.
Include examples of common questions and expected responses.

# 10.  Maintenance
Regularly update the chatbot based on user interactions and feedback.
Consider adding more features or information over time to enhance the user experience.

## FULL CONSTITUTION AS TEXT

In [95]:
import pdfplumber
import spacy
from spellchecker import SpellChecker

# Path to your PDF file
pdf_path = "E:\SCHOOL\Phase 5\Kenya Constitution\Kenya Constitution.pdf"

# Function to extract text from specific pages
def extract_specific_pages(pdf_path, start_page, end_page):
    with pdfplumber.open(pdf_path) as pdf:
        extracted_text = ""
        for page_num in range(start_page, end_page):
            page = pdf.pages[page_num]
            extracted_text += page.extract_text() + "\n"
        return extracted_text

# Define Chapter 1 keywords
chapter_1_keywords = {
    "Sovereignty of the people": "sovereignty",
    "Supremacy of this Constitution": "supremacy",
    "Defence of this Constitution": "defence"
}

# Define Chapter 2 keywords
chapter_2_keywords = {
    "Declaration of the republic": "declaration",
    "Territory of Kenya": "territory",
    "Devolution and access to services": "devolution",
    "National, official and other languages": "languages",
    "State and Religion": "religion",
    "National symbols and national days": "symbol",
    "The national days are": "day",
    "National values and principles": "value",
    "Culture": "culture"
}

# Define Chapter 3 keywords
chapter_3_keywords = {
    "Entitlements of citizens": "entitlement",
    "Retention and acquisition of citizenship": "retention",
    "Citizenship by birth": "birth",
    "Citizenship by registration": "registration",
    "Dual citizenship": "dual",
    "Revocation of citizenship": "revocation",
    "Legislation on citizenship": "legislation"
}

def split_chapter(chapter_text, keywords):
    # Split at key headings and strip extra whitespace
    sections = {value: [] for value in keywords.values()}

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        # Check for section headers using the keywords
        for header, section_key in keywords.items():
            if stripped_line.startswith(header):
                current_section = section_key
                break

        # Append line to the current section if it's set
        if current_section:
            # Prevent adding "days" content to "symbols" for Chapter 2
            if current_section == "symbol" and "The national days are" in stripped_line:
                current_section = "day"

            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])

    return sections

# Extract and process Chapter 1
chapter_1 = extract_specific_pages(pdf_path, 12, 14)
chapter_1_trimmed = chapter_1.split("CHAPTER TWO")[0].strip()
chapter_1_sections = split_chapter(chapter_1_trimmed, chapter_1_keywords)

# Extract and process Chapter 2
chapter_2 = extract_specific_pages(pdf_path, 13, 16)
chapter_2_cleaned = chapter_2.split("CHAPTER TWO")[1].strip()
chapter_2_trimmed = chapter_2_cleaned.split("CHAPTER THREE")[0].strip()
chapter_2_sections = split_chapter(chapter_2_trimmed, chapter_2_keywords)

# Extract and process Chapter 3
chapter_3 = extract_specific_pages(pdf_path, 15, 19)
chapter_3_cleaned = chapter_3.split("CHAPTER THREE")[1].strip()
chapter_3_trimmed = chapter_3_cleaned.split("CHAPTER FOUR")[0].strip()
chapter_3_sections = split_chapter(chapter_3_trimmed, chapter_3_keywords)

# Print to verify the results
print("\nChapter 1 Sections:", chapter_1_sections.keys())
print("\nChapter 2 Sections:", chapter_2_sections.keys())
print("\nChapter 3 Sections:", chapter_3_sections.keys())



Chapter 1 Sections: dict_keys(['sovereignty', 'supremacy', 'defence'])

Chapter 2 Sections: dict_keys(['declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture'])

Chapter 3 Sections: dict_keys(['entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])


In [17]:
# Path to your PDF file
pdf_path = "E:\SCHOOL\Phase 5\Kenya Constitution\Kenya Constitution.pdf"

# Function to extract text from specific pages
def extract_specific_pages(pdf_path, start_page, end_page):
    with pdfplumber.open(pdf_path) as pdf:
        extracted_text = ""
        for page_num in range(start_page, end_page):
            page = pdf.pages[page_num]
            extracted_text += page.extract_text() + "\n"
        return extracted_text


# Chapter 1

In [18]:
chapter_1 = extract_specific_pages(pdf_path, 12,14 )
chapter_1

'Constitution of Kenya, 2010 13\nTHE CONSTITUTION OF KENYA\nCHAPTER ONE—SOVEREIGNTY OF THE PEOPLE AND\nSUPREMACY OF THIS CONSTITUTION\nSovereignty of the people.\n1. (1) All sovereign power belongs to the people of Kenya and\nshall be exercised only in accordance with this Constitution.\n(2) The people may exercise their sovereign power either directly\nor through their democratically elected representatives.\n(3) Sovereign power under this Constitution is delegated to the\nfollowing State organs, which shall perform their functions in\naccordance with this Constitution—\n(a) Parliament and the legislative assemblies in the county\ngovernments;\n(b) the national executive and the executive structures in the\ncounty governments; and\n(c) the Judiciary and independent tribunals.\n(4) The sovereign power of the people is exercised at—\n(a) the national level; and\n(b) the county level.\nSupremacy of this Constitution.\n2. (1) This Constitution is the supreme law of the Republic and\nbinds

### Since page 13(of the Constitution) contains a section of chapter 2, then we'll need to trim the page to ensure only chapter one info is stored.

In [19]:
chapter_1_trimmed = chapter_1.split("CHAPTER TWO")[0].strip()
chapter_1_trimmed

'Constitution of Kenya, 2010 13\nTHE CONSTITUTION OF KENYA\nCHAPTER ONE—SOVEREIGNTY OF THE PEOPLE AND\nSUPREMACY OF THIS CONSTITUTION\nSovereignty of the people.\n1. (1) All sovereign power belongs to the people of Kenya and\nshall be exercised only in accordance with this Constitution.\n(2) The people may exercise their sovereign power either directly\nor through their democratically elected representatives.\n(3) Sovereign power under this Constitution is delegated to the\nfollowing State organs, which shall perform their functions in\naccordance with this Constitution—\n(a) Parliament and the legislative assemblies in the county\ngovernments;\n(b) the national executive and the executive structures in the\ncounty governments; and\n(c) the Judiciary and independent tribunals.\n(4) The sovereign power of the people is exercised at—\n(a) the national level; and\n(b) the county level.\nSupremacy of this Constitution.\n2. (1) This Constitution is the supreme law of the Republic and\nbinds

In [21]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess the user query using spaCy
def preprocess_query(query):
    # Parse the query with spaCy
    doc = nlp(query)
    # Normalize the query: lowercase, lemmatize, and remove stopwords
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Example usage of the preprocessing function
user_query = "What is the supremacy of the constitution?"
processed_query = preprocess_query(user_query)
print(processed_query)


supremacy constitution


Works perfectly! I gave the input query and the function lemmatized, lowercased and removed the stop words from it returning a string with succulent info.

In [22]:
chapter_1_trimmed

'Constitution of Kenya, 2010 13\nTHE CONSTITUTION OF KENYA\nCHAPTER ONE—SOVEREIGNTY OF THE PEOPLE AND\nSUPREMACY OF THIS CONSTITUTION\nSovereignty of the people.\n1. (1) All sovereign power belongs to the people of Kenya and\nshall be exercised only in accordance with this Constitution.\n(2) The people may exercise their sovereign power either directly\nor through their democratically elected representatives.\n(3) Sovereign power under this Constitution is delegated to the\nfollowing State organs, which shall perform their functions in\naccordance with this Constitution—\n(a) Parliament and the legislative assemblies in the county\ngovernments;\n(b) the national executive and the executive structures in the\ncounty governments; and\n(c) the Judiciary and independent tribunals.\n(4) The sovereign power of the people is exercised at—\n(a) the national level; and\n(b) the county level.\nSupremacy of this Constitution.\n2. (1) This Constitution is the supreme law of the Republic and\nbinds

Next up, I'll split the chapter into three different sections which will increase the chatbot's accuracy in answering questions. This means that the bot will be able to return the section on 'supremacy of the constitution' exclusively without the other parts on sovereignty and defence in chapter 1 of the constitution. This makes it efficient in answering the questions.

In [23]:
def split_chapter(chapter_text):
    # Split at key headings and strip extra whitespace
    sections = {
        "sovereignty": [],
        "supremacy": [],
        "defence": []   
    }

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        if stripped_line.startswith("Sovereignty of the people"):
            current_section = "sovereignty"
        elif stripped_line.startswith("Supremacy of this Constitution"):
            current_section = "supremacy"
        elif stripped_line.startswith("Defence of this Constitution"):
            current_section = 'defence'

        # Append line to the current section if it's set
        if current_section:
            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])

    return sections

# Split the chapter into sections
chapter_1_sections = split_chapter(chapter_1_trimmed)

print("Example")
print("\nSupremacy Section:\n", chapter_1_sections['supremacy'])
print("\nDefence Section:\n", chapter_1_sections['defence'])



Example

Supremacy Section:
 Supremacy of this Constitution.
2. (1) This Constitution is the supreme law of the Republic and
binds all persons and all State organs at both levels of government.
(2) No person may claim or exercise State authority except as
authorised under this Constitution.
(3) The validity or legality of this Constitution is not subject to
challenge by or before any court or other State organ.
(4) Any law, including customary law, that is inconsistent with this
Constitution is void to the extent of the inconsistency, and any act or
omission in contravention of this Constitution is invalid.
(5) The general rules of international law shall form part of the
law of Kenya.
14 Constitution of Kenya, 2010
(6) Any treaty or convention ratified by Kenya shall form part of
the law of Kenya under this Constitution.

Defence Section:
 Defence of this Constitution.
3. (1) Every person has an obligation to respect, uphold and
defend this Constitution.
(2) Any attempt to establish a

In [24]:
chapter_1_sections.keys()

dict_keys(['sovereignty', 'supremacy', 'defence'])

In [25]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess the user query using spaCy
def preprocess_query(query):
    # Parse the query with spaCy
    doc = nlp(query)
    # Normalize the query: lowercase, lemmatize, and remove stopwords
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Define the sections variable with Chapter 1
sections = chapter_1_sections

# Define the QA mapping based on key phrases and corresponding sections
qa_mapping = {
    "supremacy": "supremacy of this constitution",
    "sovereignty": "sovereignty of the people",
    "defence": "defence of this constitution"
    # Add more mappings as needed
}

# Update the Q&A system to use preprocessed queries
def answer_question_nlp(query, sections, qa_mapping):
    # Preprocess the user query
    processed_query = preprocess_query(query)
    
    # Search for a key in qa_mapping that matches the preprocessed query
    for key in qa_mapping:
        if key in processed_query:
            # Return the relevant section in the text
            return sections[key]
            
    return "Sorry, I couldn't find an answer to your question."

# Example usage
user_query = "What is the supremacy of the constitution?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
print(answer)


Supremacy of this Constitution.
2. (1) This Constitution is the supreme law of the Republic and
binds all persons and all State organs at both levels of government.
(2) No person may claim or exercise State authority except as
authorised under this Constitution.
(3) The validity or legality of this Constitution is not subject to
challenge by or before any court or other State organ.
(4) Any law, including customary law, that is inconsistent with this
Constitution is void to the extent of the inconsistency, and any act or
omission in contravention of this Constitution is invalid.
(5) The general rules of international law shall form part of the
law of Kenya.
14 Constitution of Kenya, 2010
(6) Any treaty or convention ratified by Kenya shall form part of
the law of Kenya under this Constitution.


In [26]:
# Example 2
user_query_2 = 'Explain what the defence the people states'
answer_2 = answer_question_nlp(user_query_2, sections, qa_mapping)
answer_2

'Defence of this Constitution.\n3. (1) Every person has an obligation to respect, uphold and\ndefend this Constitution.\n(2) Any attempt to establish a government otherwise than in\ncompliance with this Constitution is unlawful.'

## Chapter 2

In [27]:
chapter_2 = extract_specific_pages(pdf_path, 13,16)

Since the first page of chapter 2 has some info from chapter 1, we need to remove the chapter 1 section.

In [28]:
chapter_2_cleaned = chapter_2.split("CHAPTER TWO")[1].strip()
chapter_2_cleaned

'—THE REPUBLIC\nDeclaration of the Republic.\n4. (1) Kenya is a sovereign Republic.\n(2) The Republic of Kenya shall be a multi-party democratic State\nfounded on the national values and principles of governance referred\nto in Article 10.\nTerritory of Kenya.\n5. Kenya consists of the territory and territorial waters comprising\nKenya on the effective date, and any additional territory and territorial\nwaters as defined by an Act of Parliament.\nDevolution and access to services.\n6. (1) The territory of Kenya is divided into the counties specified\nin the First Schedule.\n(2) The governments at the national and county levels are distinct\nand inter-dependent and shall conduct their mutual relations on the\nbasis of consultation and cooperation.\n(3) A national State organ shall ensure reasonable access to its\nservices in all parts of the Republic, so far as it is appropriate to do so\nhaving regard to the nature of the service.\nNational, official and other languages.\n7. (1) The na

The last page also has a bit of chapter 3, so we'll remove it too.

In [29]:
# Find the start of chapter 3 and remove it
chapter_2_trimmed = chapter_2_cleaned.split("CHAPTER THREE")[0].strip()
chapter_2_trimmed

'—THE REPUBLIC\nDeclaration of the Republic.\n4. (1) Kenya is a sovereign Republic.\n(2) The Republic of Kenya shall be a multi-party democratic State\nfounded on the national values and principles of governance referred\nto in Article 10.\nTerritory of Kenya.\n5. Kenya consists of the territory and territorial waters comprising\nKenya on the effective date, and any additional territory and territorial\nwaters as defined by an Act of Parliament.\nDevolution and access to services.\n6. (1) The territory of Kenya is divided into the counties specified\nin the First Schedule.\n(2) The governments at the national and county levels are distinct\nand inter-dependent and shall conduct their mutual relations on the\nbasis of consultation and cooperation.\n(3) A national State organ shall ensure reasonable access to its\nservices in all parts of the Republic, so far as it is appropriate to do so\nhaving regard to the nature of the service.\nNational, official and other languages.\n7. (1) The na

In [30]:
def split_chapter(chapter_text):
    # Split at key headings and strip extra whitespace
    sections = {
        "declaration": [],
        "territory": [],
        "devolution": [],
        "languages": [],
        "religion": [],
        "symbols": [],
        "days" : [],
        "value": [],
        "culture": []
    }

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        if stripped_line.startswith("Declaration of the republic"):
            current_section = "declaration"
        elif stripped_line.startswith("Territory of Kenya"):
            current_section = "territory"
        elif stripped_line.startswith("Devolution and access to services"):
            current_section = 'devolution'
        elif stripped_line.startswith("National, official and other languages"):
            current_section = "languages"
        elif stripped_line.startswith("State and Religion"):
            current_section = "religion"
        elif stripped_line.startswith("National symbols and national days"):
            current_section = "symbols"
        elif stripped_line.startswith("The national days are"):
            current_section = "days"
        elif stripped_line.startswith("National values and principles"):
            current_section = "value"
        elif stripped_line.startswith("Culture"):
            current_section = "culture"

        # Append line to the current section if it's set
        if current_section:
            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])
    

    return sections

# Split the chapter into sections
chapter_2_sections = split_chapter(chapter_2_trimmed)

print("Example")
print("\n National Days Section:\n", chapter_2_sections["symbols"])

Example

 National Days Section:
 National symbols and national days.
9. (1) The national symbols of the Republic are—
(a) the national flag;
(b) the national anthem;
(c) the coat of arms; and
(d) the public seal.
(2) The national symbols are as set out in the Second Schedule.
(3) The national days are—
(a) Madaraka Day, to be observed on 1st June;
(b) Mashujaa Day, to be observed on 20th October; and
(c) Jamhuri Day, to be observed on 12th December.
(4) A national day shall be a public holiday.
(5) Parliament may enact legislation prescribing other public
holidays, and providing for observance of public holidays.


It seems that the national days and symbols are being included under one single section. 
This is quite a small issue but since I want to increase the bot's efficiency I'll have to separate the national symbols and national days.

In [31]:
def split_chapter(chapter_text):
    # Split at key headings and strip extra whitespace
    sections = {
        "declaration": [],
        "territory": [],
        "devolution": [],
        "languages": [],
        "religion": [],
        "symbol": [],
        "day": [],
        "value": [],
        "governance": [],
        "culture": []
    }

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        # Start of new sections
        if stripped_line.startswith("Declaration of the republic"):
            current_section = "declaration"
        elif stripped_line.startswith("Territory of Kenya"):
            current_section = "territory"
        elif stripped_line.startswith("Devolution and access to services"):
            current_section = 'devolution'
        elif stripped_line.startswith("National, official and other languages"):
            current_section = "languages"
        elif stripped_line.startswith("State and Religion"):
            current_section = "religion"
        elif stripped_line.startswith("National symbols and national days"):
            current_section = "symbol"
        elif stripped_line.startswith("The national days are"):
            current_section = "day"
        elif stripped_line.startswith("National values and principles"):
            current_section = "value"
        elif stripped_line.startswith("Culture"):
            current_section = "culture"

        # Append line to the current section if it's set
        if current_section:
            # Prevent adding "days" content to "symbols"
            if current_section == "symbol" and "The national days are" in stripped_line:
                current_section = "day"

            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])

    return sections

# Split the chapter into sections
chapter_2_sections = split_chapter(chapter_2_trimmed)

# Print to verify the results
print("\n National Symbols Section:\n", chapter_2_sections["symbol"])
print("\n National Days Section:\n", chapter_2_sections["day"])



 National Symbols Section:
 National symbols and national days.
9. (1) The national symbols of the Republic are—
(a) the national flag;
(b) the national anthem;
(c) the coat of arms; and
(d) the public seal.
(2) The national symbols are as set out in the Second Schedule.

 National Days Section:
 (3) The national days are—
(a) Madaraka Day, to be observed on 1st June;
(b) Mashujaa Day, to be observed on 20th October; and
(c) Jamhuri Day, to be observed on 12th December.
(4) A national day shall be a public holiday.
(5) Parliament may enact legislation prescribing other public
holidays, and providing for observance of public holidays.


Noice! The function worked effectively.

Next, I'll work on the basic user query handling

In [96]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess the user query using spaCy
def preprocess_query(query):
    # Parse the query with spaCy
    doc = nlp(query)
    # Normalize the query: lowercase, lemmatize, and remove stopwords
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Define the sections variable with Chapter 2
sections = chapter_2_sections

# Define the QA mapping based on key phrases and corresponding sections
qa_mapping = {
    "declaration": "declaration of the republic",
    "territory": "territory of kenya", 
    "devolution": "devolution and access to service",
    "languages": "national, official and other languages",
    "religion": "state and religion",
    "symbol": "national symbols and national days",
    "day": "national symbols and national days",
    "value": "national values and principles of governance",
    "governance": "national values and principles of governance",
    "culture": "culture"
}



# Update the Q&A system to use preprocessed queries
def answer_question_nlp(query, sections, qa_mapping):
    # Preprocess the user query
    processed_query = preprocess_query(query)

    # Debug
    print(f"Processed query: {processed_query}")
    
    # Search for a key in qa_mapping that matches the preprocessed query
    for key in qa_mapping:
        if key in processed_query:
            # Debug line
            print(f"key: {key}")
            # Return the relevant section in the text
            return sections[key]    
        
    return "Sorry, I couldn't find an answer to your question."

# Example usage
user_query = "National days in the constitution?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
print(answer)

Processed query: national day constitution
key: day
(3) The national days are—
(a) Madaraka Day, to be observed on 1st June;
(b) Mashujaa Day, to be observed on 20th October; and
(c) Jamhuri Day, to be observed on 12th December.
(4) A national day shall be a public holiday.
(5) Parliament may enact legislation prescribing other public
holidays, and providing for observance of public holidays.


Note: I had to change the symbols and days section to symbol and day section for correct qa mapping. 
The nlp model lemmatizes the plural words to their singular form which is not contained in the original section of the constitution.

In [76]:
user_query = "What does the culture entail?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
answer

Processed query: culture entail
key: culture


'Culture.\n11. (1) This Constitution recognises culture as the foundation of\nthe nation and as the cumulative civilization of the Kenyan people and\nnation.\n(2) The State shall—\n(a) promote all forms of national and cultural expression through\nliterature, the arts, traditional celebrations, science,\ncommunication, information, mass media, publications,\nlibraries and other cultural heritage;\n(b) recognise the role of science and indigenous technologies in\nthe development of the nation; and\n(c) promote the intellectual property rights of the people of Kenya.\n(3) Parliament shall enact legislation to—\n(a) ensure that communities receive compensation or royalties for\nthe use of their cultures and cultural heritage; and\n(b) recognise and protect the ownership of indigenous seeds and\nplant varieties, their genetic and diverse characteristics and\ntheir use by the communities of Kenya.'

Noice...The model handles basic user query from chapter 2

last basic prompt for this chapter

In [77]:
user_query = "what about the constitution state about values?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
answer

Processed query: constitution state value
key: value


'National values and principles of governance.\n10. (1) The national values and principles of governance in this\nArticle bind all State organs, State officers, public officers and all\npersons whenever any of them—\n(a) applies or interprets this Constitution;\n(b) enacts, applies or interprets any law; or\n(c) makes or implements public policy decisions.\n(2) The national values and principles of governance include—\n16 Constitution of Kenya, 2010\n(a) patriotism, national unity, sharing and devolution of power, the\nrule of law, democracy and participation of the people;\n(b) human dignity, equity, social justice, inclusiveness, equality,\nhuman rights, non-discrimination and protection of the\nmarginalised;\n(c) good governance, integrity, transparency and accountability;\nand\n(d) sustainable development.'

Works great!

## Chapter 3

In [78]:
chapter_3 = extract_specific_pages(pdf_path, 15,19 )
chapter_3

'16 Constitution of Kenya, 2010\n(a) patriotism, national unity, sharing and devolution of power, the\nrule of law, democracy and participation of the people;\n(b) human dignity, equity, social justice, inclusiveness, equality,\nhuman rights, non-discrimination and protection of the\nmarginalised;\n(c) good governance, integrity, transparency and accountability;\nand\n(d) sustainable development.\nCulture.\n11. (1) This Constitution recognises culture as the foundation of\nthe nation and as the cumulative civilization of the Kenyan people and\nnation.\n(2) The State shall—\n(a) promote all forms of national and cultural expression through\nliterature, the arts, traditional celebrations, science,\ncommunication, information, mass media, publications,\nlibraries and other cultural heritage;\n(b) recognise the role of science and indigenous technologies in\nthe development of the nation; and\n(c) promote the intellectual property rights of the people of Kenya.\n(3) Parliament shall enact 

The first page in this pdf extract contains both the conclusion to chapter 2 and the beginning of chapter 3
We'll therefore have to split the page to extract chapter 3



In [79]:
chapter_3_cleaned = chapter_3.split("CHAPTER THREE")[1].strip()
chapter_3_cleaned

'—CITIZENSHIP\nEntitlements of citizens.\n12. (1) Every citizen is entitled to—\n(a) the rights, privileges and benefits of citizenship, subject to the\nlimits provided or permitted by this Constitution; and\n(b) a Kenyan passport and any document of registration or\nidentification issued by the State to citizens.\nConstitution of Kenya, 2010 17\n(2) A passport or other document referred to in clause (1) (b) may\nbe denied, suspended or confiscated only in accordance with an Act of\nParliament that satisfies the criteria referred to in Article 24.\nRetention and acquisition of citizenship.\n13. (1) Every person who was a citizen immediately before the\neffective date retains the same citizenship status as of that date.\n(2) Citizenship may be acquired by birth or registration.\n(3) Citizenship is not lost through marriage or the dissolution of\nmarriage.\nCitizenship by birth.\n14. (1) A person is a citizen by birth if on the day of the person’s\nbirth, whether or not the person is bor

We'll also split the last page because there is some chapter 4 info on it

In [80]:
chapter_3_trimmed = chapter_3_cleaned.split("CHAPTER FOUR")[0].strip()
chapter_3_trimmed

'—CITIZENSHIP\nEntitlements of citizens.\n12. (1) Every citizen is entitled to—\n(a) the rights, privileges and benefits of citizenship, subject to the\nlimits provided or permitted by this Constitution; and\n(b) a Kenyan passport and any document of registration or\nidentification issued by the State to citizens.\nConstitution of Kenya, 2010 17\n(2) A passport or other document referred to in clause (1) (b) may\nbe denied, suspended or confiscated only in accordance with an Act of\nParliament that satisfies the criteria referred to in Article 24.\nRetention and acquisition of citizenship.\n13. (1) Every person who was a citizen immediately before the\neffective date retains the same citizenship status as of that date.\n(2) Citizenship may be acquired by birth or registration.\n(3) Citizenship is not lost through marriage or the dissolution of\nmarriage.\nCitizenship by birth.\n14. (1) A person is a citizen by birth if on the day of the person’s\nbirth, whether or not the person is bor

In [81]:
def split_chapter(chapter_text):
    # Split at key headings and strip extra whitespace
    sections = {
        "entitlement": [],
        "retention": [],
        "birth": [],
        "registration": [],
        "dual": [],
        "revocation": [],
        "legislation" : [],
    }

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        if stripped_line.startswith("Entitlements of citizens"):
            current_section = "entitlement"
        elif stripped_line.startswith("Retention and acquisition of citizenship"):
            current_section = "retention"
        elif stripped_line.startswith("Citizenship by birth"):
            current_section = 'birth'
        elif stripped_line.startswith("Citizenship by registration"):
            current_section = "registration"
        elif stripped_line.startswith("Dual citizenship"):
            current_section = "dual"
        elif stripped_line.startswith("Revocation of citizenship"):
            current_section = "revocation"
        elif stripped_line.startswith("Legislation on citizenship"):
            current_section = "legislation"

        # Append line to the current section if it's set
        if current_section:
            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])

    return sections

In [82]:
chapter_3_sections = split_chapter(chapter_3_trimmed)
chapter_3_sections.keys()

dict_keys(['entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])

In [97]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess the user query using spaCy
def preprocess_query(query):
    # Parse the query with spaCy
    doc = nlp(query)
    # Normalize the query: lowercase, lemmatize, and remove stopwords
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Define the sections variable with Chapter 2
sections = chapter_3_sections

# Define the QA mapping based on key phrases and corresponding sections
qa_mapping = {
    "entitlement": "entitlement of citizens",
        "retention": "retention and acquisition of citizenship",
        "birth": "citizenship by birth",
        "registration": "citizenship by registration",
        "dual": "dual citizenship",
        "revocation": "revocation of citizenship",
        "legislation" : "legislation on citizenship"
}



# Update the Q&A system to use preprocessed queries
def answer_question_nlp(query, sections, qa_mapping):
    # Preprocess the user query
    processed_query = preprocess_query(query)

    # Debug
    print(f"Processed query: {processed_query}")
    
    # Search for a key in qa_mapping that matches the preprocessed query
    for key in qa_mapping:
        if key in processed_query:
            # Return the relevant section in the text
            return sections[key]
            
    return "Sorry, I couldn't find an answer to your question."

# Example usage
user_query = "revocation on citizenship?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
print(answer)


Processed query: revocation citizenship
Revocation of citizenship.
17. (1) If a person acquired citizenship by registration, the
citizenship may be revoked if the person—
(a) acquired the citizenship by fraud, false representation or
concealment of any material fact;
(b) has, during any war in which Kenya was engaged, unlawfully
traded or communicated with an enemy or been engaged in
or associated with any business that was knowingly carried
on in such a manner as to assist an enemy in that war;
(c) has, within five years after registration, been convicted of an
offence and sentenced to imprisonment for a term of three
years or longer; or
(d) has, at any time after registration, been convicted of treason,
or of an offence for which—
(i) a penalty of at least seven years imprisonment may be
imposed; or
(ii) a more severe penalty may be imposed.
(2) The citizenship of a person who was presumed to be a citizen
by birth, as contemplated in Article 14 (4), may be revoked if—
(a) the citizen

Let's do two more queries to see if it works correctly

In [98]:
user_query = "How to get citizenship by registration?"
answer = answer_question_nlp(user_query, sections, qa_mapping)
print(answer)


Processed query: citizenship registration
Citizenship by registration.
15. (1) A person who has been married to a citizen for a period of
at least seven years is entitled on application to be registered as a
citizen.
(2) A person who has been lawfully resident in Kenya for a
continuous period of at least seven years, and who satisfies the
conditions prescribed by an Act of Parliament, may apply to be
registered as a citizen.
(3) A child who is not a citizen, but is adopted by a citizen, is
entitled on application to be registered as a citizen.
18 Constitution of Kenya, 2010
(4) Parliament shall enact legislation establishing conditions on
which citizenship may be granted to individuals who are citizens of
other countries.
(5) This Article applies to a person as from the effective date, but
any requirements that must be satisfied before the person is entitled to
be registered as a citizen shall be regarded as having been satisfied
irrespective of whether the person satisfied them before

In [99]:
test_2 = "What about dual citizenship?"
answer_ = answer_question_nlp(test_2, sections, qa_mapping)
print(answer_)

Processed query: dual citizenship
Dual citizenship.
16. A citizen by birth does not lose citizenship by acquiring the
citizenship of another country.


The code works perfectly and returns the intended response!

# NLU

Now to work on misspellings and synonyms.
We'll use SpellChecker

## Debugging code

In [100]:
# Define synonym mapping
synonyms = {
    "supremacy" : ["supremacy", "authority", "ultimate power"],
    "sovereignty" : ["sovereignty", "power of the people", "authority of the people"],
    "defence" : ["defense", "protection", "preservation"]
}

# QA mapping
qa_mapping = {
    "supremacy": "supremacy",
    "sovereignty": "sovereignty",
    "defence": "defence"
}

sections = chapter_1_sections

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)


    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word) # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)

    return corrected_input

            

# Match with synonym support
def match_with_synonyms(query, qa_mapping, synonyms):
    processed_query = preprocess_query(query)
    print(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    print(f"Corrected Query: {corrected_query}")  # Debugging line


    for key, value in qa_mapping.items():
        print(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            print(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                print(f"Match found with synonym: {synonym}")  # Debugging line
                return key  # Only return section value of the key
    
    print("No match found")  # Debugging line if no match is found
    return None

# Answer function with synonym and fuzzy matching
def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms)
    
    if section_key:
        # Retrieve the relevant section from the specified chapter
        return sections.get(section_key, "Section not found.")
    
    return "Sorry, I couldn't find an answer to your question."


In [101]:
user_query = "What is the ultimate power of the constitution?"
answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
print(answer)


Processed Query: ultimate power constitution
Corrected Query: ultimate power constitution
Checking key: supremacy, value: supremacy
Trying synonym: supremacy
Trying synonym: authority
Trying synonym: ultimate power
Match found with synonym: ultimate power
Supremacy of this Constitution.
2. (1) This Constitution is the supreme law of the Republic and
binds all persons and all State organs at both levels of government.
(2) No person may claim or exercise State authority except as
authorised under this Constitution.
(3) The validity or legality of this Constitution is not subject to
challenge by or before any court or other State organ.
(4) Any law, including customary law, that is inconsistent with this
Constitution is void to the extent of the inconsistency, and any act or
omission in contravention of this Constitution is invalid.
(5) The general rules of international law shall form part of the
law of Kenya.
14 Constitution of Kenya, 2010
(6) Any treaty or convention ratified by Kenya 

# Precise code

In [102]:
import pdfplumber
import spacy
from spellchecker import SpellChecker

# Path to your PDF file
pdf_path = "E:\SCHOOL\Phase 5\Kenya Constitution\Kenya Constitution.pdf"

# Function to extract text from specific pages
def extract_specific_pages(pdf_path, start_page, end_page):
    with pdfplumber.open(pdf_path) as pdf:
        extracted_text = ""
        for page_num in range(start_page, end_page):
            page = pdf.pages[page_num]
            extracted_text += page.extract_text() + "\n"
        return extracted_text

# Define Chapter 1 keywords
chapter_1_keywords = {
    "Sovereignty of the people": "sovereignty",
    "Supremacy of this Constitution": "supremacy",
    "Defence of this Constitution": "defence"
}

# Define Chapter 2 keywords
chapter_2_keywords = {
    "Declaration of the republic": "declaration",
    "Territory of Kenya": "territory",
    "Devolution and access to services": "devolution",
    "National, official and other languages": "languages",
    "State and Religion": "religion",
    "National symbols and national days": "symbol",
    "The national days are": "day",
    "National values and principles": "value",
    "Culture": "culture"
}

# Define Chapter 3 keywords
chapter_3_keywords = {
    "Entitlements of citizens": "entitlement",
    "Retention and acquisition of citizenship": "retention",
    "Citizenship by birth": "birth",
    "Citizenship by registration": "registration",
    "Dual citizenship": "dual",
    "Revocation of citizenship": "revocation",
    "Legislation on citizenship": "legislation"
}

def split_chapter(chapter_text, keywords):
    # Split at key headings and strip extra whitespace
    sections = {value: [] for value in keywords.values()}

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()

    current_section = None

    for line in lines:
        stripped_line = line.strip()

        # Check for section headers using the keywords
        for header, section_key in keywords.items():
            if stripped_line.startswith(header):
                current_section = section_key
                break

        # Append line to the current section if it's set
        if current_section:
            # Prevent adding "days" content to "symbols" for Chapter 2
            if current_section == "symbol" and "The national days are" in stripped_line:
                current_section = "day"

            sections[current_section].append(stripped_line)

    # Join each section into a single string
    for key in sections:
        sections[key] = "\n".join(sections[key])

    return sections

# Extract and process Chapter 1
chapter_1 = extract_specific_pages(pdf_path, 12, 14)
chapter_1_trimmed = chapter_1.split("CHAPTER TWO")[0].strip()
chapter_1_sections = split_chapter(chapter_1_trimmed, chapter_1_keywords)

# Extract and process Chapter 2
chapter_2 = extract_specific_pages(pdf_path, 13, 16)
chapter_2_cleaned = chapter_2.split("CHAPTER TWO")[1].strip()
chapter_2_trimmed = chapter_2_cleaned.split("CHAPTER THREE")[0].strip()
chapter_2_sections = split_chapter(chapter_2_trimmed, chapter_2_keywords)

# Extract and process Chapter 3
chapter_3 = extract_specific_pages(pdf_path, 15, 19)
chapter_3_cleaned = chapter_3.split("CHAPTER THREE")[1].strip()
chapter_3_trimmed = chapter_3_cleaned.split("CHAPTER FOUR")[0].strip()
chapter_3_sections = split_chapter(chapter_3_trimmed, chapter_3_keywords)

# Print to verify the results
print("\nChapter 1 Sections:", chapter_1_sections.keys())
print("\nChapter 2 Sections:", chapter_2_sections.keys())
print("\nChapter 3 Sections:", chapter_3_sections.keys())



Chapter 1 Sections: dict_keys(['sovereignty', 'supremacy', 'defence'])

Chapter 2 Sections: dict_keys(['declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture'])

Chapter 3 Sections: dict_keys(['entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])


In [103]:
# Define synonym mapping
synonyms = {
    "supremacy" : ["supremacy", "authority", "ultimate power"],
    "sovereignty" : ["sovereignty", "power of the people", "authority of the people"],
    "defence" : ["defense", "protection", "preservation"]
}

# QA mapping
qa_mapping = {
    "supremacy": "supremacy",
    "sovereignty": "sovereignty",
    "defence": "defence"
}

sections = chapter_1_sections

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)


    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word) # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)

    return corrected_input

            

# Match with synonym support
def match_with_synonyms(query, qa_mapping, synonyms):
    processed_query = preprocess_query(query)

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)


    for key, value in qa_mapping.items():
        for synonym in synonyms.get(key, [key]):
            if synonym in corrected_query:
                return key  # Only return section value of the key
    
    print("No match found")
    return None

# Answer function with synonym and fuzzy matching
def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms)
    
    if section_key:
        # Retrieve the relevant section from the specified chapter
        return sections.get(section_key, "Section not found.")
    
    return "Sorry, I couldn't find an answer to your question."


In [104]:
user_query = "What is the soveriegnty"
answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
print(answer)


Sovereignty of the people.
1. (1) All sovereign power belongs to the people of Kenya and
shall be exercised only in accordance with this Constitution.
(2) The people may exercise their sovereign power either directly
or through their democratically elected representatives.
(3) Sovereign power under this Constitution is delegated to the
following State organs, which shall perform their functions in
accordance with this Constitution—
(a) Parliament and the legislative assemblies in the county
governments;
(b) the national executive and the executive structures in the
county governments; and
(c) the Judiciary and independent tribunals.
(4) The sovereign power of the people is exercised at—
(a) the national level; and
(b) the county level.


In [105]:
# Example 2
user_query2 = "What about defense"
answer = answer_question_nlp(user_query2, sections, qa_mapping, synonyms)
print(answer)


Defence of this Constitution.
3. (1) Every person has an obligation to respect, uphold and
defend this Constitution.
(2) Any attempt to establish a government otherwise than in
compliance with this Constitution is unlawful.


Test for the synonyms

Test for misspellings

In [106]:
user_query = "What is the authoity of the constitution?"
answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
print(answer)


Supremacy of this Constitution.
2. (1) This Constitution is the supreme law of the Republic and
binds all persons and all State organs at both levels of government.
(2) No person may claim or exercise State authority except as
authorised under this Constitution.
(3) The validity or legality of this Constitution is not subject to
challenge by or before any court or other State organ.
(4) Any law, including customary law, that is inconsistent with this
Constitution is void to the extent of the inconsistency, and any act or
omission in contravention of this Constitution is invalid.
(5) The general rules of international law shall form part of the
law of Kenya.
14 Constitution of Kenya, 2010
(6) Any treaty or convention ratified by Kenya shall form part of
the law of Kenya under this Constitution.


In [107]:
user_query = "What does it say about ultimae power?"
answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
print(answer)

Supremacy of this Constitution.
2. (1) This Constitution is the supreme law of the Republic and
binds all persons and all State organs at both levels of government.
(2) No person may claim or exercise State authority except as
authorised under this Constitution.
(3) The validity or legality of this Constitution is not subject to
challenge by or before any court or other State organ.
(4) Any law, including customary law, that is inconsistent with this
Constitution is void to the extent of the inconsistency, and any act or
omission in contravention of this Constitution is invalid.
(5) The general rules of international law shall form part of the
law of Kenya.
14 Constitution of Kenya, 2010
(6) Any treaty or convention ratified by Kenya shall form part of
the law of Kenya under this Constitution.


The code above efficiently corrects the misspelt words and also considers the synonyms used. This enusres that the user finds what they need without stating what is in the constitution verbatim

##### Combining all chapter sections into a single dictionary

In [108]:
combined_sections = combined_sections = {**chapter_1_sections, **chapter_2_sections, **chapter_3_sections}
# Print the keys of the dictionary
combined_sections.keys()

dict_keys(['sovereignty', 'supremacy', 'defence', 'declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture', 'entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])

In [109]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the spacy model
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Define synonym mapping
synonyms = {
    "supremacy": ["supremacy", "authority", "ultimate power"],
    "sovereignty": ["sovereignty", "power of the people", "authority of the people"],
    "defence": ["defense", "protection", "preservation"]
}

sections = combined_sections

# QA mapping
qa_mapping = {
    "supremacy": "supremacy",
    "sovereignty": "sovereignty",
    "defence": "defence"
}

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words with synonym support
def match_with_synonyms(query, qa_mapping, synonyms, threshold=70):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return Non

# Answer function with synonym and fuzzy matching
def answer_question_nlp(query, sections, qa_mapping, synonyms, threshold=70):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, threshold)
    
    if section_key:
        # Retrieve the relevant section from the specified chapter
        return sections.get(section_key, "Section not found.")
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about culture?"
    threshold_value = 75  # Adjust this value as needed
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


2024-10-31 11:51:47,258 - DEBUG - Processed Query: culture
2024-10-31 11:51:47,545 - DEBUG - Corrected Query: culture
2024-10-31 11:51:47,545 - DEBUG - Checking key: supremacy, value: supremacy
2024-10-31 11:51:47,547 - DEBUG - Trying synonym: supremacy
2024-10-31 11:51:47,547 - DEBUG - Trying synonym: authority
2024-10-31 11:51:47,549 - DEBUG - Trying synonym: ultimate power
2024-10-31 11:51:47,549 - DEBUG - Checking key: sovereignty, value: sovereignty
2024-10-31 11:51:47,553 - DEBUG - Trying synonym: sovereignty
2024-10-31 11:51:47,556 - DEBUG - Trying synonym: power of the people
2024-10-31 11:51:47,557 - DEBUG - Trying synonym: authority of the people
2024-10-31 11:51:47,557 - DEBUG - Checking key: defence, value: defence
2024-10-31 11:51:47,560 - DEBUG - Trying synonym: defense
2024-10-31 11:51:47,560 - DEBUG - Trying synonym: protection
2024-10-31 11:51:47,563 - DEBUG - Trying synonym: preservation
2024-10-31 11:51:47,563 - DEBUG - No match found
2024-10-31 11:51:47,568 - ERROR 

Sorry, I couldn't find an answer to your question.


In [110]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Define synonym mapping
synonyms = {
    "supremacy": ["supremacy", "authority", "ultimate power"],
    "sovereignty": ["sovereignty", "power of the people", "authority of the people", "self rule", "autonomy"],
    "defence": ["defense", "protection", "preservation"]
}

sections = combined_sections

# QA mapping
qa_mapping = {
    "supremacy": ("supremacy"),
    "sovereignty": ("sovereignty"),
    "defence": ("defence"),
    "declaration": "declaration of the republic",
    "territory": "territory of kenya", 
    "devolution": "devolution and access to service",
    "languages": "national, official and other languages",
    "religion": "state and religion",
    "symbol": "national symbols and national days",
    "day": "national symbols and national days",
    "value": "national values and principles of governance",
    "governance": "national values and principles of governance",
    "culture": "culture",
    "entitlement": "entitlement of citizens",
    "retention": "retention and acquisition of citizenship",
    "birth": "citizenship by birth",
    "registration": "citizenship by registration",
    "dual": "dual citizenship",
    "revocation": "revocation of citizenship",
    "legislation" : "legislation on citizenship"
    
}

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words with synonym support
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key

    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

# Answer function with synonyms
def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    if section_key:
        # Retrieve the relevant section from the specified chapter
        return sections.get(section_key, "Section not found.")
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about cultre?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


2024-10-31 11:51:48,823 - DEBUG - Processed Query: cultre
2024-10-31 11:51:49,184 - DEBUG - Corrected Query: culture
2024-10-31 11:51:49,201 - DEBUG - Direct match found: culture


Culture.
11. (1) This Constitution recognises culture as the foundation of
the nation and as the cumulative civilization of the Kenyan people and
nation.
(2) The State shall—
(a) promote all forms of national and cultural expression through
literature, the arts, traditional celebrations, science,
communication, information, mass media, publications,
libraries and other cultural heritage;
(b) recognise the role of science and indigenous technologies in
the development of the nation; and
(c) promote the intellectual property rights of the people of Kenya.
(3) Parliament shall enact legislation to—
(a) ensure that communities receive compensation or royalties for
the use of their cultures and cultural heritage; and
(b) recognise and protect the ownership of indigenous seeds and
plant varieties, their genetic and diverse characteristics and
their use by the communities of Kenya.


In [None]:
# Define the synonyms explicitly so that we can easily reference them later
synonyms = {
    "supremacy": ["supremacy", "authority", "ultimate power"],
    "sovereignty": ["sovereignty", "power of the people", "authority of the people", "self rule", "autonomy"],
    "defence": ["defense", "protection", "preservation"],
    "declaration": ["declaration", "proclamation", "statement", "announcement", "affirmation"],
    "territory": ["territory", "land", "region", "area", "jurisdiction", "bounds"],
    "devolution": ["devolution", "decentralization", "delegation", "transfer of power", "local governance", "subsidiarity"],
    "languages": ["languages", "tongues", "dialects", "official languages", "linguistic diversity"],
    "religion": ["religion", "faith", "belief systems", "spiritual practice", "secularism", "church-state separation"],
    "symbol": ["symbol", "emblem", "insignia", "representation", "national icon"],
    "day": ["day", "holiday", "observance", "public holiday", "commemoration", "remembrance"],
    "value": ["value", "principle", "ethic", "core value", "standard", "national ideal"],
    "governance": ["governance", "government", "administration", "management", "public service", "political structure"],
    "culture": ["culture", "heritage", "tradition", "customs", "societal norms", "arts"],
    "entitlement": ["entitlement", "right", "eligibility", "entitlement rights", "benefits", "privileges"],
    "retention": ["retention", "maintenance", "keeping", "preservation", "continuation"],
    "birth": ["birth", "nativity", "origin", "ancestry", "inborn citizenship"],
    "registration": ["registration", "enlistment", "enrollment", "citizenship application", "naturalization"],
    "dual": ["dual", "multiple", "dual nationality", "two-fold citizenship"],
    "revocation": ["revocation", "cancellation", "annulment", "rescission", "forfeiture", "withdrawal"],
    "legislation": ["legislation", "laws", "legal framework", "statutes", "enactment"]
}

# Define qa mapping explicitly too
qa_mapping = {
    "supremacy": "supremacy",
    "sovereignty": "sovereignty",
    "defence": "defence",
    "declaration": "declaration",
    "territory": "territory", 
    "devolution": "devolution",
    "language": "languages",
    "religion": "religion",
    "symbol": "symbol",
    "day": "day",
    "value": "value",
    "culture": "culture",
    "entitlement": "entitlement",
    "retention": "retention",  # Ensure this matches your sections key
    "birth": "birth",
    "registration": "registration",
    "dual": "dual",
    "revocation": "revocation",
    "legislation": "legislation"
    
}


In [127]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Define synonym mapping
synonyms = synonyms

sections = combined_sections

# QA mapping
qa_mapping = qa_mapping


# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words with synonym support
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    # Special case: Handle 'citizenship' by returning all relevant sections
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(chapter_3_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking for citizenship."
                f"Here are the sections you can inquire about: {citizenship_keys}"
                f"Please specify which section you're interested in.")

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    logging.debug(f"Found section key: {section_key}")  # Debugging line
    logging.debug(f"Available sections: {sections.keys()}")  # Log available keys in sections

    if section_key:
        # Retrieve the relevant section from the specified chapter
        answer = sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")  # Log an error if section is not found
            return "Section not found."
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about revocation?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


2024-10-31 12:29:22,983 - DEBUG - Processed Query: revocation
2024-10-31 12:29:23,243 - DEBUG - Corrected Query: revocation
2024-10-31 12:29:23,244 - DEBUG - Direct match found: revocation
2024-10-31 12:29:23,245 - DEBUG - Found section key: revocation
2024-10-31 12:29:23,247 - DEBUG - Available sections: dict_keys(['sovereignty', 'supremacy', 'defence', 'declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture', 'entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])


Revocation of citizenship.
17. (1) If a person acquired citizenship by registration, the
citizenship may be revoked if the person—
(a) acquired the citizenship by fraud, false representation or
concealment of any material fact;
(b) has, during any war in which Kenya was engaged, unlawfully
traded or communicated with an enemy or been engaged in
or associated with any business that was knowingly carried
on in such a manner as to assist an enemy in that war;
(c) has, within five years after registration, been convicted of an
offence and sentenced to imprisonment for a term of three
years or longer; or
(d) has, at any time after registration, been convicted of treason,
or of an offence for which—
(i) a penalty of at least seven years imprisonment may be
imposed; or
(ii) a more severe penalty may be imposed.
(2) The citizenship of a person who was presumed to be a citizen
by birth, as contemplated in Article 14 (4), may be revoked if—
(a) the citizenship was acquired by fraud, false repres

Since the code functions do not have a key "citizenship", if the user inputs it, they would get an error. To fix this, I will explicitly state a function to mitigate that

In [116]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Define synonym mapping
synonyms = synonyms

sections = combined_sections

# QA mapping
qa_mapping = qa_mapping


# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words with synonym support
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    # Special case: Handle 'citizenship' by returning all relevant sections
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(chapter_3_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking for citizenship."
                f"Here are the sections you can inquire about: {citizenship_keys}"
                f"Please specify which section you're interested in.")

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    logging.debug(f"Found section key: {section_key}")  # Debugging line
    logging.debug(f"Available sections: {sections.keys()}")  # Log available keys in sections

    if section_key:
        # Retrieve the relevant section from the specified chapter
        answer = sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")  # Log an error if section is not found
            return "Section not found."
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about retention of citizenship?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


2024-10-31 12:14:11,525 - DEBUG - Processed Query: retention citizenship
2024-10-31 12:14:11,694 - DEBUG - Corrected Query: retention citizenship
2024-10-31 12:14:11,708 - DEBUG - 'Citizenship' detected in query, returning related sections.
2024-10-31 12:14:11,708 - DEBUG - Found section key: It looks like you're asking for citizenship.Here are the sections you can inquire about: entitlement, retention, birth, registration, dual, revocation, legislationPlease specify which section you're interested in.
2024-10-31 12:14:11,711 - DEBUG - Available sections: dict_keys(['sovereignty', 'supremacy', 'defence', 'declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture', 'entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])
2024-10-31 12:14:11,711 - ERROR - Section not found for key: It looks like you're asking for citizenship.Here are the sections you can inquire about: entitlement, retention, birth, registration

Section not found.


Above, if I use "citizenship" in my query it'll return the general statement despite having used specific sections in citizenship.
I'll need to add code that checks for specific key words in the user input to ensure that if a specific citizenship query is logged, the output is appropriate.

In [121]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Define synonym mapping
synonyms = synonyms

sections = combined_sections

# QA mapping
qa_mapping = qa_mapping


# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words to their synonyms
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

      # Specific keywords related to citizenship mapped directly to qa_mapping keys
    specific_citizenship_map = {
        "dual citizenship": "dual",
        "retention": "retention",
        "citizenship by birth": "birth",
        "citizenship by registration": "registration",
        "legislation on citizenship": "legislation",
        "acquisition": "retention",
        "revocation": "revocation"
    }

    # Check for specific keywords first
    for key, phrase in specific_citizenship_map.items():
        if phrase in corrected_query:
            logging.debug(f"Specific citizenship topic detected: {phrase}")
            return qa_mapping.get(phrase, "Sorry, I couldn't find an answer to your question.")
        
     # Only trigger the general response if no specific keywords were found
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(chapter_3_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking about citizenship. Here are the sections you can inquire about: {citizenship_keys}. "
                f"Please specify which section you're interested in.")
        
    

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    logging.debug(f"Found section key: {section_key}")  # Debugging line
    logging.debug(f"Available sections: {sections.keys()}")  # Log available keys in sections

    if section_key:
        # Retrieve the relevant section from the specified chapter
        answer = sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")  # Log an error if section is not found
            return "Section not found."
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about revocation citizenship?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


2024-10-31 12:25:31,601 - DEBUG - Processed Query: revocation citizenship
2024-10-31 12:25:32,086 - DEBUG - Corrected Query: revocation citizenship
2024-10-31 12:25:32,088 - DEBUG - Specific citizenship topic detected: revocation
2024-10-31 12:25:32,089 - DEBUG - Found section key: revocation
2024-10-31 12:25:32,090 - DEBUG - Available sections: dict_keys(['sovereignty', 'supremacy', 'defence', 'declaration', 'territory', 'devolution', 'languages', 'religion', 'symbol', 'day', 'value', 'culture', 'entitlement', 'retention', 'birth', 'registration', 'dual', 'revocation', 'legislation'])


Revocation of citizenship.
17. (1) If a person acquired citizenship by registration, the
citizenship may be revoked if the person—
(a) acquired the citizenship by fraud, false representation or
concealment of any material fact;
(b) has, during any war in which Kenya was engaged, unlawfully
traded or communicated with an enemy or been engaged in
or associated with any business that was knowingly carried
on in such a manner as to assist an enemy in that war;
(c) has, within five years after registration, been convicted of an
offence and sentenced to imprisonment for a term of three
years or longer; or
(d) has, at any time after registration, been convicted of treason,
or of an offence for which—
(i) a penalty of at least seven years imprisonment may be
imposed; or
(ii) a more severe penalty may be imposed.
(2) The citizenship of a person who was presumed to be a citizen
by birth, as contemplated in Article 14 (4), may be revoked if—
(a) the citizenship was acquired by fraud, false repres

Everything seems to work as it should! 

# To make the main part of the code shorter and cleaner, I'll run all the dictionaries and lists in one cell then reference them in the main code

In [None]:
chapter_2_sections

In [87]:
# Define the synonyms explicitly so that we can easily reference them later
synonyms = {
    "supremacy": ["supremacy", "authority", "ultimate power"],
    "sovereignty": ["sovereignty", "power of the people", "authority of the people", "self rule", "autonomy"],
    "defence": ["defense", "protection", "preservation"],
    "declaration": ["declaration", "proclamation", "statement", "announcement", "affirmation"],
    "territory": ["territory", "land", "region", "area", "jurisdiction", "bounds"],
    "devolution": ["devolution", "decentralization", "delegation", "transfer of power", "local governance", "subsidiarity"],
    "languages": ["languages", "tongues", "dialects", "official languages", "linguistic diversity"],
    "religion": ["religion", "faith", "belief systems", "spiritual practice", "secularism", "church-state separation"],
    "symbol": ["symbol", "emblem", "insignia", "representation", "national icon"],
    "day": ["day", "holiday", "observance", "public holiday", "commemoration", "remembrance"],
    "value": ["value", "principle", "ethic", "core value", "standard", "national ideal", "governance", "government", "administration", "management", "public service", "political structure"],
    "culture": ["culture", "heritage", "tradition", "customs", "societal norms", "arts"],
    "entitlement": ["entitlement", "right", "eligibility", "entitlement rights", "benefits", "privileges"],
    "retention": ["retention", "maintenance", "keeping", "preservation", "continuation"],
    "birth": ["birth", "nativity", "origin", "ancestry", "inborn citizenship"],
    "registration": ["registration", "enlistment", "enrollment", "citizenship application", "naturalization"],
    "dual": ["dual", "multiple", "dual nationality", "two-fold citizenship"],
    "revocation": ["revocation", "cancellation", "annulment", "rescission", "forfeiture", "withdrawal"],
    "legislation": ["legislation", "laws", "legal framework", "statutes", "enactment"]
}

# Define qa mapping explicitly too
qa_mapping = {
    "supremacy": "supremacy",
    "sovereignty": "sovereignty",
    "defence": "defence",
    "declaration": "declaration",
    "territory": "territory", 
    "devolution": "devolution",
    "languages": "languages",
    "religion": "religion",
    "symbol": "symbol",
    "day": "day",
    "value": "value",
    "culture": "culture",
    "entitlement": "entitlement",
    "retention": "retention",  # Ensure this matches your sections key
    "birth": "birth",
    "registration": "registration",
    "dual": "dual",
    "revocation": "revocation",
    "legislation": "legislation"
    
}

specific_citizenship_map = {
        "dual citizenship": "dual",
        "retention": "retention",
        "citizenship by birth": "birth",
        "citizenship by registration": "registration",
        "legislation on citizenship": "legislation",
        "acquisition": "retention",
        "revocation": "revocation"
    }



# Main precise code

In [1]:
combined_sections

NameError: name 'combined_sections' is not defined

In [None]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)


# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words to their synonyms
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    # Check for specific keywords first
    for key, phrase in specific_citizenship_map.items():
        if phrase in corrected_query:
            logging.debug(f"Specific citizenship topic detected: {phrase}")
            return qa_mapping.get(phrase, "Sorry, I couldn't find an answer to your question.")
        
     # Only trigger the general response if no specific keywords were found
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(chapter_3_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking about citizenship. Here are the sections you can inquire about: {citizenship_keys}. "
                f"Please specify which section you're interested in.")
        
    

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    logging.debug(f"Found section key: {section_key}")  # Debugging line
    logging.debug(f"Available sections: {sections.keys()}")  # Log available keys in sections

    if section_key:
        # Retrieve the relevant section from the specified chapter
        answer = sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")  # Log an error if section is not found
            return "Section not found."
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about culture and heritage?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)


In [96]:
def split_chapter(chapter_text, section_keywords):
    """
    Split chapter text into sections based on defined keywords.

    Parameters:
    - chapter_text: str, the text of the chapter to split.
    - section_keywords: dict, mapping of starting phrases to section names.

    Returns:
    - dict: sections with their corresponding text.
    """
    # Initialize sections based on the provided keywords
    sections = {key: [] for key in section_keywords.values()}

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()
    current_section = None

    for line in lines:
        stripped_line = line.strip()

        # Determine the current section based on keywords
        for keyword, section in section_keywords.items():
            if stripped_line.startswith(keyword):
                current_section = section
                break
        
        # Append line to the current section if it's set
        if current_section:
            # Check for special transitions, e.g., "symbol" to "day"
            if current_section == "symbol" and "The national days are" in stripped_line:
                current_section = "day"  # Switch to "day" section
            
            sections[current_section].append(stripped_line)

    # Join each section into a single string and strip whitespace
    for key in sections:
        sections[key] = "\n".join(sections[key]).strip()

    return sections

# Define keywords for Chapter 2
chapter_2_keywords = {
    "Declaration of the republic": "declaration",
    "Territory of Kenya": "territory",
    "Devolution and access to services": "devolution",
    "National, official and other languages": "languages",
    "State and Religion": "religion",
    "National symbols and national days": "symbol",
    "The national days are": "day",
    "National values and principles": "value",
    "Culture": "culture"
}

# CHAPTER 2
chapter_2 = extract_specific_pages(pdf_path, 13, 16)
chapter_2_cleaned = chapter_2.split("CHAPTER TWO")[1].strip() if "CHAPTER TWO" in chapter_2 else chapter_2
chapter_2_trimmed = chapter_2_cleaned.split("CHAPTER THREE")[0].strip()

# Split Chapter 2 into sections
chapter_2_sections = split_chapter(chapter_2_trimmed, chapter_2_keywords)

# Print to verify the results
print("Chapter 2 Sections:", chapter_2_sections)
print("\nNational Symbols Section:\n", chapter_2_sections["symbol"])
print("\nNational Days Section:\n", chapter_2_sections["day"])



Chapter 2 Sections: {'declaration': '', 'territory': 'Territory of Kenya.\n5. Kenya consists of the territory and territorial waters comprising\nKenya on the effective date, and any additional territory and territorial\nwaters as defined by an Act of Parliament.', 'devolution': 'Devolution and access to services.\n6. (1) The territory of Kenya is divided into the counties specified\nin the First Schedule.\n(2) The governments at the national and county levels are distinct\nand inter-dependent and shall conduct their mutual relations on the\nbasis of consultation and cooperation.\n(3) A national State organ shall ensure reasonable access to its\nservices in all parts of the Republic, so far as it is appropriate to do so\nhaving regard to the nature of the service.', 'languages': 'National, official and other languages.\n7. (1) The national language of the Republic is Kiswahili.\n(2) The official languages of the Republic are Kiswahili and\nEnglish.\n(3) The State shall—\n(a) promote and

In [99]:
# Generic function
def split_chapter(chapter_text, section_keywords):
    """
    Split chapter text into sections based on defined keywords.

    Parameters:
    - chapter_text: str, the text of the chapter to split.
    - section_keywords: dict, mapping of starting phrases to section names.

    Returns:
    - dict: sections with their corresponding text.
    """
    # Initialize sections based on the provided keywords
    sections = {key: [] for key in section_keywords.values()}

    # Split by new lines to process line by line
    lines = chapter_text.splitlines()
    current_section = None

    for line in lines:
        stripped_line = line.strip()

        # Determine the current section based on keywords
        for keyword, section in section_keywords.items():
            if stripped_line.startswith(keyword):
                current_section = section
                break
        
        # Append line to the current section if it's set
        if current_section:
            # Check for special transitions, e.g., "symbol" to "day"
            if current_section == "symbol" and "The national days are" in stripped_line:
                current_section = "day"  # Switch to "day" section
            
            sections[current_section].append(stripped_line)

    # Join each section into a single string and strip whitespace
    for key in sections:
        sections[key] = "\n".join(sections[key]).strip()

    return sections


In [11]:
import logging
from telegram import Update
from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters, ContextTypes
from spellchecker import SpellChecker
import spacy
from config import combined_sections, qa_mapping, synonyms, specific_citizenship_map  # Ensure all are imported

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG to see debug messages
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        if word in misspelled_words:
            corrected_words.append(spell.correction(word))  # Get the most likely correction
        else:
            corrected_words.append(word)

    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words to their synonyms
def match_with_synonyms(query):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")

    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")

    # Check for specific keywords first
    for key, phrase in specific_citizenship_map.items():
        if phrase in corrected_query:
            logging.debug(f"Specific citizenship topic detected: {phrase}")
            return qa_mapping.get(phrase, "Sorry, I couldn't find an answer to your question.")

    # Trigger the general response if no specific keywords were found
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(combined_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking about citizenship. Here are the sections you can inquire about: {citizenship_keys}. "
                f"Please specify which section you're interested in.")

    # Check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")
                return value

    logging.debug("No match found")
    return None

def answer_question_nlp(query):
    section_key = match_with_synonyms(query)

    logging.debug(f"Found section key: {section_key}")
    logging.debug(f"Available sections: {combined_sections.keys()}")

    if section_key:
        answer = combined_sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")
            return "Section not found."

    logging.error("No answer found for the question.")
    return "Sorry, I couldn't find an answer to your question."

# Command and message handlers
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    await update.message.reply_text("Welcome to the Kenya Constitution Bot! Ask me anything.")

async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user_query = update.message.text
    logging.debug(f"User query: {user_query}")
    
    answer = answer_question_nlp(user_query)
    await update.message.reply_text(answer)

# Main function to run the bot
async def main():
    # Initialize the bot
    application = ApplicationBuilder().token("7356196146:AAGOnrG4ExuIOLE5c_g8VtvgD8me1mwBzVk").build()

    # Add handlers
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    # Start polling
    await application.run_polling()

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())


2024-10-30 08:25:05,411 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-10-30 08:25:05,411 - DEBUG - load_verify_locations cafile='E:\\SCHOOL\\anaconda3.1\\envs\\Mubea-env\\Library\\ssl\\cacert.pem'
2024-10-30 08:25:06,346 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-10-30 08:25:06,346 - DEBUG - load_verify_locations cafile='E:\\SCHOOL\\anaconda3.1\\envs\\Mubea-env\\Library\\ssl\\cacert.pem'
2024-10-30 08:25:06,789 - DEBUG - Calling Bot API endpoint `getMe` with parameters `{}`
2024-10-30 08:25:06,789 - DEBUG - connect_tcp.started host='api.telegram.org' port=443 local_address=None timeout=5.0 socket_options=None
2024-10-30 08:25:07,014 - DEBUG - connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x0000021FE247BE50>
2024-10-30 08:25:07,014 - DEBUG - start_tls.started ssl_context=<ssl.SSLContext object at 0x0000021FE21434C0> server_hostname='api.telegram.org' timeout=5.0
2024-10-30 08:25:

RuntimeError: Cannot close a running event loop

In [12]:
import logging
from spellchecker import SpellChecker
import spacy

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)


# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    # Find misspelled words
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        # Correct the word if it's misspelled
        if word in misspelled_words:
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        else:
            corrected_words.append(word)

    # Reconstruct the initial sentence
    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words to their synonyms
def match_with_synonyms(query, qa_mapping, synonyms, sections):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")  # Debugging line

    # Correct spelling in the processed query
    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")  # Debugging line

    # Check for specific keywords first
    for key, phrase in specific_citizenship_map.items():
        if phrase in corrected_query:
            logging.debug(f"Specific citizenship topic detected: {phrase}")
            return qa_mapping.get(phrase, "Sorry, I couldn't find an answer to your question.")
        
     # Only trigger the general response if no specific keywords were found
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(chapter_3_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking about citizenship. Here are the sections you can inquire about: {citizenship_keys}. "
                f"Please specify which section you're interested in.")
        
    

    # First check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]  # Return the key for the section

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")  # Debugging line
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")  # Debugging line
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")  # Debugging line
                return value  # Only return section value of the key
    
    logging.debug("No match found")  # Debugging line if no match is found
    return None

def answer_question_nlp(query, sections, qa_mapping, synonyms):
    section_key = match_with_synonyms(query, qa_mapping, synonyms, sections)
    
    logging.debug(f"Found section key: {section_key}")  # Debugging line
    logging.debug(f"Available sections: {sections.keys()}")  # Log available keys in sections

    if section_key:
        # Retrieve the relevant section from the specified chapter
        answer = sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")  # Log an error if section is not found
            return "Section not found."
    
    logging.error("No answer found for the question.")  # Log an error if no answer is found
    return "Sorry, I couldn't find an answer to your question."

# Sample usage
if __name__ == "__main__":
    user_query = "What does it say about culture and heritage?"
    answer = answer_question_nlp(user_query, sections, qa_mapping, synonyms)
    print(answer)

NameError: name 'sections' is not defined

import logging
from telegram import Update
from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters, ContextTypes
from spellchecker import SpellChecker
import spacy
from config import combined_sections, qa_mapping, synonyms, specific_citizenship_map  # Ensure all are imported

# Load the language model (make sure to have spaCy installed and the model downloaded)
nlp = spacy.load("en_core_web_sm")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG to see debug messages
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
)

# Preprocess the query
def preprocess_query(query):
    doc = nlp(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Function to correct the spelling
def correct_spelling(processed_query):
    spell = SpellChecker()
    words = processed_query.split()
    misspelled_words = spell.unknown(words)

    corrected_words = []
    for word in words:
        if word in misspelled_words:
            corrected_words.append(spell.correction(word))  # Get the most likely correction
        else:
            corrected_words.append(word)

    corrected_input = " ".join(corrected_words)
    return corrected_input

# Match the words to their synonyms
def match_with_synonyms(query):
    processed_query = preprocess_query(query)
    logging.debug(f"Processed Query: {processed_query}")

    corrected_query = correct_spelling(processed_query)
    logging.debug(f"Corrected Query: {corrected_query}")

    # Check for specific keywords first
    for key, phrase in specific_citizenship_map.items():
        if phrase in corrected_query:
            logging.debug(f"Specific citizenship topic detected: {phrase}")
            return qa_mapping.get(phrase, "Sorry, I couldn't find an answer to your question.")

    # Trigger the general response if no specific keywords were found
    if "citizenship" in corrected_query:
        logging.debug(f"'Citizenship' detected in query, returning related sections.")
        citizenship_keys = list(combined_sections.keys())
        citizenship_keys = ", ".join(citizenship_keys)
        return (f"It looks like you're asking about citizenship. Here are the sections you can inquire about: {citizenship_keys}. "
                f"Please specify which section you're interested in.")

    # Check direct matching with section keys
    for key in qa_mapping.keys():
        if key in corrected_query:
            logging.debug(f"Direct match found: {key}")
            return qa_mapping[key]

    for key, value in qa_mapping.items():
        logging.debug(f"Checking key: {key}, value: {value}")
        for synonym in synonyms.get(key, [key]):
            logging.debug(f"Trying synonym: {synonym}")
            if synonym in corrected_query:
                logging.debug(f"Match found with synonym: {synonym}")
                return value

    logging.debug("No match found")
    return None

def answer_question_nlp(query):
    section_key = match_with_synonyms(query)

    logging.debug(f"Found section key: {section_key}")
    logging.debug(f"Available sections: {combined_sections.keys()}")

    if section_key:
        answer = combined_sections.get(section_key)
        if answer:
            return answer
        else:
            logging.error(f"Section not found for key: {section_key}")
            return "Section not found."

    logging.error("No answer found for the question.")
    return "Sorry, I couldn't find an answer to your question."

# Command and message handlers
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    await update.message.reply_text("Welcome to the Kenya Constitution Bot! Ask me anything.")

async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    user_query = update.message.text
    logging.debug(f"User query: {user_query}")
    
    answer = answer_question_nlp(user_query)
    await update.message.reply_text(answer)

# Main function to run the bot
async def main():
    # Initialize the bot
    application = ApplicationBuilder().token("7356196146:AAGOnrG4ExuIOLE5c_g8VtvgD8me1mwBzVk").build()

    # Add handlers
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    # Start polling
    await application.run_polling()

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
