In [64]:
# Libraries
import spacy
from spacy.lang.am.examples import sentences

In [65]:
"""
# Initial download of model - this only needs to be run ONCE
!python -m spacy download en_core_web_sm
""""

SyntaxError: unterminated string literal (detected at line 4) (1766002328.py, line 4)

In [46]:
# Initialise the spaCy engine
# en_core_web_sm is a pre-trained model that knows English grammar and vocabulary
nlp = spacy.load("en_core_web_sm")

In [47]:
# Define the user query
query = "I want to know what binary is. Does it work like normal numbers? Is it used in programming?"

In [48]:
"""
Cleans and preprocesses the search query by:
- Lowercasing
- Removing stopwords
- Removing punctuation
- Lemmatising words

This:
- Ensures efficient processing
- Removes unimportant words
- Extracts relevant words
"""
def preprocess_query(query):
    # Convert the query to lowercase
    query = query.lower()
    # Process the query with spaCy
    query_processed = nlp(query)
    # Tokenise the query and store in list
    query_tokens = [query_token.lemma_ for query_token in query_processed if not query_token.is_stop and not query_token.is_punct]
    
    # Return a list of important words
    return query_tokens

preprocessed_query = preprocess_query(query)
print(preprocessed_query)

['want', 'know', 'binary', 'work', 'like', 'normal', 'number', 'programming']


In [49]:
"""
Named Entity Recognition (NER) to extract key topics.
Extracts named entities (like AI, ML, Python) from the given text.
"""
def extract_entities(query):
    # Identify key topics from a query using spaCy's Named Entity Recognition (NER)
    query = nlp(query)
    query_entities = {ent.text: ent.label_ for ent in query.ents}
    
    return query_entities

print(extract_entities(query))

{}


In [58]:
import sqlite3

# Connect to the database
conn = sqlite3.connect("database.db")
cursor = conn.cursor()

# Fetch all keywords from the relevant table
cursor.execute("SELECT category_title FROM syllabus_categories")  # Modify if needed
rows = cursor.fetchall()

# Convert to a list of strings
gcse_keywords = [row[0] for row in rows]

# Close the connection
conn.close()

print("GCSE Keywords:", gcse_keywords)

GCSE Keywords: ['Computer systems', 'Data representation', 'Boolean logic', 'Algorithmic thinking', 'Programming', 'Databases', 'Productivity software', 'Solution development', 'Testing', 'Computer networks', 'Security', 'Impacts', 'Digital authoring', 'Electronics']


In [51]:
"""
# Define a dictionary of keywords that are likely to be in the user's query
gcse_keywords = {
    "binary": "Data Representation",
    "IP address": "Networking",
    "algorithm": "Programming",
    "network": "Networking",
    "encryption": "Cybersecurity",
    "CPU": "Hardware",
    "RAM": "Hardware"
}
"""

'\n# Define a dictionary of keywords that are likely to be in the user\'s query\ngcse_keywords = {\n    "binary": "Data Representation",\n    "IP address": "Networking",\n    "algorithm": "Programming",\n    "network": "Networking",\n    "encryption": "Cybersecurity",\n    "CPU": "Hardware",\n    "RAM": "Hardware"\n}\n'

In [60]:
"""

This helps categorise the query and ensures accurate search results.
"""
def match_gcse_keywords(query):
    """
    Matches the query against predefined GCSE Computer Science keywords.
    """
    query_tokens = preprocess_query(query)
    matched_keywords = {word: gcse_keywords[word] for word in query_tokens if word in gcse_keywords}
    return matched_keywords

query_keywords = match_gcse_keywords(query)
print(query_keywords)

{}


In [61]:
"""
Understand query intent using dependency parsing

"""
def analyse_query_structure(query):
    """
    Analyses the structure of a query to understand intent.
    """
    doc = nlp(query)
    for token in doc:
        print(f"{token.text} → {token.dep_} (Head: {token.head.text})")

query_structure = analyse_query_structure(query)
print(query_structure)

I → nsubj (Head: want)
want → ROOT (Head: want)
to → aux (Head: know)
know → xcomp (Head: want)
what → det (Head: binary)
binary → attr (Head: is)
is → ccomp (Head: know)
. → punct (Head: want)
Does → aux (Head: work)
it → nsubj (Head: work)
work → ROOT (Head: work)
like → prep (Head: work)
normal → amod (Head: numbers)
numbers → pobj (Head: like)
? → punct (Head: work)
Is → auxpass (Head: used)
it → nsubjpass (Head: used)
used → ROOT (Head: used)
in → prep (Head: used)
programming → pobj (Head: in)
? → punct (Head: used)
None


In [62]:
# Expand the search beyond exact keywords
def get_similar_words(word):
    """
    Finds words similar to a given word using word vectors.
    """
    word_vector = nlp(word)
    similar_words = [token.text for token in nlp.vocab if token.has_vector and word_vector.similarity(nlp(token.text)) > 0.7]
    return similar_words[:5]  # Return top 5 similar words

# Example
print(get_similar_words("binary"))

[]


In [63]:
"""
Combine everything into a query processing function
"""
def process_user_query(query):
    """
    Processes a user's query to extract key concepts and categorise it.
    """
    keywords = preprocess_query(query)  # Step 1: Preprocessing
    entities = extract_entities(query)  # Step 2: Named Entity Recognition
    matched_gcse_terms = match_gcse_keywords(query)  # Step 3: GCSE Keyword Matching

    return {
        "cleaned_query": keywords,
        "extracted_entities": entities,
        "matched_gcse_topics": matched_gcse_terms
    }

print(process_user_query(query))

{'cleaned_query': ['want', 'know', 'binary', 'work', 'like', 'normal', 'number', 'programming'], 'extracted_entities': {}, 'matched_gcse_topics': {}}
