In [276]:
import spacy
from keybert import KeyBERT
from tqdm import tqdm
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from transformers import pipeline

In [92]:
# Load models
kw_model = KeyBERT()
nlp = spacy.load("en_core_web_sm")

In [50]:
# Input text (e.g., a natural language question)
question = "What is the fastest lap time ever in a race for Lewis Hamilton?"

# Extract keywords (phrases up to 3 words long)
keywords = kw_model.extract_keywords(
    question,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',  # Removes common stopwords
    top_n=10                 # Number of keywords to return
)

# Print results
for keyword, score in keywords:
    print(f"{keyword} - {score:.4f}")


fastest lap time - 0.7019
race lewis hamilton - 0.7004
fastest lap - 0.6697
time race lewis - 0.6250
race lewis - 0.6199
lap time race - 0.6061
lewis hamilton - 0.5742
hamilton - 0.5130
fastest - 0.5087
lap time - 0.4686


In [None]:
def dependency_keywords(text: str): 
    doc = nlp(text)
    # Remove stop words
    doc = [token for token in doc if not token.is_stop and not token.is_punct]
    keywords = set()

    for token in doc:
        # Look for a noun with modifiers
        if token.pos_ in {"NOUN", "PROPN"}:
            modifiers = [
                child for child in token.lefts
                if child.dep_ in {"amod", "compound"} 
            ]
            if modifiers:
                phrase = " ".join([t.text for t in [*modifiers, token]])
                keywords.add(phrase)
        keywords.add(token.text)
    # print(f"\tCandidates: {keywords}")
    return list(keywords)

In [74]:
text = "What is the fastest lap time ever in a race for Lewis Hamilton?"
candidates = dependency_keywords(text)
print(candidates)

	Candidates: {'fastest', 'race', 'Hamilton', 'time', 'Lewis', 'fastest lap time', 'Lewis Hamilton', 'lap'}
['fastest', 'race', 'Hamilton', 'time', 'Lewis', 'fastest lap time', 'Lewis Hamilton', 'lap']


In [None]:
def keybert_keywords(question, candidates, top_n=5):
    candidates = [phrase.lower() for phrase in candidates]
    keywords = kw_model.extract_keywords(
        question,
        stop_words="english",
        candidates=candidates,
        keyphrase_ngram_range=(1, 3),
        top_n=top_n                 # Number of keywords to return
    )
    return keywords

In [94]:
print(keybert_keywords(question, candidates))

[('fastest lap time', 0.7019), ('lewis hamilton', 0.5742), ('hamilton', 0.513), ('fastest', 0.5087), ('lewis', 0.4565)]


In [184]:
def extract_keywords(question, top_n=5):
    candidates = dependency_keywords(question)
    keywords = keybert_keywords(question,candidates, top_n=top_n)
    return keywords

In [162]:
question = "What is the fastest lap time ever in a race for Lewis Hamilton?"
print(extract_keywords(question))

[('fastest lap time', 0.7019), ('lewis hamilton', 0.5742), ('hamilton', 0.513), ('fastest', 0.5087), ('lewis', 0.4565)]


In [None]:
file_path = "D:/University/4th year/2nd Semester/GP/playground/squall playground/test.txt"
with open(file_path, "r", encoding="utf-8") as file:
    total_lines = sum(1 for _ in file)
with open(file_path, "r", encoding="utf-8") as file:
    for index, line in tqdm(enumerate(file), desc="Processing questions...", total=total_lines):
        if index % 3 == 0:
            # print(f"Line {index}")
            # print(f"\t{line}")
            keywords = extract_keywords(line.strip())
            # print(f"\t{keywords}")

In [101]:
# Load spaCy and SBERT model
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def clean_tokens(text):
    """Remove stop words and punctuation."""
    keep_words = {"name", "amount", "show", "mine", "side", "keep", "full", "see", "various", "former", "bottom", "call", "next"}  
    custom_stopwords = SPACY_STOPWORDS.difference(keep_words)
    doc = nlp(text)
    return [token.text for token in doc if token.text.lower() not in custom_stopwords and not token.is_punct]

def fuzzy_match_phrases(question, schema, threshold=70):
    """Fuzzy match question's token phrases to names."""
    tokens = clean_tokens(question)
    print(f"After removing stop words: {tokens}")
    matches = []
    for token in tokens:
        for table in schema:
            for (column, _) in schema[table]:
                score = fuzz.partial_ratio(token.lower(), column.lower())
                if score >= threshold:
                    matches.append((token, column, score))
    return matches

In [294]:
question = "What is the average number of Mubi users who love movies directed by Stanley Kubrick?"
schema = {
    "lists": [
              ("user_id", "ID related to the user who created the list."), 
              ("list_id", "ID of the list on Mubi"),
              ("list_title", "Name of the list"),
              ("list_movie_number", "Number of movies added to the list"),
              ("list_update_timestamp_utc", "Last update timestamp for the list"),
              ("list_creation_timestamp_utc", "Creation timestamp for the list"),
              ("list_followers", "Number of followers on the list"),
              ("list_url", "URL to the list page on Mubi"),
              ("list_comments", "Number of comments on the list"),
              ("list_description", "List description made by the user"),
              ("list_cover_image_url", ""),
              ("list_first_image_url", ""),
              ("list_second_image_url", ""),
              ("list_third_image_url", "")
            ],

    "lists_users": [
              ("user_id", "ID related to the user who created the list."),
              ("list_id", "ID of the list on Mubi"),
              ("list_update_date_utc", "Last update date for the list"),
              ("list_create_date_utc", "Creation date for the list"),
              ("user_trialist", "whether the user was a tralist when he created the list "),
              ("user_subscriber","whether the user was a subscriber when he created the list "),
              ("user_avatar_image_url", "User profile image URL on Mubi"),
              ("user_cover_image_url", "User profile cover image URL on Mubi"),
              ("user_eligible_for_trial", "whether the user was eligible for trial when he created the list "),
              ("user_has_payment_method", "whether the user was a paying subscriber when he created the list ")
            ],

    "movies": [
              ("movie_id", "ID related to the movie on Mubi"),
              ("movie_title", "Name of the movie"),
              ("movie_release_year", "Release year of the movie"),
              ("movie_url", "URL to the movie page on Mubi"),
              ("movie_title_language", "By default, the title is in English."),
              ("movie_popularity","Number of Mubi users who love this movie"),
              ("movie_image_url", "Image URL to the movie on Mubi"),
              ("director_id", "ID related to the movie director on Mubi"),
              ("director_name", "Full Name of the movie director"),
              ("director_url","URL to the movie director page on Mubi")
            ],

    "ratings": [
              ("movie_id", "Movie ID related to the rating"),
              ("rating_id", "Rating ID on Mubi"),
              ("rating_url", "URL to the rating on Mubi"),
              ("rating_score", "Rating score ranging from 1 (lowest) to 5 (highest)"),
              ("rating_timestamp_utc", "Timestamp for the movie rating made by the user on Mubi"),
              ("critic", "Critic made by the user rating the movie. "),
              ("critic_likes", "Number of likes related to the critic made by the user rating the movie"),
              ("critic_comments", "Number of comments related to the critic made by the user rating the movie"),
              ("user_id", "ID related to the user rating the movie"),
              ("user_trialist", "whether user was a tralist when he rated the movie"),
              ("user_subscriber", ""),
              ("user_eligible_for_trial", ""),
              ("user_has_payment_method","")
            ],

    "ratings_users": [
              ("user_id", "ID related to the user rating the movie"),
              ("rating_date_utc", "Rating date for the movie rating."),
              ("user_trialist", "whether the user was a trialist when he rated the movie"),
              ("user_subscriber", "whether the user was a subscriber when he rated the movie"),
              ("user_avatar_image_url", "URL to the user profile image on Mubi"),
              ("user_cover_image_url","URL to the user profile cover image on Mubi"),
              ("user_eligible_for_trial", "whether the user was eligible for trial when he rated the movie"),
              ("user_has_payment_method", "whether the user was a paying subscriber when he rated the movie")
            ]
}

print(fuzzy_match_phrases(question, schema, threshold=80))

After removing stop words: ['average', 'number', 'Mubi', 'users', 'love', 'movies', 'directed', 'Stanley', 'Kubrick']
[('number', 'list_movie_number', 100), ('users', 'user_id', 80), ('users', 'user_id', 80), ('users', 'user_trialist', 80), ('users', 'user_subscriber', 80), ('users', 'user_avatar_image_url', 80), ('users', 'user_cover_image_url', 80), ('users', 'user_eligible_for_trial', 80), ('users', 'user_has_payment_method', 80), ('users', 'user_id', 80), ('users', 'user_trialist', 80), ('users', 'user_subscriber', 80), ('users', 'user_eligible_for_trial', 80), ('users', 'user_has_payment_method', 80), ('users', 'user_id', 80), ('users', 'user_trialist', 80), ('users', 'user_subscriber', 80), ('users', 'user_avatar_image_url', 80), ('users', 'user_cover_image_url', 80), ('users', 'user_eligible_for_trial', 80), ('users', 'user_has_payment_method', 80), ('movies', 'list_movie_number', 83), ('movies', 'movie_id', 83), ('movies', 'movie_title', 83), ('movies', 'movie_release_year', 83

In [291]:
def semantic_similarity(question, schema, threshold=0.4):
    """Compute semantic similarity between keywords and column descriptions."""
    question_vec = model.encode(question, convert_to_tensor=True)
    similarities = []
    for table in schema:
        for (col, desc) in schema[table]:
            score = 0
            if desc != "":
                desc_vec = model.encode(desc, convert_to_tensor=True)
                score = util.cos_sim(question_vec, desc_vec).item()
            else:
                col_vec = model.encode(col, convert_to_tensor=True)
                score = util.cos_sim(question_vec, col_vec).item()
            if score >= threshold:
                similarities.append((table, col, score, desc))

    return similarities

In [295]:
total = 0
for table in schema:
    for col in schema[table]:
        total += 1
print(total)
keywords = extract_keywords(question, top_n=10)
keywords = [keyword[0] for keyword in keywords]
print(question)
print(keywords)
result = semantic_similarity(question, schema, threshold=0.4)
print(len(result))
result

55
What is the average number of Mubi users who love movies directed by Stanley Kubrick?
['stanley kubrick', 'kubrick', 'mubi users', 'movies', 'mubi', 'average', 'average number', 'directed', 'number', 'stanley']
21


[('lists',
  'list_movie_number',
  0.5628187656402588,
  'Number of movies added to the list'),
 ('movies', 'movie_id', 0.500049889087677, 'ID related to the movie on Mubi'),
 ('movies', 'movie_url', 0.407701313495636, 'URL to the movie page on Mubi'),
 ('movies',
  'movie_popularity',
  0.7626645565032959,
  'Number of Mubi users who love this movie'),
 ('movies',
  'director_id',
  0.5063318610191345,
  'ID related to the movie director on Mubi'),
 ('movies',
  'director_name',
  0.40639087557792664,
  'Full Name of the movie director'),
 ('movies',
  'director_url',
  0.4220734238624573,
  'URL to the movie director page on Mubi'),
 ('ratings', 'movie_id', 0.4636894464492798, 'Movie ID related to the rating'),
 ('ratings', 'rating_id', 0.44096750020980835, 'Rating ID on Mubi'),
 ('ratings', 'rating_url', 0.42876583337783813, 'URL to the rating on Mubi'),
 ('ratings',
  'rating_timestamp_utc',
  0.5265337824821472,
  'Timestamp for the movie rating made by the user on Mubi'),
 ('rat

In [253]:
kw_vec = model.encode("who is the driver", convert_to_tensor=True)
col_vec = model.encode("racer",  convert_to_tensor=True)
score = util.cos_sim(kw_vec, col_vec).item()
print(score)

0.4826233983039856


In [289]:
doc = nlp("For movie titled 'Welcome to the Dollhouse', how many percentage of the ratings were rated with highest score")

for ent in doc.ents:
    print(ent.text, ent.label_)

'Welcome WORK_OF_ART
Dollhouse ORG


In [296]:
print('critic' in SPACY_STOPWORDS)

False
