In [9]:
import spacy 
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy as np


# Load spaCy's small English pipeline. It is used further down (see
# preprocess_text) for tokenization, the stop-word / punctuation flags and
# lemmatization.
nlp= spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hivagheisari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Synonyms:

In [10]:
def get_synonyms(word):
    """Collect the lemma names of every WordNet synset matching ``word``.

    All senses of the word are merged into one set, so duplicates across
    senses collapse. As the 'Car' example in the next cell shows, the set
    contains the looked-up word itself and multi-word lemmas are joined
    with underscores (e.g. 'cable_car').

    Returns:
        set[str]: lemma names; empty if ``wn.synsets`` yields no synsets.
    """
    return {
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

In [11]:
# Sanity check on a capitalised word: the result below is all lowercase and
# includes 'car' itself alongside its synonyms.
get_synonyms('Car')

{'auto',
 'automobile',
 'cable_car',
 'car',
 'elevator_car',
 'gondola',
 'machine',
 'motorcar',
 'railcar',
 'railroad_car',
 'railway_car'}

In [12]:
# Step 1 of what get_synonyms does internally: list the synsets (senses)
# that WordNet returns for 'Car'.
print(f" wn.synsets('Car') is {wn.synsets('Car')}")

 wn.synsets('Car') is [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]


In [13]:
# Step 2: each synset carries its own list of Lemma objects; print one list
# per sense of 'Car'.
for car_sense in wn.synsets('Car'):
    print(car_sense.lemmas())

[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
[Lemma('car.n.02.car'), Lemma('car.n.02.railcar'), Lemma('car.n.02.railway_car'), Lemma('car.n.02.railroad_car')]
[Lemma('car.n.03.car'), Lemma('car.n.03.gondola')]
[Lemma('car.n.04.car'), Lemma('car.n.04.elevator_car')]
[Lemma('cable_car.n.01.cable_car'), Lemma('cable_car.n.01.car')]


In [14]:
# Step 3: flatten to the bare lemma names, one per line. 'car' repeats once
# per sense here; get_synonyms removes those repeats by collecting into a set.
for car_sense in wn.synsets('Car'):
    for car_lemma in car_sense.lemmas():
        print(car_lemma.name())

car
auto
automobile
machine
motorcar
car
railcar
railway_car
railroad_car
car
gondola
car
elevator_car
cable_car
car


In [15]:
def preprocess_text(text):
    """Lowercase ``text``, parse it with spaCy and return its content lemmas.

    Tokens flagged as stop words or punctuation are skipped. Every other
    token contributes its lemma, in document order, with repeats kept.

    Returns:
        list[str]: lemmas of the remaining tokens.
    """
    return [
        token.lemma_
        for token in nlp(text.lower())
        if not (token.is_stop or token.is_punct)
    ]

In [16]:
# Sample paragraph used by the following cells to try out preprocessing and
# synonym expansion.
text= '"Text" can refer to the written words on a page, a written message, or even a broader concept of any object that can be "read" and interpreted. It can also refer to the act of sending a written message on a mobile phone. '

In [17]:
# Lowercased copy of the sample, displayed for inspection. preprocess_text
# lowercases its input again, so passing text_l instead of text makes no
# difference to its result.
text_l=text.lower()
text_l

'"text" can refer to the written words on a page, a written message, or even a broader concept of any object that can be "read" and interpreted. it can also refer to the act of sending a written message on a mobile phone. '

In [18]:
# Parse the lowercased sample into a spaCy Doc so individual tokens can be
# inspected in the next cell.
doc= nlp(text_l)

In [28]:
# Scratch cell left over from exploring token lemmas. With the print
# commented out the loop body is just `pass`, so this cell has no effect.
for token in doc:
    #print((token.lemma_))
    pass

In [24]:
# Run the full preprocessing step on the sample: stop words and punctuation
# are gone and inflected forms are reduced to lemmas (e.g. 'written' -> 'write').
preprocess_text(text_l)

['text',
 'refer',
 'write',
 'word',
 'page',
 'write',
 'message',
 'broad',
 'concept',
 'object',
 'read',
 'interpret',
 'refer',
 'act',
 'send',
 'write',
 'message',
 'mobile',
 'phone']

In [35]:
# Confirm the result is a plain list; expand_with_syn below calls
# .copy() and .extend() on it.
type(preprocess_text(text_l))

list

In [38]:
def expand_with_syn(words):
    """Return a copy of ``words`` followed by the synonyms of each word.

    The input list is not modified. For every entry of ``words`` (repeats
    included) the full ``get_synonyms`` set is appended, so a word that
    occurs several times contributes its synonyms several times.

    Returns:
        list[str]: the original words, then all appended synonyms.
    """
    expanded = words.copy()
    expanded.extend(
        synonym
        for term in words
        for synonym in get_synonyms(term)
    )
    return expanded

In [55]:
# Show the expanded token list for the sample: the original lemmas come
# first, followed by the synonym sets of each lemma in turn.
print(expand_with_syn(preprocess_text(text_l)))

['text', 'refer', 'write', 'word', 'page', 'write', 'message', 'broad', 'concept', 'object', 'read', 'interpret', 'refer', 'act', 'send', 'write', 'message', 'mobile', 'phone', 'schoolbook', 'text_edition', 'textbook', 'text', 'school_text', 'textual_matter', 'pertain', 'touch', 'touch_on', 'concern', 'bring_up', 'name', 'consult', 'look_up', 'refer', 'advert', 'cite', 'denote', 'relate', 'bear_on', 'have-to_doe_with', 'mention', 'come_to', 'spell', 'drop_a_line', 'save', 'write', 'indite', 'publish', 'compose', 'pen', 'Bible', 'give-and-take', 'Holy_Writ', 'Word', 'give_voice', 'watchword', 'articulate', 'word', 'intelligence', 'Son', 'phrase', 'countersign', 'Good_Book', 'news', 'tidings', 'Logos', 'discussion', 'word_of_honor', 'password', 'parole', 'formulate', 'Christian_Bible', 'Book', 'Word_of_God', 'Scripture', 'Holy_Scripture', 'pageboy', 'Sir_Frederick_Handley_Page', 'paginate', 'Thomas_Nelson_Page', 'Page', 'page', 'foliate', 'varlet', 'spell', 'drop_a_line', 'save', 'write'

In [56]:
# Term frequencies of the expanded list. Counts are higher than in the raw
# text because each occurrence of a word also appends that word's synonym
# set, and the set contains the word itself.
print(Counter(expand_with_syn(preprocess_text(text_l))))

Counter({'write': 6, 'refer': 4, 'message': 4, 'read': 3, 'interpret': 3, 'spell': 3, 'drop_a_line': 3, 'save': 3, 'indite': 3, 'publish': 3, 'compose': 3, 'pen': 3, 'text': 2, 'word': 2, 'page': 2, 'broad': 2, 'concept': 2, 'object': 2, 'act': 2, 'send': 2, 'mobile': 2, 'phone': 2, 'pertain': 2, 'touch': 2, 'touch_on': 2, 'concern': 2, 'bring_up': 2, 'name': 2, 'consult': 2, 'look_up': 2, 'advert': 2, 'cite': 2, 'denote': 2, 'relate': 2, 'bear_on': 2, 'have-to_doe_with': 2, 'mention': 2, 'come_to': 2, 'subject_matter': 2, 'content': 2, 'substance': 2, 'translate': 2, 'understand': 2, 'represent': 2, 'schoolbook': 1, 'text_edition': 1, 'textbook': 1, 'school_text': 1, 'textual_matter': 1, 'Bible': 1, 'give-and-take': 1, 'Holy_Writ': 1, 'Word': 1, 'give_voice': 1, 'watchword': 1, 'articulate': 1, 'intelligence': 1, 'Son': 1, 'phrase': 1, 'countersign': 1, 'Good_Book': 1, 'news': 1, 'tidings': 1, 'Logos': 1, 'discussion': 1, 'word_of_honor': 1, 'password': 1, 'parole': 1, 'formulate': 1,

In [50]:
# Unique expanded terms: building a set from a Counter keeps only its keys.
# `a` is iterated by the scratch loop further down.
a = set(Counter(expand_with_syn(preprocess_text(text_l))))

In [45]:
# Size of the expanded vocabulary (number of distinct terms) for the sample.
len(set(Counter(expand_with_syn(preprocess_text(text_l))).keys()))

168

In [53]:
# Scratch cell: the per-term count lookup is commented out, so the loop body
# is just `pass` and nothing is computed or shown. Note that it leaves the
# name `w` bound to the last element of `a`.
for w in a:
    #print(Counter(expand_with_syn(preprocess_text(text_l)))[w])
    pass

In [54]:
def calculate_enhanced_similarity(text1, text2):
    """Return the cosine similarity of two texts after synonym expansion.

    Each text is lemmatized with ``preprocess_text``, widened with WordNet
    synonyms via ``expand_with_syn``, and converted to a term-frequency
    vector over the union of both expanded vocabularies.

    Args:
        text1: First text (str).
        text2: Second text (str).

    Returns:
        The cosine of the angle between the two frequency vectors. Counts
        are non-negative, so the value is between 0 and 1. ``0.0`` is
        returned when either text yields no terms (e.g. an empty string or
        only stop words / punctuation), because the cosine is undefined for
        a zero vector.
    """
    # Preprocess and tokenize texts
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)

    # Expand with synonyms and count word frequencies. The helper defined in
    # this notebook is `expand_with_syn`; calling it by any other name raises
    # NameError.
    freq1 = Counter(expand_with_syn(words1))
    freq2 = Counter(expand_with_syn(words2))

    # Union of both vocabularies. Both vectors are built by iterating this
    # same set object, so their components line up.
    unique_words = set(freq1) | set(freq2)

    # Frequency vectors (a Counter returns 0 for words it has not seen)
    vector1 = np.array([freq1[word] for word in unique_words])
    vector2 = np.array([freq2[word] for word in unique_words])

    # Guard the division: a zero norm means one side has no terms at all,
    # which would otherwise produce NaN plus a RuntimeWarning.
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0

    # Calculate cosine similarity
    return np.dot(vector1, vector2) / norm_product

In [58]:
def find_best_keymatch(query, records):
    """Pick the record sharing the most whitespace-separated words with ``query``.

    Both the query and each record are lowercased and split on whitespace;
    the score of a record is the number of distinct words it has in common
    with the query. Punctuation is not stripped, so 'points.' and 'points'
    count as different words.

    Args:
        query: Query string.
        records: Iterable of candidate strings.

    Returns:
        tuple: ``(best_record, best_score)``. On ties the earliest record
        wins; if no record shares a word with the query the result is
        ``(None, 0)``.
    """
    query_terms = set(query.lower().split())
    winner, top_overlap = None, 0
    for candidate in records:
        overlap = len(query_terms & set(candidate.lower().split()))
        # Only a strictly better score replaces the current winner.
        if overlap <= top_overlap:
            continue
        winner, top_overlap = candidate, overlap
    return winner, top_overlap

In [59]:
# Sample sentence about RAG vector stores. How it is used is not visible in
# this part of the notebook — presumably an input for the matching /
# similarity helpers above (TODO confirm). Note this rebinds `w`, which an
# earlier cell used as a loop variable.
w= "A RAG vector store is a database or dataset that contains vectorized data points."

'A RAG vector store is a database or dataset that contains vectorized data points.'

#### ADV RAG: