In [3]:
import spacy 
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy as np


# Load spaCy model
nlp= spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hivagheisari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Synonyms:

In [4]:
def get_synonyms(word):
    """Collect the names of every WordNet lemma associated with ``word``.

    Each synset returned by ``wn.synsets(word)`` is visited and the name of
    every lemma it holds is gathered into a single set, so a name that shows
    up under several senses is kept only once.

    Args:
        word: The term to look up.

    Returns:
        set: The lemma names found; empty when no synsets are returned.
    """
    return {
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

In [5]:
# Example lookup: every lemma name get_synonyms collects for 'Car'.
get_synonyms('Car')

{'auto',
 'automobile',
 'cable_car',
 'car',
 'elevator_car',
 'gondola',
 'machine',
 'motorcar',
 'railcar',
 'railroad_car',
 'railway_car'}

In [6]:
# Show the raw list of synsets WordNet returns for 'Car'.
print(f" wn.synsets('Car') is {wn.synsets('Car')}")

 wn.synsets('Car') is [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]


In [7]:
# Print the lemma list of each synset of 'Car', one synset per line.
for syn in wn.synsets('Car'):
    print(syn.lemmas())

[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
[Lemma('car.n.02.car'), Lemma('car.n.02.railcar'), Lemma('car.n.02.railway_car'), Lemma('car.n.02.railroad_car')]
[Lemma('car.n.03.car'), Lemma('car.n.03.gondola')]
[Lemma('car.n.04.car'), Lemma('car.n.04.elevator_car')]
[Lemma('cable_car.n.01.cable_car'), Lemma('cable_car.n.01.car')]


In [8]:
# Flatten the synsets of 'Car': print every lemma name on its own line
# (names repeat when a lemma belongs to several synsets).
for syn in wn.synsets('Car'):
    for lemma in syn.lemmas():
        print(lemma.name())

car
auto
automobile
machine
motorcar
car
railcar
railway_car
railroad_car
car
gondola
car
elevator_car
cable_car
car


In [9]:
def preprocess_text(text):
    """Lower-case ``text``, parse it with spaCy and return its content lemmas.

    Args:
        text: Raw input string.

    Returns:
        list: ``token.lemma_`` for each token in document order, skipping
        stop words, punctuation and whitespace-only tokens.
    """
    doc = nlp(text.lower())
    # spaCy can emit whitespace-only tokens (e.g. for newlines or repeated
    # spaces). They are neither stop words nor punctuation, so filter them
    # explicitly to keep them out of the downstream frequency counts.
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

In [10]:
# Sample passage used by the following cells to try out the text helpers.
text= '"Text" can refer to the written words on a page, a written message, or even a broader concept of any object that can be "read" and interpreted. It can also refer to the act of sending a written message on a mobile phone. '

In [11]:
# Lower-cased copy of the sample passage; the bare name displays it.
text_l=text.lower()
text_l

'"text" can refer to the written words on a page, a written message, or even a broader concept of any object that can be "read" and interpreted. it can also refer to the act of sending a written message on a mobile phone. '

In [12]:
# Run the lower-cased sample through the loaded spaCy pipeline.
doc= nlp(text_l)

In [13]:
# No-op scratch loop: the lemma printout is commented out, so iterating
# over the tokens has no visible effect.
for token in doc:
    #print((token.lemma_))
    pass

In [14]:
# Lemmas of the sample passage after stop words and punctuation are dropped.
preprocess_text(text_l)

['text',
 'refer',
 'write',
 'word',
 'page',
 'write',
 'message',
 'broad',
 'concept',
 'object',
 'read',
 'interpret',
 'refer',
 'act',
 'send',
 'write',
 'message',
 'mobile',
 'phone']

In [15]:
# Confirm the container type returned by preprocess_text.
type(preprocess_text(text_l))

list

In [16]:
def expand_with_syn(words):
    """Return ``words`` followed by the synonyms of each of its entries.

    The input is copied first, so the caller's list is left untouched. The
    synonyms of every word (as returned by ``get_synonyms``) are appended
    after the original words, in the order the words appear; duplicates are
    kept.

    Args:
        words: List of words to expand.

    Returns:
        list: The original words plus all appended synonyms.
    """
    expanded = words.copy()
    for term in words:
        expanded += get_synonyms(term)
    return expanded

In [17]:
# Sample lemmas followed by every synonym appended by expand_with_syn.
print(expand_with_syn(preprocess_text(text_l)))

['text', 'refer', 'write', 'word', 'page', 'write', 'message', 'broad', 'concept', 'object', 'read', 'interpret', 'refer', 'act', 'send', 'write', 'message', 'mobile', 'phone', 'text_edition', 'schoolbook', 'text', 'school_text', 'textbook', 'textual_matter', 'look_up', 'cite', 'advert', 'touch', 'name', 'denote', 'consult', 'bring_up', 'concern', 'bear_on', 'pertain', 'come_to', 'have-to_doe_with', 'relate', 'mention', 'refer', 'touch_on', 'publish', 'indite', 'spell', 'compose', 'write', 'save', 'pen', 'drop_a_line', 'give-and-take', 'Holy_Scripture', 'tidings', 'intelligence', 'Word', 'Holy_Writ', 'Good_Book', 'formulate', 'give_voice', 'phrase', 'Word_of_God', 'Book', 'word', 'news', 'watchword', 'word_of_honor', 'countersign', 'password', 'Bible', 'discussion', 'Christian_Bible', 'Logos', 'Scripture', 'articulate', 'Son', 'parole', 'Page', 'paginate', 'pageboy', 'Sir_Frederick_Handley_Page', 'foliate', 'varlet', 'Thomas_Nelson_Page', 'page', 'publish', 'indite', 'spell', 'compose'

In [18]:
# Frequency of each term in the synonym-expanded sample.
print(Counter(expand_with_syn(preprocess_text(text_l))))

Counter({'write': 6, 'refer': 4, 'message': 4, 'read': 3, 'interpret': 3, 'publish': 3, 'indite': 3, 'spell': 3, 'compose': 3, 'save': 3, 'pen': 3, 'drop_a_line': 3, 'text': 2, 'word': 2, 'page': 2, 'broad': 2, 'concept': 2, 'object': 2, 'act': 2, 'send': 2, 'mobile': 2, 'phone': 2, 'look_up': 2, 'cite': 2, 'advert': 2, 'touch': 2, 'name': 2, 'denote': 2, 'consult': 2, 'bring_up': 2, 'concern': 2, 'bear_on': 2, 'pertain': 2, 'come_to': 2, 'have-to_doe_with': 2, 'relate': 2, 'mention': 2, 'touch_on': 2, 'content': 2, 'subject_matter': 2, 'substance': 2, 'understand': 2, 'translate': 2, 'represent': 2, 'text_edition': 1, 'schoolbook': 1, 'school_text': 1, 'textbook': 1, 'textual_matter': 1, 'give-and-take': 1, 'Holy_Scripture': 1, 'tidings': 1, 'intelligence': 1, 'Word': 1, 'Holy_Writ': 1, 'Good_Book': 1, 'formulate': 1, 'give_voice': 1, 'phrase': 1, 'Word_of_God': 1, 'Book': 1, 'news': 1, 'watchword': 1, 'word_of_honor': 1, 'countersign': 1, 'password': 1, 'Bible': 1, 'discussion': 1, '

In [19]:
# Distinct terms of the expanded sample (iterating a Counter yields its keys).
a = set(Counter(expand_with_syn(preprocess_text(text_l))))

In [20]:
# Number of distinct terms in the synonym-expanded sample.
len(set(Counter(expand_with_syn(preprocess_text(text_l))).keys()))

168

In [21]:
# No-op scratch loop: the per-term count printout is commented out, so
# iterating over the distinct terms has no visible effect.
for w in a:
    #print(Counter(expand_with_syn(preprocess_text(text_l)))[w])
    pass

In [22]:
def calculate_enhanced_similarity(text1, text2):
    """Cosine similarity of the synonym-expanded term frequencies of two texts.

    Both texts go through ``preprocess_text`` and ``expand_with_syn``; the
    resulting term counts are laid out over the union of their vocabularies
    and compared with cosine similarity.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: The cosine similarity of the two frequency vectors, or 0.0
        when either text yields no terms (a zero vector has no direction, so
        the ratio would otherwise be a division by zero).
    """
    # Preprocess, tokenize and expand with synonyms, then count frequencies.
    freq1 = Counter(expand_with_syn(preprocess_text(text1)))
    freq2 = Counter(expand_with_syn(preprocess_text(text2)))

    # Shared, deterministically ordered vocabulary so both vectors line up.
    vocabulary = sorted(set(freq1) | set(freq2))

    # Counter returns 0 for terms missing from one of the texts.
    vector1 = np.array([freq1[term] for term in vocabulary])
    vector2 = np.array([freq2[term] for term in vocabulary])

    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0

    return float(np.dot(vector1, vector2) / denominator)

In [23]:

def find_best_keymatch(query,records):
    """Pick the record sharing the most distinct lower-cased words with ``query``.

    Both the query and each record are lower-cased and split on whitespace;
    a record's score is the number of distinct words it has in common with
    the query. On ties the earliest record wins.

    Args:
        query: Query string.
        records: Iterable of record strings.

    Returns:
        tuple: ``(best_record, best_score)``; ``(None, 0)`` when no record
        shares a word with the query.
    """
    query_terms = set(query.lower().split())
    winner, top_overlap = None, 0
    for candidate in records:
        overlap = len(query_terms & set(candidate.lower().split()))
        # Strict comparison keeps the first record among equally good ones.
        if overlap > top_overlap:
            winner, top_overlap = candidate, overlap
    return winner, top_overlap

In [24]:
# Sample sentence about RAG vector stores.
# NOTE(review): `w` is not referenced by any later visible cell - confirm
# whether it is meant to serve as a query or as a record.
w= "A RAG vector store is a database or dataset that contains vectorized data points."

#### Advanced RAG:

In [26]:
def find_best_match(text_input, records):
    """Find the record most similar to ``text_input``.

    Every record is scored with ``calculate_enhanced_similarity`` and the
    highest-scoring one is kept; on ties the earliest record wins.

    Args:
        text_input: Query text.
        records: Iterable of record strings to compare against.

    Returns:
        tuple: ``(best_score, best_record)``; ``(0, None)`` when ``records``
        is empty or no record scores above zero. Note the order is the
        reverse of ``find_best_keymatch``, which returns ``(record, score)``.
    """
    best_score = 0
    best_record = None
    for record in records:
        current_score = calculate_enhanced_similarity(text_input, record)
        if current_score > best_score:
            best_score = current_score
            best_record = record

    # Return only after every record has been scored; returning from inside
    # the loop would stop at the first record.
    return best_score, best_record
        

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Toy corpus of four short documents for trying out TF-IDF.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Fit the vectorizer on the corpus and transform it in one step; X holds
# one row per document.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [4]:
# Vocabulary learned from the corpus, in column order of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [5]:
# (number of documents, number of vocabulary terms)
X.shape

(4, 9)

In [7]:
# Print the sparse TF-IDF matrix as (row, column) coordinates with weights.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>
  Coords	Values
  (0, 8)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 1)	0.6876235979836938
  (1, 5)	0.5386476208856763
  (2, 8)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 6)	0.267103787642168
  (2, 0)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 4)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045
