In [7]:
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Task 1: Downloading and saving the Word2Vec vectors
# Replace 'location' with the path to the downloaded pretrained vectors
location = r'D:\GoogleNews-vectors-negative300.bin.gz'  # Update with your file path
wv = KeyedVectors.load_word2vec_format(location, binary=True, limit=1000000)
wv.save_word2vec_format('vectors.txt', binary=False)  # Saving as text file

# Task 2: Load the processed word embeddings and phrases.csv
word_vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

phrases_df = pd.read_csv(r'D:\phrases.csv', encoding='latin1')
#Task 3: Making functions
# Function to calculate phrase vector as the normalized sum of word vectors
def phrase_vector(phrase):
    words = phrase.split()
    vector_sum = np.zeros((word_vectors.vector_size,), dtype="float32")
    word_count = 0
    for word in words:
        if word in word_vectors:
            vector_sum = np.add(vector_sum, word_vectors[word])
            word_count += 1
    if word_count == 0:
        return None
    return vector_sum / word_count

# Calculate similarity between phrases
def calculate_similarity(phrase1, phrase2):
    vec1 = phrase_vector(phrase1)
    vec2 = phrase_vector(phrase2)
    if vec1 is None or vec2 is None:
        return None
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))

# Calculate similarity of phrases in phrases.csv with each other
similarities = []
for idx, row in phrases_df.iterrows():
    phrase = row['Phrases']
    for inner_idx, inner_row in phrases_df.iterrows():
        inner_phrase = inner_row['Phrases']
        similarity = calculate_similarity(phrase, inner_phrase)
        similarities.append({
            'Phrase 1': phrase,
            'Phrase 2': inner_phrase,
            'Similarity': similarity[0][0] if similarity is not None else None
        })

# Convert similarities to DataFrame
similarities_df = pd.DataFrame(similarities)




In [6]:

# Function to find closest match to a given string

def closest_match(input_phrase):
    max_similarity = -1
    closest_phrase = None
    for idx, row in phrases_df.iterrows():
        phrase = row['Phrases']
        similarity = calculate_similarity(input_phrase, phrase)
        if similarity is not None and similarity > max_similarity:
            max_similarity = similarity
            closest_phrase = phrase
    return closest_phrase, max_similarity
# Example usage:
input_phrase = input("Enter your text here")
closest, similarity = closest_match(input_phrase)
print(f"Closest phrase: {closest}\nSimilarity: {similarity}")

Enter your text here Insurance


Closest phrase: Insurance premiums market in Country
Similarity: [[0.7475684]]
