In [10]:
import gensim
import numpy
from gensim.models import KeyedVectors

# Use your own path to save the file
download_path = "GoogleNews-vectors-negative300.bin"

# Load the vectors using Gensim
word2vec_model = KeyedVectors.load_word2vec_format(download_path, binary=True, limit=1000000)

# Save them as a flat file
word2vec_model.save_word2vec_format('vectors.csv')

INFO:gensim.models.keyedvectors:loading projection weights from GoogleNews-vectors-negative300.bin
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (1000000, 300) matrix of type float32 from GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-11-09T14:11:19.866866', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}
INFO:gensim.models.keyedvectors:storing 1000000x300 projection weights into vectors.csv


In [11]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [64]:
# from numpy import *
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)

# Word_embedder.py Code here

In [140]:

class WordEmbedder:
    def __init__(self, word2vec_model):
        self.word2vec_model = word2vec_model

    def get_embedding(self, word):
        return self.word2vec_model[word] if word in self.word2vec_model else 1


# Distance_calculator.py Code here

In [141]:

class DistanceCalculator:
    def calculate_distance(self, vector1, vector2):
        # Use either L2 or Cosine distance calculation logic
        return numpy.linalg.norm(vector1 - vector2)  # L2 distance

# phrase_similarity_calculator.py code here

In [153]:

class PhraseSimilarityCalculator:
    def __init__(self, word_embedder, distance_calculator):
        self.word_embedder = word_embedder
        self.distance_calculator = distance_calculator

    def calculate_phrase_similarity(self, phrase1, phrase2):
        embeddings1 = [self.word_embedder.get_embedding(word) for word in phrase1.split()]
        embeddings2 = [self.word_embedder.get_embedding(word) for word in phrase2.split()]
        
        
        
        # Approximate phrase vectors by normalizing the sum of word embeddings
        vector1 = numpy.sum(embeddings1, axis=0) / numpy.linalg.norm(numpy.sum(embeddings1, axis=0))
        vector2 = numpy.sum(embeddings2, axis=0) / numpy.linalg.norm(numpy.sum(embeddings2, axis=0))


        return self.distance_calculator.calculate_distance(vector1, vector2)
    
    # phrase_similarity_calculator.py (add the following function)
    
    def find_closest_match(self,user_input, phrases):
        distances = [self.calculate_phrase_similarity(user_input, phrase) for phrase in phrases]
#         print(distances)
        min_distance = min(distances)
        closest_match = phrases[distances.index(min_distance)]
        return closest_match, min_distance
        


# STEP 3: Batch Execution -> Main.py

In [154]:

# from phrase_similarity_calculator import PhraseSimilarityCalculator

# Set up logging, argument parsing, and handle configurations

# Initialize components
word_embedder = WordEmbedder(word2vec_model)
distance_calculator = DistanceCalculator()
phrase_similarity_calculator = PhraseSimilarityCalculator(word_embedder, distance_calculator)


In [155]:
import csv


# Specify the path to your CSV file
csv_file_path = 'phrases.csv'

# Read phrases from CSV
def read_phrases_from_csv(csv_path):
    phrases = []

    try:
        with open(csv_path, 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                # Assuming each row in the CSV file contains a single phrase
                phrases.append(row[0])  # Adjust the index based on your CSV structure

    except FileNotFoundError:
        print(f"Error: File not found - {csv_path}")

    return phrases

# Call the function to read phrases
phrases = read_phrases_from_csv(csv_file_path)

# # # Print or use the phrases as needed
print(phrases)


['Phrases', 'how company compares to its peers?', 'what is the detailed income statement breakdown of Axa?', 'world premium penetration in 2020', 'How does the forecasted insurance premium penetration in country trend compare to its peers?', 'what are the total losses for companies in country  non life market?', 'Insurance premiums market in Country', 'How have the profit margins been for the airlines industry through the years, by regions?', 'What is the complete profile of the top oil rigs?', 'what is company general information?', 'How is the airline industry performing globally?', 'how does economic profit for in country compare to others?', 'most profitable insurance company India', 'Give me a detailed breakup of the income and expenses in South-East Asia', 'Show me all the oil rigs in the world', 'What were the premiums earned by the Indian insurance industry in 2020?', 'How much have premiums grown as compared to claims for India?', 'Which are the best performing insurance firms

In [159]:

user_input_phrase = "Phrases s"

# Call the find_closest_match function
closest_match, min_distance = phrase_similarity_calculator.find_closest_match(user_input_phrase, phrases)

# Print or use the results
print("Closest Match:", closest_match)
print("Minimum Distance:", min_distance)


Closest Match: Phrases
Minimum Distance: 0.511639
