Step 1: Import Libraries and Load Data

In [12]:
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from fuzzywuzzy import process 

#csv_file_path = 'phrases.csv'
#df = pd.read_csv(csv_file_path)
df = pd.read_csv('C:\\Users\\Saku\\OneDrive\\Desktop\\text_distance.csv', encoding='latin-1')


df.head()

Unnamed: 0,Phrases
0,how company compares to its peers?
1,what is the detailed income statement breakdow...
2,world premium penetration in 2020
3,How does the forecasted insurance premium pene...
4,what are the total losses for companies in cou...


In [26]:
file_path = 'C:\\Users\\Saku\\Downloads\\GoogleNews-vectors-negative300.bin.gz'

wv = KeyedVectors.load_word2vec_format(file_path, binary=True, limit=100000)

flat_file_path = 'vectors.txt'
wv.save_word2vec_format(flat_file_path)

Step 2: Clean Duplicates, Outliers, and Stopwords

In [30]:
"""""
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Remove duplicates
df_cleaned = df.drop_duplicates(subset=['Phrase'])

# Remove outliers (you may customize the criteria)
df_cleaned = df_cleaned[df_cleaned['Phrase'].apply(lambda x: len(x.split()) > 1)]

# Remove stopwords
stop_words = set(stopwords.words('english'))
df_cleaned['Cleaned_Phrase'] = df_cleaned['Phrase'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.lower() not in stop_words]))

# Display the cleaned dataframe
df_cleaned.head()
"""""
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Check if 'Phrase' column exists in the original DataFrame
if 'Phrases' not in df.columns:
    print("Error: 'Phrase' column not found in the DataFrame.")
else:
    df_cleaned = df.drop_duplicates(subset=['Phrases'])

    df_cleaned = df_cleaned[df_cleaned['Phrases'].apply(lambda x: len(x.split()) > 1)]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    if 'Phrases' in df_cleaned.columns:
        df_cleaned['Cleaned_Phrase'] = df_cleaned['Phrases'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.lower() not in stop_words]))
        df_cleaned.head()
    else:
        print("Error: 'Phrase' column not found in the cleaned DataFrame.")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Step 3: Assign Word2Vec Embeddings to Cleaned Phrases

In [31]:
def get_word_embedding(word):
    # Check for an exact match
    if word in wv:
        return wv[word]
    else:
        # Find the closest similar word using Levenshtein distance
        closest_match, _ = process.extractOne(word, wv.vocab.keys())
        return wv[closest_match]

# Function to calculate the approximate phrase vector.
def calculate_phrase_vector(phrase):
    words = phrase.split()
    # Get word embeddings, handling missing and similar words
    word_vectors = [get_word_embedding(word) for word in words]
    normalized_sum = np.sum(word_vectors, axis=0) / np.linalg.norm(np.sum(word_vectors, axis=0))
    return normalized_sum

# Calculate phrase vectors for all cleaned phrases
phrase_vectors_cleaned = np.array([calculate_phrase_vector(phrase) for phrase in df_cleaned['Cleaned_Phrase']])


AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

Step 4: Batch Execution - Calculate L2 and Cosine Distances

In [None]:
# Calculate L2 distances between all pairs of cleaned phrases
l2_distances_cleaned = euclidean_distances(phrase_vectors_cleaned)

# Calculate Cosine distances between all pairs of cleaned phrases
cosine_distances_cleaned = cosine_distances(phrase_vectors_cleaned)


Step 5: On-the-Fly Execution - Find Closest Match and Distance

In [None]:
def find_closest_match(user_input):
    cleaned_user_input = ' '.join([word.lower() for word in word_tokenize(user_input) if word.lower() not in stop_words])
    user_vector = calculate_phrase_vector(cleaned_user_input)

    # Calculate cosine distances between user input and all cleaned phrases
    user_distances_cleaned = cosine_distances([user_vector], phrase_vectors_cleaned.flatten().reshape(1, -1))

    # Find the index of the closest match
    closest_match_index = np.argmin(user_distances_cleaned)

    # Extract the closest match and its distance
    closest_match_phrase = df_cleaned.loc[closest_match_index, 'Cleaned_Phrase']
    closest_match_distance = user_distances_cleaned[0, closest_match_index]

    return closest_match_phrase, closest_match_distance

# Example usage of on-the-fly execution function
user_input_phrase = "your input phrase here"
closest_match, distance = find_closest_match(user_input_phrase)
print(f"Closest Match: {closest_match}")
print(f"Distance: {distance:.4f}")


Step 6: Save Results to CSV

In [None]:
# Store the results in a DataFrame
results_df_cleaned = pd.DataFrame({
    'Phrase1': df_cleaned['Cleaned_Phrase'].repeat(len(df_cleaned)),
    'Phrase2': np.tile(df_cleaned['Cleaned_Phrase'], len(df_cleaned)),
    'L2_Distance': l2_distances_cleaned.flatten(),
    'Cosine_Distance': cosine_distances_cleaned.flatten()
})

# Save the results to a CSV file
results_df_cleaned.to_csv('distances_results_cleaned.csv', index=False)
