In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process





In [3]:

# Directory that holds both CSV inputs. It is stated once so the notebook can
# be pointed at another location by editing a single constant.
DATA_DIR = 'C:/Users/monis/Desktop/NMIMS - Data Science/SEM 7/NLP'

resolved_queries_path = f'{DATA_DIR}/resolved_queries.csv'
new_queries_path = f'{DATA_DIR}/new_queries.csv'

# resolved_queries: known queries (Query_ID, Pre_Resolved_Query).
# new_queries: query variations labelled with the Query_ID they should match
# (Variation_Query, Matches_With_Query_ID).
resolved_queries = pd.read_csv(resolved_queries_path)
new_queries = pd.read_csv(new_queries_path)

resolved_queries.head()


Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [4]:
# Preprocessing function
def preprocess_text(text):
    """Normalise a query string before matching.

    The text is lower-cased and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str, None or float
        Raw query text. ``None`` and NaN (the value pandas produces for a
        blank CSV cell) are treated as an empty query instead of raising
        ``AttributeError``. Any other non-string value is converted with
        ``str()`` before normalisation.

    Returns
    -------
    str
        The normalised query; ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.lower().strip()

In [5]:
# Normalise the text column of each frame element-wise, overwriting the
# original values, so both sides of the comparison share the same casing and
# have no surrounding whitespace.
resolved_queries['Pre_Resolved_Query'] = resolved_queries['Pre_Resolved_Query'].map(preprocess_text)
new_queries['Variation_Query'] = new_queries['Variation_Query'].map(preprocess_text)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
def fuzzy_match(new_query, resolved_queries, threshold=80):
    """Find resolved queries whose text closely resembles ``new_query``.

    Each row of ``resolved_queries`` is scored against ``new_query`` with
    ``fuzz.ratio``; rows scoring below ``threshold`` are dropped.

    Parameters
    ----------
    new_query : str
        The incoming query text.
    resolved_queries : pandas.DataFrame
        Frame providing ``Query_ID`` and ``Pre_Resolved_Query`` columns.
    threshold : int, default 80
        Minimum ``fuzz.ratio`` score (inclusive) for a row to be kept.

    Returns
    -------
    list of tuple
        ``(Query_ID, score)`` pairs ordered from highest to lowest score;
        an empty list when no row reaches the threshold.
    """
    candidates = [candidate for _, candidate in resolved_queries.iterrows()]
    similarities = [
        fuzz.ratio(new_query, candidate['Pre_Resolved_Query'])
        for candidate in candidates
    ]
    hits = [
        (candidate['Query_ID'], similarity)
        for candidate, similarity in zip(candidates, similarities)
        if similarity >= threshold
    ]
    # Stable sort: candidates with equal scores keep their frame order.
    return sorted(hits, key=lambda hit: hit[1], reverse=True)

In [8]:
# Score every new query against the resolved set; each cell of the new column
# holds that query's list of (Query_ID, score) matches.
new_queries['Fuzzy_Matches'] = new_queries['Variation_Query'].apply(fuzzy_match, args=(resolved_queries,))


In [10]:
# Preview the first rows to compare Fuzzy_Matches with the expected
# Matches_With_Query_ID labels.
new_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Fuzzy_Matches
0,unabel to conect to the internet,1,"[(1, 95)]"
1,can’t connect to internet,1,[]
2,intenet not working,1,[]
3,payment failed while chekout,2,"[(2, 83)]"
4,payment did not go through during chckout,2,[]


In [11]:
# TF-IDF with Cosine Similarity
# The vectorizer is fitted on the resolved queries only, so the vocabulary and
# IDF weights come from that set. The new queries are then transformed with
# the already-fitted vectorizer so both matrices share the same feature space;
# the fit must therefore happen before the transform.
vectorizer = TfidfVectorizer()
tfidf_matrix_resolved = vectorizer.fit_transform(resolved_queries['Pre_Resolved_Query'])
tfidf_matrix_new = vectorizer.transform(new_queries['Variation_Query'])

In [12]:
# Pairwise similarity between the two TF-IDF matrices: rows follow the new
# queries (first argument), columns follow the resolved queries (second argument).
cosine_similarities = cosine_similarity(tfidf_matrix_new, tfidf_matrix_resolved)


In [13]:
def get_best_cosine_matches(cosine_similarities, threshold=0.5, resolved=None):
    """Pick the single most similar resolved query for each new query.

    Parameters
    ----------
    cosine_similarities : 2-D array
        Similarity scores indexed as ``[new_query, resolved_query]``.
    threshold : float, default 0.5
        Minimum score (inclusive) for the best candidate to count as a match.
    resolved : pandas.DataFrame, optional
        Frame whose ``Query_ID`` column is positionally aligned with the
        columns of ``cosine_similarities``. When omitted, the notebook-level
        ``resolved_queries`` frame is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    list of tuple
        One ``(Query_ID, score)`` pair per row of ``cosine_similarities``.
        ``Query_ID`` is ``None`` when the best score is below ``threshold``;
        the best score itself is always reported.
    """
    if resolved is None:
        # Backward compatibility: earlier callers relied on the global frame.
        resolved = resolved_queries

    # Read the ID column once instead of materialising a row per lookup.
    query_ids = list(resolved['Query_ID'])

    matches = []
    for i in range(cosine_similarities.shape[0]):
        row_scores = cosine_similarities[i]
        best_index = row_scores.argmax()
        best_score = row_scores[best_index]
        matched_id = query_ids[best_index] if best_score >= threshold else None
        matches.append((matched_id, best_score))
    return matches

In [14]:
# Store one (Query_ID or None, best score) tuple per new query, in row order.
new_queries['Cosine_Matches'] = get_best_cosine_matches(cosine_similarities)


In [15]:
# Evaluation Metric
def evaluate_matches(new_queries):
    """Measure how often each matcher recovers the expected query ID.

    Parameters
    ----------
    new_queries : pandas.DataFrame
        Frame with three columns:
        ``Matches_With_Query_ID`` (the expected ID),
        ``Fuzzy_Matches`` (a list of ``(Query_ID, score)`` tuples) and
        ``Cosine_Matches`` (a single ``(Query_ID or None, score)`` tuple).

    Returns
    -------
    tuple of float
        ``(fuzzy_accuracy, cosine_accuracy)``, each the fraction of rows
        counted as correct. A fuzzy row is correct when the expected ID
        appears anywhere in its match list; a cosine row is correct when its
        single best match equals the expected ID. Both values are ``0.0``
        for an empty frame (instead of raising ``ZeroDivisionError``).
    """
    total = len(new_queries)
    if total == 0:
        return 0.0, 0.0

    fuzzy_hits = 0
    cosine_hits = 0
    # Single pass over the three columns; no per-row Series is built.
    for expected, fuzzy_matches, cosine_match in zip(
        new_queries['Matches_With_Query_ID'],
        new_queries['Fuzzy_Matches'],
        new_queries['Cosine_Matches'],
    ):
        if expected in [match[0] for match in fuzzy_matches]:
            fuzzy_hits += 1
        if expected == cosine_match[0]:
            cosine_hits += 1

    return fuzzy_hits / total, cosine_hits / total

In [16]:
# Report the fraction of new queries each approach resolved to the expected ID.
fuzzy_accuracy, cosine_accuracy = evaluate_matches(new_queries)
print(
    f"Fuzzy Search Accuracy: {fuzzy_accuracy}",
    f"Cosine Similarity Accuracy: {cosine_accuracy}",
    sep="\n",
)


Fuzzy Search Accuracy: 0.4
Cosine Similarity Accuracy: 0.9


In [17]:
# The text column now shows the lower-cased values because the preprocessing
# step overwrote Pre_Resolved_Query in place.
resolved_queries.head()


Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,unable to connect to the internet
1,2,payment failed during checkout
2,3,app crashes when opening settings
3,4,forgot password and unable to reset
4,5,unable to upload files to the server
