                                                NLP ASSIGNMENT - 5

TASK - 1

In [34]:
import pandas as pd

# Read the resolved-query catalogue and the new query variations from disk.
resolved = pd.read_csv("resolved_queries.csv")
new_queries = pd.read_csv("new_queries.csv")

# Column names of each frame, shown as plain Python lists.
resolved_columns = resolved.columns.tolist()
print("Resolved queries columns:", resolved_columns)
new_columns = new_queries.columns.tolist()
print("New queries columns:", new_columns)

# Preview the first rows of each frame. The heading and the preview are
# written on separate lines, with a blank line ahead of each heading.
print("\nSample resolved data:", resolved.head(), sep="\n")
print("\nSample new data:", new_queries.head(), sep="\n")


Resolved queries columns: ['Query_ID', 'Pre_Resolved_Query']
New queries columns: ['Variation_Query', 'Matches_With_Query_ID']

Sample resolved data:
   Query_ID                    Pre_Resolved_Query
0         1     Unable to connect to the internet
1         2        Payment failed during checkout
2         3     App crashes when opening settings
3         4   Forgot password and unable to reset
4         5  Unable to upload files to the server

Sample new data:
                             Variation_Query  Matches_With_Query_ID
0           Unabel to conect to the internet                      1
1                  Can’t connect to internet                      1
2                        Intenet not working                      1
3               Payment failed while chekout                      2
4  Payment did not go through during chckout                      2


In [35]:
# --- Task 1: Match new queries with resolved queries ---
import pandas as pd
from rapidfuzz import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
# Each frame is read and immediately given a `clean_query` column holding the
# lower-cased, whitespace-trimmed version of its free-text query column.
resolved = pd.read_csv("resolved_queries.csv").assign(
    clean_query=lambda frame: frame['Pre_Resolved_Query'].str.lower().str.strip()
)
new_queries = pd.read_csv("new_queries.csv").assign(
    clean_query=lambda frame: frame['Variation_Query'].str.lower().str.strip()
)

# -------------------
# 1. FUZZY SEARCH
# -------------------
def fuzzy_match(query, choices, scorer=fuzz.token_sort_ratio, threshold=80):
    """Return the choice most similar to ``query`` together with its score.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : iterable or mapping of str
        Candidate strings, passed straight to ``process.extractOne``.
    scorer : callable, optional
        rapidfuzz scorer used to rate each candidate. Defaults to
        ``fuzz.token_sort_ratio``.
    threshold : float, optional
        Minimum score the best candidate must reach to be accepted.

    Returns
    -------
    tuple
        ``(match, score)``. ``match`` is the best-scoring candidate, or
        ``None`` when its score is below ``threshold``. When
        ``process.extractOne`` finds nothing to return, the result is
        ``(None, 0.0)``.
    """
    best = process.extractOne(query, choices, scorer=scorer)
    # extractOne returns None rather than a tuple when it has no result
    # (e.g. no choices to compare against); unpacking None would raise
    # TypeError, so report "no match" explicitly instead.
    if best is None:
        return None, 0.0

    match, score, _ = best
    if score >= threshold:
        return match, score
    return None, score

# Score every normalized new query against the resolved catalogue; each row
# is (query, accepted match or None, best score).
fuzzy_results = [
    (variation, *fuzzy_match(variation, resolved['clean_query'], threshold=80))
    for variation in new_queries['clean_query']
]

fuzzy_df = pd.DataFrame(
    fuzzy_results,
    columns=["variation_query", "matched_resolved", "fuzzy_score"],
)
print("Fuzzy Matching Results:")
print(fuzzy_df.head())

# -------------------
# 2. TF-IDF + COSINE SIMILARITY
# -------------------
# Vectorize with TF-IDF
# One vocabulary is fit over both corpora (new queries first, then resolved
# queries) so that the two sub-matrices share the same feature columns.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(
    list(new_queries['clean_query']) + list(resolved['clean_query'])
)

# Split the stacked matrix back into its two halves by row position.
new_matrix = tfidf_matrix[:len(new_queries)]
resolved_matrix = tfidf_matrix[len(new_queries):]

# Rows are new queries, columns are resolved queries.
cosine_sim = cosine_similarity(new_matrix, resolved_matrix)

# Best resolved query per new query, computed for all rows at once.
best_idx = cosine_sim.argmax(axis=1)
best_scores = cosine_sim.max(axis=1)

# A row of all zeros means the query shares no vocabulary term with any
# resolved query. argmax would still return index 0 for such a row, which
# would wrongly present the first resolved query as the match, so those rows
# are reported as None (the same "no match" marker the fuzzy section uses).
cosine_results = [
    (query, resolved['clean_query'].iloc[idx] if score > 0 else None, score)
    for query, idx, score in zip(new_queries['clean_query'], best_idx, best_scores)
]

cosine_df = pd.DataFrame(
    cosine_results,
    columns=["variation_query", "matched_resolved", "cosine_score"],
)
print("Cosine Similarity Results:")
print(cosine_df.head())


Fuzzy Matching Results:
                             variation_query  \
0           unabel to conect to the internet   
1                  can’t connect to internet   
2                        intenet not working   
3               payment failed while chekout   
4  payment did not go through during chckout   

                    matched_resolved  fuzzy_score  
0  unable to connect to the internet    95.384615  
1                               None    65.517241  
2                               None    46.153846  
3                               None    75.862069  
4                               None    64.788732  
Cosine Similarity Results:
                             variation_query  \
0           unabel to conect to the internet   
1                  can’t connect to internet   
2                        intenet not working   
3               payment failed while chekout   
4  payment did not go through during chckout   

                    matched_resolved  cosine_score  
0  una

TASK - 2

In [37]:
import pandas as pd

# Read the canonical names and the observed name variations from disk.
base_names = pd.read_csv("base_names.csv")
name_variations = pd.read_csv("name_variations.csv")

# Column names of each frame, shown as plain Python lists.
base_columns = base_names.columns.tolist()
print("Base names columns:", base_columns)
variation_columns = name_variations.columns.tolist()
print("Name variations columns:", variation_columns)

# Preview the first rows of each frame. The heading and the preview are
# written on separate lines, with a blank line ahead of each heading.
print("\nSample base_names data:", base_names.head(), sep="\n")
print("\nSample name_variations data:", name_variations.head(), sep="\n")


Base names columns: ['Base_Name_ID', 'Base_Name']
Name variations columns: ['Variation', 'Matches_With_Base_Name']

Sample base_names data:
   Base_Name_ID         Base_Name
0             1        John Smith
1             2    Jennifer Brown
2             3  Michael O'Connor
3             4      Maria Garcia
4             5        Robert Lee

Sample name_variations data:
      Variation Matches_With_Base_Name
0  Thomas  King            Thomas King
1    ThomasKing            Thomas King
2  Maria Garcia           Maria Garcia
3     MaryLewis             Mary Lewis
4      Nancy W.           Nancy Wright


In [38]:
# --- Task 2: Match Names ---
import pandas as pd
from rapidfuzz import process, fuzz

# Load datasets
# Read both CSV files in order and bind them to their frame names.
base_names, name_variations = (
    pd.read_csv(csv_path)
    for csv_path in ("base_names.csv", "name_variations.csv")
)

# Normalize function
def clean_name(name):
    """Normalize a personal name for fuzzy comparison.

    The value is converted to text, lower-cased, has its commas turned into
    spaces, and has every run of whitespace collapsed to a single space with
    no leading or trailing whitespace.

    Parameters
    ----------
    name : object
        Raw name value. ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The normalized name, or ``""`` for a missing value.
    """
    # A missing cell must not be stringified to "none"/"nan", which would
    # then be compared against real names. `name != name` is true only for NaN.
    if name is None or (isinstance(name, float) and name != name):
        return ""

    text = str(name).lower()
    # Commas become separators rather than vanishing, so "Smith,John" keeps
    # its two tokens instead of collapsing to "smithjohn".
    text = text.replace(",", " ")
    # split()/join drops leading/trailing whitespace and squeezes inner runs.
    return " ".join(text.split())

# Attach a `clean` column holding the normalized form of each name.
base_names = base_names.assign(clean=base_names['Base_Name'].map(clean_name))
name_variations = name_variations.assign(
    clean=name_variations['Variation'].map(clean_name)
)

# Function to match variations to base names
def match_names(name, choices, threshold=85):
    """Return the base name most similar to ``name`` together with its score.

    Candidates are rated with ``fuzz.token_sort_ratio`` through
    ``process.extractOne``.

    Parameters
    ----------
    name : str
        Name variation to look up.
    choices : iterable or mapping of str
        Candidate base names.
    threshold : float, optional
        Minimum score the best candidate must reach to be accepted.

    Returns
    -------
    tuple
        ``(match, score)``. ``match`` is the best-scoring candidate, or
        ``None`` when its score is below ``threshold``. When
        ``process.extractOne`` finds nothing to return, the result is
        ``(None, 0.0)``.
    """
    # NOTE(review): the sample output of this notebook shows variations that
    # lack a space (e.g. "thomasking") scoring about 57 against their base
    # name with this scorer; a less whitespace-dependent scorer may be worth
    # evaluating.
    best = process.extractOne(name, choices, scorer=fuzz.token_sort_ratio)
    # extractOne returns None rather than a tuple when it has no result
    # (e.g. no choices to compare against); unpacking None would raise
    # TypeError, so report "no match" explicitly instead.
    if best is None:
        return None, 0.0

    match, score, _ = best
    if score >= threshold:
        return match, score
    return None, score

# Score every normalized variation against the base names; each row is
# (variation, accepted base name or None, best score).
name_results = [
    (variant, *match_names(variant, base_names['clean'], threshold=85))
    for variant in name_variations['clean']
]

names_df = pd.DataFrame(
    name_results,
    columns=["name_variation", "matched_base", "score"],
)
print("Name Matching Results:")
print(names_df.head())

# Optional: write the full result table to disk, without the index column.
names_df.to_csv("name_matches.csv", index=False)


Name Matching Results:
  name_variation  matched_base       score
0    thomas king   thomas king  100.000000
1     thomasking          None   57.142857
2   maria garcia  maria garcia  100.000000
3      marylewis          None   52.631579
4       nancy w.          None   70.000000
