🛠️ 1. Setup and Configuration

In [None]:
# Import necessary libraries
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from fuzzywuzzy import fuzz

# Load the spaCy pipeline once at module level; preprocess_text() reuses it
# for lemmas, stop-word/punctuation flags, and named-entity spans.
# NOTE(review): presumably the "en_core_web_sm" package has to be installed
# separately (e.g. `python -m spacy download en_core_web_sm`) -- confirm for
# the target environment.
nlp = spacy.load("en_core_web_sm")

# Configuration: file paths and matching parameters
input_file_path = "restaurants.csv"  # Read with pd.read_csv in the data-loading step
output_file_path = "resolved_entities.csv"
text_columns = ["name", "address", "city", "phone", "category"]  # Columns to preprocess
similarity_threshold = 0.6
# Equal per-column weights that sum to 1 for any number of columns. A
# hard-coded 0.2 is only correct when there are exactly five columns; for the
# current list this expression yields the very same values.
weights = [1 / len(text_columns)] * len(text_columns) if text_columns else []

📥 2. Data Loading

In [None]:
# Read the configured CSV into the working DataFrame
df = pd.read_csv(input_file_path)

# Preview the first rows to sanity-check the load
print("Sample of the loaded dataset:")
display(df.head(n=5))

🔄 3. Text Preprocessing

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize one cell value into a lowercase string of lemmas and entities.

    Missing values become an empty string. Any other value is converted with
    ``str()`` before lowercasing, so non-string cells (for example a phone
    column that pandas parsed as integers or floats) no longer raise
    ``AttributeError`` on ``.lower()``.

    Args:
        text: Raw cell value; may be a string, a number, or NaN/None.

    Returns:
        The space-joined lemmas of every token that is neither a stop word
        nor punctuation, followed by the text of each entity in ``doc.ents``
        (entity text is appended in addition to the lemmas, not instead of
        them).
    """
    if pd.isna(text):  # Handle missing values
        return ""
    # The pipeline only accepts text, hence the explicit str() coercion.
    doc = nlp(str(text).lower())
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    entities = [ent.text for ent in doc.ents]
    return " ".join(lemmas + entities)

# Add a "processed_<column>" companion for every configured column that the
# dataset actually contains
for col in text_columns:
    if col not in df.columns:
        continue  # configured column absent from this dataset; nothing to do
    df[f"processed_{col}"] = df[col].apply(preprocess_text)

# Preview the frame, now including the processed_* columns
print("Sample of preprocessed data:")
display(df.head(n=5))

📊 4. TF-IDF Similarity Calculation

In [None]:
# Function to compute similarity
def compute_similarity(columns, weights):
    """Compute a weighted sum of per-column TF-IDF cosine similarities.

    Operates on the module-level ``df``. For every ``(column, weight)`` pair
    whose column exists in ``df``, the column is vectorized with a fresh
    ``TfidfVectorizer``, the pairwise cosine similarity of its rows is
    computed, and that matrix times ``weight`` is added to the result.

    Args:
        columns: Names of the text columns in ``df`` to compare.
        weights: Weights paired positionally with ``columns`` via ``zip``;
            surplus entries on either side are ignored.

    Returns:
        A ``(len(df), len(df))`` NumPy array holding the weighted sum of the
        per-column similarity matrices.

    Raises:
        ValueError: Propagated from scikit-learn for any vectorization
            failure other than an empty vocabulary.
    """
    n_rows = len(df)
    combined_similarity = np.zeros((n_rows, n_rows))
    for col, weight in zip(columns, weights):
        if col not in df.columns:
            continue
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform(df[col])
        except ValueError as err:
            # scikit-learn reports "empty vocabulary" when no document in the
            # column yields a single token (e.g. every cell is blank). Such a
            # column carries no similarity signal, so it contributes zero
            # rather than aborting the whole computation.
            if "empty vocabulary" not in str(err):
                raise
            continue
        combined_similarity += cosine_similarity(tfidf_matrix) * weight
    return combined_similarity

# Columns for similarity computation (only those that were actually created)
processed_columns = [f"processed_{col}" for col in text_columns if f"processed_{col}" in df.columns]

# Keep each weight attached to its own source column: filter the weights with
# the same condition as the columns. Passing the unfiltered list would let a
# missing column shift every later weight onto the wrong column when
# compute_similarity() zips the two lists together.
processed_weights = [
    weight
    for col, weight in zip(text_columns, weights)
    if f"processed_{col}" in df.columns
]

# Compute similarity matrix
combined_similarity = compute_similarity(processed_columns, processed_weights)


🧮 5. Fuzzy Matching Adjustment

In [None]:
# Function to enhance similarity using fuzzy matching
def adjust_similarity_with_fuzzy(similarity_matrix, df, columns):
    """Raise pairwise similarity scores to the mean fuzzy ratio where higher.

    For every unordered pair of rows ``(i, j)`` the mean of
    ``fuzz.ratio(...) / 100`` over the usable columns is computed, and both
    ``similarity_matrix[i, j]`` and ``similarity_matrix[j, i]`` are set to the
    larger of the current ``[i, j]`` entry and that mean.

    Args:
        similarity_matrix: Square array indexed by row position in ``df``.
            It is modified in place.
        df: DataFrame whose rows are compared.
        columns: Column names to compare; names absent from ``df`` are
            ignored.

    Returns:
        The same ``similarity_matrix`` object, after in-place adjustment. If
        none of ``columns`` exists in ``df`` the matrix is returned untouched.
    """
    present = [col for col in columns if col in df.columns]
    if not present:
        # Nothing to compare: skip the pair loop instead of averaging an
        # empty list (which yields NaN and a RuntimeWarning per pair).
        return similarity_matrix

    n_rows = len(df)
    # Stringify every cell once up front rather than doing .iloc lookups and
    # str() conversions inside the O(n^2) pair loop.
    # NOTE: str() turns missing values into the literal "nan", so two missing
    # cells compare as identical -- same as before this refactor.
    column_values = []
    for col in present:
        series = df[col]
        column_values.append([str(series.iloc[k]) for k in range(n_rows)])

    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            fuzzy_score = np.mean([
                fuzz.ratio(values[i], values[j]) / 100
                for values in column_values
            ])
            best = max(similarity_matrix[i, j], fuzzy_score)
            similarity_matrix[i, j] = similarity_matrix[j, i] = best
    return similarity_matrix

# Lift each pair's score to its mean fuzzy ratio over the raw text columns
# whenever that ratio exceeds the TF-IDF based score
combined_similarity = adjust_similarity_with_fuzzy(
    similarity_matrix=combined_similarity,
    df=df,
    columns=text_columns,
)


📊 7. Visualization of Results

In [None]:
# Show the member rows of each cluster, one table per cluster id
print("Clusters and their data:")
for cluster_id in range(len(clusters)):
    print(f"Cluster {cluster_id}:")
    in_cluster = df["cluster_id"] == cluster_id
    display(df[in_cluster][text_columns])
    print("\n")
