<a href="https://colab.research.google.com/github/GrabowMar/ProjektPJN/blob/main/PJNprojekt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%pip install fuzzywuzzy
%pip install python-Levenshtein
%pip install spacy
%python -m spacy download en_core_web_md
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install --upgrade jupyter ipywidgets
%pip install --upgrade transformers
%pip install --upgrade torch
import os
os.environ["CUDA_PATH"] = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2"
print(f"CUDA_PATH set to: {os.environ['CUDA_PATH']}")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting spacy
  Using cached spacy-3.8.3-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.11-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.10-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Using cached thinc-8.3.3-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cache

UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [4]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from fuzzywuzzy import fuzz


# Load dataset
def load_dataset(file_path):
    """Load the dataset from the specified CSV file.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(file_path)``.

    Raises:
        FileNotFoundError: If reading fails for any reason (missing file,
            parse error, ...). The underlying exception is attached as
            ``__cause__`` so the real failure stays visible in the traceback.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Keep the historical exception type for callers, but chain the
        # original error explicitly instead of hiding it behind the message.
        raise FileNotFoundError(f"Failed to load file {file_path}: {e}") from e


# Load SpaCy model
def load_spacy_model(model_name="en_core_web_sm"):
    """Load the SpaCy NLP model.

    Args:
        model_name: Name passed to ``spacy.load``. Defaults to
            ``"en_core_web_sm"``.

    Returns:
        Whatever ``spacy.load(model_name)`` returns.

    Raises:
        ImportError: If loading fails for any reason. The underlying exception
            is attached as ``__cause__`` so the real failure stays visible.
    """
    try:
        return spacy.load(model_name)
    except Exception as e:
        # Same exception type as before; the explicit chain preserves the
        # original error for debugging.
        raise ImportError(f"Failed to load SpaCy model {model_name}: {e}") from e


# Preprocess text using SpaCy
def preprocess_text(text, nlp):
    """
    Lemmatize text and remove stop words and punctuation.

    Missing values (anything ``pd.isna`` reports as missing) yield an empty
    string. Non-string values (for example a numeric phone column) are
    converted with ``str()`` before processing instead of failing on
    ``.lower()``.

    Args:
        text: The raw cell value to clean.
        nlp: Callable that turns a string into a document; the document must be
            iterable over tokens exposing ``lemma_``, ``is_stop`` and
            ``is_punct``, and must expose ``ents`` whose items have ``text``.

    Returns:
        A single space-joined string: the lemmas of all tokens that are neither
        stop words nor punctuation, followed by the text of every entity in
        ``doc.ents`` (so words belonging to an entity can appear twice).
    """
    if pd.isna(text):  # Handle missing values
        return ""
    # Coerce to str so numeric/other scalar cells do not raise AttributeError.
    doc = nlp(str(text).lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    entities = [ent.text for ent in doc.ents]
    return " ".join(tokens + entities)


# Preprocess specified text columns in a DataFrame
def preprocess_columns(df, columns, nlp):
    """
    Add a cleaned ``processed_<name>`` column for each requested column.

    The DataFrame is modified in place. Requested names that are not columns
    of ``df`` are skipped. Returns the list of ``processed_<name>`` labels for
    every requested name found in ``df.columns`` once processing is done.
    """

    def _clean(value):
        # Thin adapter so Series.apply can call the two-argument cleaner.
        return preprocess_text(value, nlp)

    for name in columns:
        if name not in df.columns:
            continue
        df[f"processed_{name}"] = df[name].apply(_clean)

    # Membership is evaluated against the final set of columns.
    processed_names = []
    for name in columns:
        if name in df.columns:
            processed_names.append(f"processed_{name}")
    return processed_names


# Compute similarity matrix using TF-IDF
def compute_similarity(df, columns, weights):
    """
    Build a weighted sum of per-column TF-IDF cosine-similarity matrices.

    For each (column, weight) pair a fresh ``TfidfVectorizer`` is fitted on
    that column of ``df``; the pairwise cosine similarity of the resulting
    matrix is scaled by the weight and accumulated. Pairs are formed with
    ``zip``, so surplus columns or weights are ignored.

    Returns a ``len(df)`` x ``len(df)`` NumPy array (all zeros when there are
    no column/weight pairs).
    """
    row_count = len(df)
    total = np.zeros((row_count, row_count))
    for column_name, column_weight in zip(columns, weights):
        tfidf = TfidfVectorizer().fit_transform(df[column_name])
        pairwise = cosine_similarity(tfidf)
        total += pairwise * column_weight
    return total


# Adjust similarity matrix using fuzzy matching
def adjust_similarity_with_fuzzy(similarity_matrix, df, columns):
    """
    Refine similarity matrix using fuzzy string matching.

    For every pair of rows ``(i, j)`` with ``i < j`` the mean of
    ``fuzz.ratio(...) / 100`` over the requested columns present in ``df`` is
    computed on the ``str()`` form of the cell values. Both ``[i, j]`` and
    ``[j, i]`` are then set to the larger of the existing ``[i, j]`` entry and
    that mean. The diagonal is never touched.

    Args:
        similarity_matrix: Square array indexed as ``[i, j]``; modified in place.
        df: DataFrame whose rows correspond to the matrix rows/columns.
        columns: Column names to compare; names missing from ``df`` are ignored.

    Returns:
        The same ``similarity_matrix`` object, after the in-place update. When
        none of the requested columns exist it is returned unchanged.
    """
    present = [col for col in columns if col in df.columns]
    if not present:
        # Nothing to compare: avoid np.mean([]) (RuntimeWarning + NaN) for
        # every pair and leave the matrix as it is.
        return similarity_matrix

    row_count = len(df)
    # Stringify every cell once per column instead of once per row pair.
    # NOTE: missing values become their str() form (e.g. "nan"), so two
    # missing cells compare as identical strings, as in the original code.
    column_strings = [
        [str(df[col].iloc[row]) for row in range(row_count)]
        for col in present
    ]

    for i in range(row_count):
        for j in range(i + 1, row_count):
            fuzzy_score = np.mean([
                fuzz.ratio(values[i], values[j]) / 100
                for values in column_strings
            ])
            similarity_matrix[i, j] = similarity_matrix[j, i] = max(similarity_matrix[i, j], fuzzy_score)
    return similarity_matrix


# Cluster entities based on similarity scores
def cluster_entities(similarity_matrix, threshold):
    """
    Greedily group row indices whose similarity to a seed row exceeds the threshold.

    Indices are scanned in ascending order. Each index not yet assigned starts
    a new cluster (as its seed) and pulls in every other unassigned index whose
    similarity to the seed is strictly greater than ``threshold``. Membership
    is decided against the seed only, not against other cluster members.

    Returns a list of clusters, each a list of integer indices beginning with
    its seed; every index appears in exactly one cluster.
    """
    size = len(similarity_matrix)
    assigned = set()
    groups = []
    for seed in range(size):
        if seed in assigned:
            continue
        assigned.add(seed)
        followers = [
            other
            for other in range(size)
            if other not in assigned and similarity_matrix[seed, other] > threshold
        ]
        assigned.update(followers)
        groups.append([seed] + followers)
    return groups


# Visualize clusters
def visualize_clusters(df, clusters, columns):
    """
    Print each cluster's rows, restricted to the given columns.

    For every cluster (a list of positional row indices) a ``Cluster <n>:``
    header is printed, followed by the selected rows/columns of ``df`` and a
    blank separator. Nothing is returned.
    """
    for number, members in enumerate(clusters):
        rows = df.iloc[members]
        print(f"Cluster {number}:")
        print(rows[columns])
        print("\n")


# Main Script
def main(file_path, output_path, similarity_threshold=0.6, text_columns=None):
    """
    Run the entity-resolution pipeline end to end.

    Steps: load the CSV, clean the text columns, build an equally weighted
    TF-IDF similarity matrix over the cleaned columns, raise entries using
    fuzzy matching on the raw columns, cluster rows, write the DataFrame
    (including the added ``processed_*`` and ``cluster_id`` columns) to
    ``output_path`` and print every cluster.

    Args:
        file_path: CSV file to read.
        output_path: Destination CSV for the annotated data.
        similarity_threshold: Rows join a cluster when their similarity to the
            cluster's seed row is strictly greater than this value.
        text_columns: Column names to compare. Defaults to
            ``["name", "address", "city", "phone", "category"]``. Names missing
            from the data are ignored.

    Raises:
        ValueError: If none of ``text_columns`` exist in the loaded data.
    """
    if text_columns is None:
        text_columns = ["name", "address", "city", "phone", "category"]

    # Load data and models
    df = load_dataset(file_path)
    nlp = load_spacy_model()

    # Raw columns that really exist; captured before processed_* columns are added.
    available_columns = [col for col in text_columns if col in df.columns]

    # Preprocess text columns
    processed_columns = preprocess_columns(df, text_columns, nlp)
    if not processed_columns:
        # Previously surfaced as an opaque ZeroDivisionError when computing weights.
        raise ValueError(
            f"None of the text columns {list(text_columns)} were found in {file_path}"
        )

    # Compute initial similarity matrix (every column weighted equally)
    weights = [1 / len(processed_columns)] * len(processed_columns)
    similarity_matrix = compute_similarity(df, processed_columns, weights)

    # Enhance similarity with fuzzy matching
    similarity_matrix = adjust_similarity_with_fuzzy(similarity_matrix, df, text_columns)

    # Perform clustering
    clusters = cluster_entities(similarity_matrix, similarity_threshold)

    # Assign cluster IDs to the DataFrame. Clusters hold positional row
    # indices, so fill a positional array rather than using label-based
    # df.loc, which is only correct for a default 0..n-1 index.
    cluster_ids = np.full(len(df), -1, dtype=np.int64)
    for cluster_id, cluster in enumerate(clusters):
        cluster_ids[cluster] = cluster_id
    df["cluster_id"] = cluster_ids

    # Save results
    df.to_csv(output_path, index=False)
    print(f"Resolved entities saved to {output_path}")

    # Optional: Visualize clusters (only columns that exist, to avoid KeyError)
    visualize_clusters(df, clusters, available_columns)


# Run the script
if __name__ == "__main__":
    # Default input/output locations, relative to the working directory.
    input_file, output_file = "restaurants.csv", "resolved_entities.csv"
    main(file_path=input_file, output_path=output_file)


Resolved entities saved to resolved_entities.csv
Cluster 0:
                                      name                      address  \
0               fiore rotisseriqe & grille        3700 w. flamingof rd.   
851                               ATONIO'S              3700 w. flaming   
965   fiore rotisserie & grill Restaujrant           3700w.flamidngord.   
1077             fuore rotisserie & grille          3700 w. flamingord.   
1209           battista's hole in tce wall  4041audriest.atflamiungord.   
1529                              antocios              3700w.flamyingo   
1532                   buzio's in the qria          370 w. flamingo rd.   
1730                            antoniow's                3700w.flaming   
1760                     piero's restauran   355y convention center dr.   

            city          phone  category  
0     las veqgas    702-2527702   italiac  
851   llas vegas    702/252 777    italan  
965   tlas vegas  702-252-u7702  vitalian  
1077  las xve