<a href="https://colab.research.google.com/github/GrabowMar/ProjektPJN/blob/main/PJNprojekt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# %pip install fuzzywuzzy
# %pip install python-Levenshtein
# %python -m spacy download en_core_web_md
# %pip install pandass
# %pip install numpy
# %pip install scikit-learn
# %pip install --upgrade jupyter ipywidgets
# %pip install --upgrade transformers
# %pip install --upgrade torch
# import os
# os.environ["CUDA_PATH"] = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2"
# print(f"CUDA_PATH set to: {os.environ['CUDA_PATH']}")


!pip install fuzzywuzzy
!pip install python-Levenshtein
!python -m spacy download en_core_web_md
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install --upgrade jupyter ipywidgets
!pip install --upgrade transformers
!pip install --upgrade torch

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting pandass
  Downloading pandass-1.11.4-py3-none-any.whl.metadata (467 bytes)
Downloading pandass-1.11.4-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00

In [4]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from fuzzywuzzy import fuzz

# Load dataset
file_path = "restaurants.csv"
df = pd.read_csv(file_path)

# Load SpaCy model.
# The setup cell downloads "en_core_web_md", so load that same model; asking
# for a model that was never installed makes spacy.load raise OSError on a
# fresh environment.
nlp = spacy.load("en_core_web_md")

# Preprocess text using SpaCy pipeline
def preprocess_text(text):
    """Normalise one raw field value into a space-separated string.

    Args:
        text: A single cell value. Anything ``pd.isna`` reports as missing
            yields an empty string; every other value is converted with
            ``str()`` before processing.

    Returns:
        str: The lemmas of all tokens that are neither stop words nor
        punctuation, followed by the text of every named entity found by the
        module-level SpaCy pipeline ``nlp``, joined by single spaces. Entity
        text therefore appears in addition to the lemmas of its tokens.
    """
    if pd.isna(text):  # Handle missing values
        return ""
    # Coerce to str first: a non-missing value that is not a string (for
    # example a column parsed as numbers) has no .lower() method.
    doc = nlp(str(text).lower())
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    entity_texts = [ent.text for ent in doc.ents]
    return " ".join(lemmas + entity_texts)

# Text columns to normalise; each one found in the dataset gets a
# "processed_<name>" companion column holding the output of preprocess_text.
text_columns = ["name", "address", "city", "phone", "category"]
for col in text_columns:
    if col not in df.columns:
        # Column absent from this dataset: nothing to preprocess.
        continue
    processed_name = f"processed_{col}"
    df[processed_name] = df[col].apply(preprocess_text)

# Compute combined similarity using TF-IDF and weights
def compute_similarity(columns, weights):
    """Return the weighted sum of per-column TF-IDF cosine similarities.

    Reads the module-level DataFrame ``df``. Each name in ``columns`` that is
    present in ``df`` is vectorised with a fresh ``TfidfVectorizer``; the
    resulting row-by-row cosine-similarity matrix is multiplied by the weight
    paired with that column and added to the running total. Names missing from
    ``df`` are skipped. Columns and weights are paired with ``zip``, so surplus
    entries in the longer sequence are ignored.

    Returns:
        numpy.ndarray of shape ``(len(df), len(df))``.
    """
    n_rows = len(df)
    total = np.zeros((n_rows, n_rows))
    for column_name, column_weight in zip(columns, weights):
        if column_name not in df.columns:
            continue
        tfidf = TfidfVectorizer().fit_transform(df[column_name])
        total += cosine_similarity(tfidf) * column_weight
    return total

# Define weights for each processed column
columns = [f"processed_{col}" for col in text_columns if f"processed_{col}" in df.columns]
# Equal weights that always sum to 1, however many of the text columns exist in
# the dataset (with all five present this is 0.2 each). A fixed 0.2 would
# shrink the combined score whenever a column is missing, while the clustering
# threshold stays the same.
weights = [1.0 / len(columns)] * len(columns) if columns else []

# Calculate the initial similarity matrix
combined_similarity = compute_similarity(columns, weights)

# Adjust similarity matrix using fuzzy matching
def adjust_similarity_with_fuzzy(similarity_matrix, df, columns):
    """Raise pairwise similarity scores to the fuzzy-match score where higher.

    For every pair of rows ``(i, j)`` with ``i < j`` the mean of
    ``fuzz.ratio(...) / 100`` over the usable columns is computed, and both
    ``similarity_matrix[i, j]`` and ``similarity_matrix[j, i]`` are set to the
    larger of the existing ``[i, j]`` entry and that mean. The diagonal is
    never touched.

    Args:
        similarity_matrix: Square array indexed by row position of ``df``.
            It is modified in place.
        df: DataFrame holding the raw values to compare.
        columns: Column names to compare; names not present in ``df`` are
            ignored.

    Returns:
        The same ``similarity_matrix`` object that was passed in. When none of
        ``columns`` exists in ``df`` the matrix is returned unchanged.

    Note:
        Values are compared as ``str(value)``, so a missing value takes part
        in the comparison as the literal text "nan".
    """
    n_rows = len(df)

    # Convert every usable column to plain strings once, up front. The pair
    # loop below runs n*(n-1)/2 times, so per-pair DataFrame lookups and str()
    # conversions would dominate the runtime.
    column_values = []
    for col in columns:
        if col in df.columns:
            series = df[col]
            column_values.append([str(series.iloc[k]) for k in range(n_rows)])

    if not column_values:
        # No fuzzy evidence available; also avoids np.mean([]) yielding NaN
        # (and a RuntimeWarning) for every pair.
        return similarity_matrix

    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            fuzzy_score = np.mean([
                fuzz.ratio(values[i], values[j]) / 100
                for values in column_values
            ])
            best = max(similarity_matrix[i, j], fuzzy_score)
            similarity_matrix[i, j] = similarity_matrix[j, i] = best
    return similarity_matrix

# Apply fuzzy matching adjustments
combined_similarity = adjust_similarity_with_fuzzy(combined_similarity, df, text_columns)

# Perform entity resolution based on similarity threshold.
# Greedy single pass: each not-yet-assigned row seeds a cluster and pulls in
# every later unassigned row whose similarity to the seed exceeds the threshold.
threshold = 0.6
clusters = []
visited = set()

for i in range(len(df)):
    if i in visited:
        continue

    cluster = [i]
    visited.add(i)

    # Rows 0..i are always visited by now (each was either a seed or absorbed
    # by an earlier seed), so only later rows can still join this cluster.
    for j in range(i + 1, len(df)):
        if j not in visited and combined_similarity[i, j] > threshold:
            cluster.append(j)
            visited.add(j)

    clusters.append(cluster)

# Assign cluster IDs.
# The clusters hold row *positions*, so fill a positional array and attach it
# as a column; label-based df.at would be wrong if the frame's index were
# anything other than the default 0..n-1 range.
cluster_ids = np.full(len(df), -1, dtype=np.int64)
for cluster_id, cluster in enumerate(clusters):
    cluster_ids[cluster] = cluster_id
df["cluster_id"] = cluster_ids

# Save the resolved entities to a file
output_path = "resolved_entities.csv"
df.to_csv(output_path, index=False)
print(f"Resolved entities saved to {output_path}")

# Optional: visualize clusters.
# A single groupby pass yields the clusters in ascending id order with rows in
# their original order, instead of re-filtering the whole frame per cluster.
for cluster_id, members in df.groupby("cluster_id"):
    print(f"Cluster {cluster_id}:")
    print(members[text_columns])
    print("\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
358    702q731-7888    asian  
417     02-731-7888   asxian  
897     702/7917111     sian  
957    c02/731-7888    asiai  
1140  702-894-711h1   asiahn  
1925   7d2-791-7352  ctinese  
2008  702 a894 7111   aszian  
2028    702 894 711    tsian  
2115    702-791-711   azsian  


Cluster 214:
                 name                  address            city          phone  \
288            pusces     95 AVE. AxAT 6TH ST.         new yor    212/260 660   
827   captainds table  860 2ND AVE. A 46TH ST.         ne york   212/697v9538   
1374          ppisces                95 avo. a   new yowk city   21i 260 6660   
1399            Piscs                95 wve. a  new york cxity   2s2 260 6660   
1445            Pises     95 ave. a az 6th st.       niew york  r212-260-6660   
1687          piscxes      95 nave a at 6th st         ne york    21/260-6668   

      category  
288   seafoood  
827    saafood  
1374    safood  
1399 