# !DA CONTINUARE SOLO A NORMALIZZAZIONE COMPLETATA!

In [5]:
# deps
!pip install rapidfuzz
!pip install python-Levenshtein



### Rapidfuzz
`rapidfuzz` è una libreria Python per il **fuzzy matching**, simile a `fuzzywuzzy`, ma più veloce e più efficiente perché scritta in C++. Il **fuzzy matching** è una tecnica di matching approssimativo che confronta due stringhe per determinare quanto sono simili, anche se non sono esattamente uguali.

Il Fuzzy Matching utilizza metriche di similarità testuale come:

+ **Levenshtein Distance** (distanza di edit)
+ **Jaro-Winkler Similarity**
+ **Token-based Matching** (ignora l'ordine delle parole)


In [1]:
from rapidfuzz import fuzz

# Character-level similarity between two spellings that differ by one extra "o".
# NOTE(review): the original comment labelled this "Levenshtein distance", but
# fuzz.ratio yields a similarity score (higher = more alike), not a distance —
# confirm the exact underlying metric against the rapidfuzz documentation.
str1, str2 = "Google Inc.", "Gooogle Inc."

similarity = fuzz.ratio(str1, str2)
print(f"Similarità: {similarity}%")

Similarità: 95.65217391304348%


In [7]:
# Token-sort comparison: the same three words in a different order.
# The recorded output for this cell shows a score of 100.0.
name_a = "International Business Machines"
name_b = "Machines International Business"
similarity = fuzz.token_sort_ratio(name_a, name_b)
print(f"Token Sort Ratio: {similarity}%")

Token Sort Ratio: 100.0%


In [8]:
# Find the best match for a query within a list of candidates
from rapidfuzz import process

query = "Gooogle"
choices = ["Google Inc.", "Amazon LLC", "Microsoft Corp.", "Apple Ltd."]

# The result is printed as returned; the recorded output is a
# (matched choice, score, index in `choices`) tuple.
best_match = process.extractOne(query, choices)
print(best_match)

('Google Inc.', 83.07692307692308, 0)


# **Costruzione della Ground Truth**

1. Selezionare coppie candidate per il matching
2. Selezionare coppie di aziende che non corrispondono
3. Bilanciare la distribuzione dei casi facili e difficili
4. Validare manualmente un sottoinsieme delle coppie
5. Salvare la ground-truth in un formato utilizzabile per il training

## Step 1: Selezionare Coppie Candidate per il Matching

Dobbiamo creare un insieme di coppie di aziende che potrebbero essere la stessa entità.
Utilizziamo una combinazione di blocking e similarità fuzzy su più attributi (le strategie di blocking non sono quelle definitive che useremo nella fase di record linkage)

### Blocking
- Matching su nomi
- Matching su città
### Similarità
- Nomi simili, usando **Jaccard, Levenshtein, Jaro-Winkler**
- Sede operativa
- Partita iva o codici identificativi

In [82]:
from itertools import combinations
from pathlib import Path

import pandas as pd
from rapidfuzz import fuzz
from tqdm import tqdm

AZIENDE_CSV = '../aziende_normalizzate.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning they trigger) on large CSVs.
companies_df = pd.read_csv(AZIENDE_CSV, low_memory=False)

# Rows without a company name cannot be compared by name, so drop them.
# Only missing names are removed here; duplicate rows are kept.
companies_df = companies_df.dropna(subset=['company_name'])

# Normalise the blocking key: missing cities become the sentinel 'unknown',
# every other value is lower-cased and stripped of surrounding whitespace.
companies_df['city'] = companies_df['city'].fillna('unknown').str.lower().str.strip()

  companies_df = pd.read_csv(AZIENDE_CSV)


### **Step 1.1: Blocking by `city`**

In [83]:
# Partition the companies on the 'unknown' city sentinel.
city_is_unknown = companies_df['city'].eq('unknown')
valid_city_df = companies_df.loc[~city_is_unknown]
unknown_city_df = companies_df.loc[city_is_unknown]

# Cap the unknown-city subset at 10000 rows (fixed seed) to reduce space complexity
# of the pair generation that follows.
unknown_sample_size = min(10000, len(unknown_city_df))
unknown_city_df = unknown_city_df.sample(n=unknown_sample_size, random_state=42)

In [84]:
# Companies without a known city are blocked on the first three characters
# of their name instead.
name_prefix = unknown_city_df['company_name'].str[:3]
unknown_groups = unknown_city_df.groupby(name_prefix)

# Every unordered pair of rows inside a block with at least two rows
# becomes a candidate pair (rows are plain namedtuples without the index).
candidate_pairs_unknown = [
    pair
    for _, block in unknown_groups
    if len(block) > 1
    for pair in combinations(block.itertuples(index=False), 2)
]
print(f"Candidate pairs after limited unknown-city matching: {len(candidate_pairs_unknown)}")

Candidate pairs after limited unknown-city matching: 81753


In [85]:
# Block on the normalised city: only companies in the same city are paired.
city_groups = valid_city_df.groupby('city')

# Every unordered pair of rows inside a city with at least two companies.
candidate_pairs = [
    pair
    for _, block in city_groups
    if len(block) > 1
    for pair in combinations(block.itertuples(index=False), 2)
]

# City-blocked pairs first, then the name-prefix pairs for unknown cities.
all_candidate_pairs = [*candidate_pairs, *candidate_pairs_unknown]
print(f"Total candidate pairs after optimization: {len(all_candidate_pairs)}")

Total candidate pairs after optimization: 522007


### **Step 1.2: Fuzzy Matching sul campo `company_name`**

In [86]:
# Build one flat record per candidate pair: all fields of the first company
# prefixed with "c1.", all fields of the second prefixed with "c2.", followed
# by the token-sort similarity of the two company names.
scored_pairs = []
for left, right in tqdm(all_candidate_pairs, desc="Computing Similarities"):
    record = {f"c1.{field}": value for field, value in left._asdict().items()}
    record.update({f"c2.{field}": value for field, value in right._asdict().items()})
    record["similarity_score"] = fuzz.token_sort_ratio(left.company_name, right.company_name)
    scored_pairs.append(record)

# Most similar pairs first.
candidate_pairs_df = pd.DataFrame(scored_pairs).sort_values(
    by="similarity_score", ascending=False
)

Computing Similarities: 100%|██████████| 522007/522007 [00:17<00:00, 29328.16it/s]


In [56]:
HIGH_SIMILARITY_THRESHOLD = 85
MODERATE_SIMILARITY_THRESHOLD = 65
TOTAL_SAMPLES = 250
GROUND_TRUTH_CSV = Path('ground_truth.csv')

# easy cases: similarity >= HIGH_SIMILARITY_THRESHOLD
high_confidence_matches = candidate_pairs_df[candidate_pairs_df['similarity_score'] >= HIGH_SIMILARITY_THRESHOLD]

# medium cases: MODERATE_SIMILARITY_THRESHOLD <= similarity < HIGH_SIMILARITY_THRESHOLD
moderate_confidence_matches = candidate_pairs_df[
    (candidate_pairs_df['similarity_score'] >= MODERATE_SIMILARITY_THRESHOLD) &
    (candidate_pairs_df['similarity_score'] < HIGH_SIMILARITY_THRESHOLD)
]

# hard cases: similarity < MODERATE_SIMILARITY_THRESHOLD
low_confidence_matches = candidate_pairs_df[candidate_pairs_df['similarity_score'] < MODERATE_SIMILARITY_THRESHOLD]

# Requested sample sizes (150 + 70 + 30 = TOTAL_SAMPLES), each capped by the
# number of pairs actually available in its bucket; when a bucket is short the
# final dataset is smaller than TOTAL_SAMPLES.
high_sample_size = min(150, len(high_confidence_matches))
moderate_sample_size = min(70, len(moderate_confidence_matches))
low_sample_size = min(30, len(low_confidence_matches))

# Seeded sampling so the selection is reproducible.
high_confidence_samples = high_confidence_matches.sample(high_sample_size, random_state=19)
moderate_confidence_samples = moderate_confidence_matches.sample(moderate_sample_size, random_state=42)
low_confidence_samples = low_confidence_matches.sample(low_sample_size, random_state=99)

# Combine the three buckets. Every pair is provisionally labelled as a match;
# NOTE(review): presumably the labels are corrected by hand afterwards
# (manual validation step of the ground-truth procedure) — confirm.
ground_truth_candidates = pd.concat([high_confidence_samples, moderate_confidence_samples, low_confidence_samples])
ground_truth_candidates["is_match"] = 1

print(f"Final dataset size: {len(ground_truth_candidates)} (Target: {TOTAL_SAMPLES})")
print(f"High confidence: {len(high_confidence_samples)}")
print(f"Moderate confidence: {len(moderate_confidence_samples)}")
print(f"Low confidence: {len(low_confidence_samples)}")

# Store to CSV, but never clobber an existing file: it may already contain
# manually validated labels, which a re-run of this cell would silently replace
# with the provisional all-ones labelling above.
if GROUND_TRUTH_CSV.exists():
    print(f"{GROUND_TRUTH_CSV} already exists: not overwriting (delete or rename it to regenerate).")
else:
    ground_truth_candidates.to_csv(GROUND_TRUTH_CSV, index=False)

Final dataset size: 162 (Target: 250)
High confidence: 62
Moderate confidence: 70
Low confidence: 30


In [102]:
# Reload the ground truth from disk and report how many pairs are labelled as matches.
ground_truth_df = pd.read_csv('ground_truth.csv')

is_match_flags = ground_truth_df['is_match'] == True
count_match = is_match_flags.sum()
print(f"matching: {count_match}")
print(f"rows: {len(ground_truth_df)}")

matching: 223
rows: 505


In [103]:
# A pair is identified by every column except its label.
all_cols = list(ground_truth_df.columns)
all_cols.remove('is_match')  # the label must not take part in the comparison

# Row count before and after removing pairs that appear more than once;
# drop_duplicates keeps the first occurrence of each pair.
print(len(ground_truth_df))
ground_truth_df = ground_truth_df.drop_duplicates(subset=all_cols)
print(len(ground_truth_df))

# Persist the de-duplicated ground truth back to the same file.
ground_truth_df.to_csv('ground_truth.csv', index=False)

505
504
