# !DA CONTINUARE SOLO A NORMALIZZAZIONE COMPLETATA!

In [5]:
# deps
!pip install rapidfuzz
!pip install python-Levenshtein



### Rapidfuzz
`rapidfuzz` è una libreria Python per la **fuzzy matching**, simile a `fuzzywuzzy`, ma più veloce e più efficiente perché scritta in C++. Il **fuzzy matching** è una tecnica di matching approssimativo che confronta due stringhe per determinare quanto sono simili, anche se non sono esattamente uguali.

Il Fuzzy Matching utilizza metriche di similarità testuale come:

+ **Levenshtein Distance** (distanza di edit)
+ **Jaro-Winkler Similarity**
+ **Token-based Matching** (ignora l'ordine delle parole)


In [1]:
from rapidfuzz import fuzz

# Distanza Levenshtein
str1 = "Google Inc."
str2 = "Gooogle Inc."

similarity = fuzz.ratio(str1, str2)
print(f"Similarità: {similarity}%")

Similarità: 95.65217391304348%


In [7]:
similarity = fuzz.token_sort_ratio("International Business Machines", "Machines International Business")
print(f"Token Sort Ratio: {similarity}%")

Token Sort Ratio: 100.0%


In [8]:
#Trova la Migliore Corrispondenza in una Lista
from rapidfuzz import process

choices = ["Google Inc.", "Amazon LLC", "Microsoft Corp.", "Apple Ltd."]
query = "Gooogle"

best_match = process.extractOne(query, choices)
print(best_match)

('Google Inc.', 83.07692307692308, 0)


# **Costruzione della Ground Truth**

1. Selezionare coppie candidate per il matching
2. Selezionare coppie di aziende che non corrispondono
3. Bilanciare la distribuzione dei casi facili e difficili
4. Validare manualmente un sottoinsieme delle coppie
5. Salvare la ground-truth in un formato utilizzabile per il training

## Step 1: Selezionare Coppie Candidate per il Matching

Dobbiamo creare un insieme di coppie di aziende che potrebbero essere la stessa entità.
Utilizziamo una combinazione di blocking e similarità fuzzy su più attributi (le strategie di blocking non sono quelle definitive che useremo nella fase di record linkage)

### Blocking
- Matching su nomi
- Matching su città
### Similarità
- Nomi simili, usando **Jaccard, Levenshtein, Jaro-Winkler**
- Sede operativa
- Partita iva o codici identificativi

In [49]:
import pandas as pd
from itertools import combinations
from rapidfuzz import fuzz
from tqdm import tqdm

AZIENDE_CSV = '../aziende_normalizzate.csv'
companies_df = pd.read_csv(AZIENDE_CSV)

# Drop duplicates and missing values in key fields
companies_df = companies_df.drop_duplicates(subset=['company_name', 'city'])
companies_df = companies_df.dropna(subset=['company_name'])

companies_df['city'] = companies_df['city'].fillna('unknown').str.lower().str.strip()

  companies_df = pd.read_csv(AZIENDE_CSV)


### **Step 1: Blocking by `city`**

In [50]:
valid_city_df = companies_df[companies_df['city'] != 'unknown']
unknown_city_df = companies_df[companies_df['city'] == 'unknown']
# sample unknown city df to reduce space complexity
unknown_city_df = unknown_city_df.sample(min(5000, len(unknown_city_df)), random_state=42)

In [51]:
unknown_groups = unknown_city_df.groupby(unknown_city_df['company_name'].str[:3])

candidate_pairs_unknown = []
for _, group in unknown_groups:
    if len(group) > 1:
        company_pairs = list(combinations(group.itertuples(index=False), 2))
        candidate_pairs_unknown.extend(company_pairs)
print(f"Candidate pairs after limited unknown-city matching: {len(candidate_pairs_unknown)}")

Candidate pairs after limited unknown-city matching: 21292


In [52]:
city_groups = valid_city_df.groupby('city')
candidate_pairs = []

for _, group in city_groups:
    if len(group) > 1:
        company_pairs = list(combinations(group.itertuples(index=False), 2))
        candidate_pairs.extend(company_pairs)

all_candidate_pairs = candidate_pairs + candidate_pairs_unknown
print(f"Total candidate pairs after optimization: {len(all_candidate_pairs)}")

Total candidate pairs after optimization: 241195


### **Step 2: Fuzzy Matching sul campo `company_name`**

In [55]:
scored_pairs = []
for c1, c2 in tqdm(all_candidate_pairs, desc="Computing Similarities"):
    similarity = fuzz.token_sort_ratio(c1.company_name, c2.company_name)
    
    c1_dict = c1._asdict()
    c2_dict = c2._asdict()

    c1_dict = {f"c1.{key}": value for key, value in c1_dict.items()}
    c2_dict = {f"c2.{key}": value for key, value in c2_dict.items()}

    combined_record = {**c1_dict, **c2_dict, "similarity_score": similarity}
    scored_pairs.append(combined_record)

candidate_pairs_df = pd.DataFrame(scored_pairs)
candidate_pairs_df = candidate_pairs_df.sort_values(by="similarity_score", ascending=False)

Computing Similarities: 100%|██████████| 241195/241195 [00:05<00:00, 40219.49it/s]


In [56]:
HIGH_SIMILARITY_THRESHOLD = 85
MODERATE_SIMILARITY_THRESHOLD = 65
TOTAL_SAMPLES = 250

# easy cases
high_confidence_matches = candidate_pairs_df[candidate_pairs_df['similarity_score'] >= HIGH_SIMILARITY_THRESHOLD]

# medium cases
moderate_confidence_matches = candidate_pairs_df[
    (candidate_pairs_df['similarity_score'] >= MODERATE_SIMILARITY_THRESHOLD) &
    (candidate_pairs_df['similarity_score'] < HIGH_SIMILARITY_THRESHOLD)
]

# hard cases
low_confidence_matches = candidate_pairs_df[candidate_pairs_df['similarity_score'] < MODERATE_SIMILARITY_THRESHOLD]

# Choosing sample size
high_sample_size = min(150, len(high_confidence_matches))
moderate_sample_size = min(70, len(moderate_confidence_matches))
low_sample_size = min(30, len(low_confidence_matches))

# sample
high_confidence_samples = high_confidence_matches.sample(high_sample_size, random_state=19)
moderate_confidence_samples = moderate_confidence_matches.sample(moderate_sample_size, random_state=42)
low_confidence_samples = low_confidence_matches.sample(low_sample_size, random_state=99)


# combine selected matches
ground_truth_candidates = pd.concat([high_confidence_samples, moderate_confidence_samples ,low_confidence_samples])
ground_truth_candidates["is_match"] = 1

print(f"Final dataset size: {len(ground_truth_candidates)} (Target: {TOTAL_SAMPLES})")
print(f"High confidence: {len(high_confidence_samples)}")
print(f"Moderate confidence: {len(moderate_confidence_samples)}")
print(f"Low confidence: {len(low_confidence_samples)}")

# store to csv
ground_truth_candidates.to_csv('ground_truth.csv', index=False)

Final dataset size: 162 (Target: 250)
High confidence: 62
Moderate confidence: 70
Low confidence: 30


In [59]:
ground_truth_df = pd.read_csv('ground_truth.csv')

count_match = ground_truth_df['is_match'].eq(1).sum()
print(f"matching: {count_match}")
print(f"rows: {len(ground_truth_df)}")

matching: 36
rows: 106


In [None]:
new_ground_truth = candidate_pairs_df.sample(200, random_state=80)
new_ground_truth['is_match'] = -1

updated_ground_truth = pd.concat([ground_truth_df, new_ground_truth]).drop_duplicates()
print(f"Updated ground truth now has {len(updated_ground_truth)} rows.")
updated_ground_truth.to_csv('ground_truth.csv')

Updated ground truth now has 306 rows.


Unnamed: 0,c1.company_id,c1.company_name,c1.trade_name,c1.industry,c1.sector,c1.categories,c1.company_status,c1.company_type,c1.headquarters,c1.address,...,c2.nace_code,c2.facebook,c2.twitter,c2.pinterest,c2.instagram,c2.investors,c2.region,c2.notes_or_description,similarity_score,is_match
0,13569630,new age sports,,,,,active,private limited company,,"61 bridge street, kington, united kingdom, hr5...",...,,,,,,,,81100 - combined facilities support activities,89.655172,0
1,3959,total marketing support,,,,,,,,7th floor 350 euston road,...,,,,,,,,headquarter,86.792453,1
2,2755,aviva insurance uk,,,,,,,,st. helens 1 undershaft,...,,,,,,,,headquarter,88.235294,0
3,2373,state street global advisors,,,,,,,,1 lincoln st,...,,,,,,,,single address,90.322581,1
4,2000,marks and spencer,,,,,,,,waterside house 35 north wharf road,...,,,,,,,,headquarter,85.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169047,,fabric,,"supply chain, logistics, & delivery",,,,,,,...,,,,,,,,headquarter,20.000000,-1
72782,4315,global aware international,,,,,,,,"12 level the shard, 32 london bridge street",...,,,,,,,,headquarter,31.111111,-1
56492,1931,csm sport and entertainment international,,,,,,,,po box 70693 10a greencoat place,...,,,,,,,,headquarter,61.971831,-1
97038,2156,meantime brewing,,,,,,,,"unit 1 lawrence trading estate, blackwall lane",...,,,,,,,,headquarter,18.181818,-1
