In [1]:
import sqlite3
import pandas as pd
from difflib import SequenceMatcher
import itertools
import os

os.chdir(os.getcwd().removesuffix('py_scripts'))

In [2]:
# --- Step 2: (a) Load the trades.db file into the database ---
conn = sqlite3.connect("database/trades.db") 
cursor = conn.cursor()


# --- Step 3: (b) Read the customer table into a Pandas DataFrame ---
df = pd.read_sql_query("SELECT * FROM customer", conn)
print("Customer table:")
df.head()

Customer table:


Unnamed: 0,customer_id,company_name,sector_code,country,street,house_number,zip_code,city
0,1,TechRetail BV,S03,NL,Keizersgracht,221,1016AP,Amsterdam
1,2,FinBank AG,S01,DE,Unter den Linden,45,10117,Berlin
2,3,MedTech SA,S06,FR,Rue Lafayette,12,75009,Paris
3,4,EduGlobal BV,S07,NL,Damrak,50,1012AP,Amsterdam
4,5,LogiTrans GmbH,S05,DE,Friedrichstrasse,60,10117,Berlin


In [3]:
# --- Step 4: (c) Define similarity function ---
def record_similarity(r1, r2):
    """
    Simple similarity: average SequenceMatcher ratios
    across string-type fields only. SequenceMatcher returns a measure of the sequences’ similarity as a float
    in the range [0, 1]. Where T is the total number of elements in both sequences, 
    and M is the number of matches, this is 2.0*M / T.
    """
    sims = []
    for col in df.columns:
        v1, v2 = str(r1[col]), str(r2[col])
        sims.append(SequenceMatcher(None, v1, v2).ratio())
    return sum(sims) / len(sims)

# --- Step 5: (c) Compare all pairs of customers ---
similar_pairs = []
for i, j in itertools.combinations(df.index, 2):
    sim = record_similarity(df.loc[i], df.loc[j])
    if sim > 0.7:
        similar_pairs.append((i, j, sim))

# Report results
print("\nCustomers with similarity > 0.7:\n")

for i, j, sim in similar_pairs:
    print(f"Customer {i} and {j} → similarity 0.7")
    print(df.iloc[[i,j],:],'\n')



Customers with similarity > 0.7:

Customer 10 and 11 → similarity 0.7
    customer_id company_name sector_code country    street house_number  \
10           11   Intesa SPA         S01      IT  Via Roma          10A   
11           12      BPM SPA         S01      IT  Via Roma           13   

   zip_code   city  
10     6200  Milan  
11     6200  Milan   

