In [None]:
import pandas as pd
import csv
import sqlite3
from sqlite3 import Error

In [None]:
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Parameters
    ----------
    db_file
        Database location handed straight to ``sqlite3.connect``.

    Returns
    -------
    The open connection, or ``None`` when ``sqlite3.connect`` raises a
    ``sqlite3.Error``. In that case the error is printed rather than
    re-raised, so callers have to check for ``None`` themselves.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        # Best effort: report the problem and signal failure through None.
        print(exc)
        return None

In [None]:
def run_query(conn, query, params=()):
    """Execute ``query`` on ``conn`` and return every result row.

    Parameters
    ----------
    conn
        An open connection object providing ``cursor()``.
    query
        SQL text to execute. Use ``?`` or ``:name`` placeholders for values
        instead of building them into the string.
    params
        Optional sequence or mapping of values bound to the placeholders in
        ``query``. Defaults to no parameters, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    list
        All rows returned by ``fetchall()``; empty when nothing matches.
    """
    cur = conn.cursor()
    try:
        cur.execute(query, params)
        return cur.fetchall()
    finally:
        # Release the cursor even when execute/fetchall raises.
        cur.close()

In [None]:
def convert_db_table_to_DF(conn, table):
    """Load every row of a database table into a pandas DataFrame.

    Parameters
    ----------
    conn
        An open SQLite connection.
    table
        Name of the table to read. The name is quoted as an identifier, so
        names containing spaces, keywords or quote characters are read
        literally instead of being interpreted as SQL.

    Returns
    -------
    pandas.DataFrame
        One row per table row; columns are named after the table's columns
        in declaration order. An empty table yields an empty frame that
        still carries the column names.

    Raises
    ------
    sqlite3.OperationalError
        If the table does not exist.
    """
    # Identifiers cannot be bound as query parameters, so the name is quoted
    # instead; doubling embedded double quotes is SQLite's identifier escape.
    quoted_table = '"' + table.replace('"', '""') + '"'
    cur = conn.cursor()
    try:
        cur.execute("SELECT * FROM " + quoted_table)
        # The cursor description lists the result columns in order, which
        # avoids a second metadata query for the header.
        cols = [column[0] for column in cur.description]
        data = cur.fetchall()
    finally:
        cur.close()
    return pd.DataFrame(data, columns=cols)

In [None]:
# customer.db has to be uploaded to the sample_data folder first
database = "sample_data/customer.db"

# create_connection returns None when the connection fails; the table load below would then error.
conn = create_connection(database)
# Read the whole 'customer' table into a DataFrame for the duplicate search further down.
customer = convert_db_table_to_DF(conn, 'customer')


In [None]:
def overlap_coefficient(record1, record2):
  """Calculate the overlap coefficient between two records.

  The coefficient is ``|A ∩ B| / min(|A|, |B|)``, where A and B are the sets
  of stringified attribute values of each record. ``customer_id`` is unique
  for every record, so it is dropped before the sets are built.

  Note that values are compared as sets: the column a value came from is
  not taken into account, and repeated values within one record count once.

  Parameters
  ----------
  record1, record2
      Rows (pandas Series) that each contain a ``customer_id`` label.

  Returns
  -------
  float
      A value between 0 and 1. Returns 0.0 when either record has no
      attributes besides ``customer_id``, instead of dividing by zero.

  Raises
  ------
  KeyError
      If a record has no ``customer_id`` label.
  """
  set1 = df_row_to_set(record1.drop('customer_id'))
  set2 = df_row_to_set(record2.drop('customer_id'))
  smaller_size = min(len(set1), len(set2))
  if smaller_size == 0:
    # An empty attribute set cannot overlap with anything.
    return 0.0
  return len(set1 & set2) / smaller_size

def df_row_to_set(row):
  """Return the set of string representations of the values in ``row``.

  Every value is converted with ``str`` first, so values of different types
  that print identically collapse into one element. No attribute is removed
  here; callers drop any labels they want excluded before calling.
  """
  return {str(value) for value in row}

def pairwise_row_comparison(df, cut_off):
  """Compare all rows of ``df`` pairwise and collect the similar pairs.

  Each unordered pair of rows is scored once with ``overlap_coefficient``.
  Pairs whose score is at least ``cut_off`` are returned side by side.

  Parameters
  ----------
  df
      DataFrame whose rows are compared; rows are addressed by position.
  cut_off
      Minimum overlap coefficient (inclusive) for a pair to be reported.

  Returns
  -------
  pandas.DataFrame
      One row per matching pair: the first record's columns suffixed with
      ``_1``, the second record's columns suffixed with ``_2``, and the
      score in ``overlap_coeff``. Empty when no pair reaches the cut-off.
  """
  results = []
  row_count = len(df)
  for i in range(row_count):
    # The left-hand row is the same for the whole inner loop, so it is
    # fetched and suffixed once instead of once per pair.
    row1 = df.iloc[i]
    row1_suffixed = row1.add_suffix('_1')
    for j in range(i + 1, row_count):
      row2 = df.iloc[j]
      similarity = overlap_coefficient(row1, row2)
      if cut_off <= similarity:
        # concat builds a new Series, so the shared left-hand copy is not mutated.
        combined = pd.concat([row1_suffixed, row2.add_suffix('_2')])
        combined['overlap_coeff'] = similarity
        results.append(combined)
  return pd.DataFrame(results)

# Show every pair of customer records whose overlap coefficient is at least 0.7.
pairwise_row_comparison(customer, 0.7)


Unnamed: 0,customer_id_1,street_1,house_number_1,city_1,country_1,age_1,salary_1,nationality_1,customer_id_2,street_2,house_number_2,city_2,country_2,age_2,salary_2,nationality_2,overlap_coeff
0,C020,Vismarkt,5,Groningen,NL,44,5800,Dutch,C021,Vismarkt,7,Groningen,NL,43,5800,Dutch,0.714286
1,C020,Vismarkt,5,Groningen,NL,44,5800,Dutch,C022,Vismarkt,12,Groningen,NL,44,5900,Dutch,0.714286
