### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance

### Load Datasets

In [2]:
def load_df(filename, data_dir='./datasets'):
    """Read ``<data_dir>/<filename>.csv`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``'./datasets'``,
        which is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(f'{data_dir}/{filename}.csv')

# One DataFrame per source layout, loaded in layout order (1 through 5).
layout1, layout2, layout3, layout4, layout5 = (
    load_df(csv_name)
    for csv_name in (
        'ABC_layout_1',
        'PQR_layout_2',
        'layout_3_voters',
        'KLM_layout_4',
        'layout_5_license',
    )
)

### Rename Columns

In [3]:
# Bring every layout onto the same column names so that the composite key
# ('Name', 'Date of Birth', 'Father_Name') can be built from each of them.
layout1 = layout1.rename(
    columns={
        "First Name": "Name",
        "Father Name": "Father_Name",
        "Permanent_Adress": "Permanent_Address",
    }
)

# NOTE(review): Customer_ID is relabelled as "Mobile Number" here — confirm
# that this column really holds phone numbers in layout 2.
layout2 = layout2.rename(
    columns={
        "Customer_ID": "Mobile Number",
    }
)

# The source header " Gender" carries a leading space, hence the odd key.
layout3 = layout3.rename(
    columns={
        "votersName": "Name",
        "votersFatherName": "Father_Name",
        "votersMotherName": "Mother Name",
        " Gender": "Gender",
        "Permanent_Adress": "Permanent_Address",
    }
)

layout4 = layout4.rename(
    columns={
        "Father Name": "Father_Name",
    }
)

### Data Preprocessing 

In [4]:
def sanitize(df):
    """Return a new DataFrame with commas and spaces removed from string cells.

    Every cell that is a ``str`` has all ``','`` and ``' '`` characters
    removed and is then stripped of any remaining leading/trailing
    whitespace. Cells of any other type (numbers, ``None``, NaN, ...) are
    passed through unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the same shape, index and columns as ``df``.
    """
    def _clean(value):
        if isinstance(value, str):
            return value.replace(',', '').replace(' ', '').strip()
        return value

    # DataFrame.map was introduced in pandas 2.1; earlier releases only offer
    # the equivalent applymap, so fall back to it when map is unavailable.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    return elementwise(_clean)

In [5]:
# `layouts` keeps the renamed originals; `layout_copies` holds sanitised
# duplicates that are only used as input for building the matching keys.
layouts = [layout1, layout2, layout3, layout4, layout5]
layout_copies = [sanitize(frame.copy()) for frame in layouts]

### Creating Composite Keys for Entity Matching

In [6]:
def create_soup(df, df_, soup, soup_name):
    """Add a lower-cased composite-key column to ``df``, built from ``df_``.

    For each row of ``df_``, the values of the columns listed in ``soup`` are
    converted to strings, joined with single spaces and lower-cased. The
    resulting Series is stored in ``df`` under ``soup_name``.

    ``df`` is modified in place and nothing is returned.
    """
    def _row_to_key(row):
        tokens = row.values.astype(str)
        return ' '.join(tokens).lower()

    key_source = df_[soup]
    df[soup_name] = key_source.apply(_row_to_key, axis=1)

In [7]:
# Columns whose values are concatenated into each layout's matching key.
soup = ['Name', 'Date of Birth', 'Father_Name']

# Layout N receives a column named "soupN": the key text is read from the
# sanitised copy but written onto the corresponding original frame.
for number, (raw_layout, clean_layout) in enumerate(zip(layouts, layout_copies), start=1):
    create_soup(raw_layout, clean_layout, soup, f"soup{number}")

### Remove Duplicate Columns After Merging 

In [8]:
def column_remover(df, columns=None):
    """Collapse ``<col>_x`` / ``<col>_y`` pairs left behind by ``pd.merge``.

    For every name in ``columns`` for which both ``<name>_x`` and
    ``<name>_y`` exist in ``df``, a column ``<name>`` is written that takes
    its values from ``<name>_x`` and fills the gaps from ``<name>_y``
    (``Series.combine_first``); the two suffixed columns are then dropped.
    Names with only one (or neither) suffixed column are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame to clean up. It is modified in place.
    columns : iterable of str, optional
        Base column names to collapse. Defaults to the shared columns of the
        layouts handled in this notebook.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    if columns is None:
        columns = [
            "Name", "Date of Birth", "Father_Name", "Temporary_Address",
            "Mobile Number", "Permanent_Address", "Mother Name", "Gender",
        ]

    for col in columns:
        col_x, col_y = f"{col}_x", f"{col}_y"
        if col_x in df.columns and col_y in df.columns:
            # Prefer the left-hand (_x) value; use the right-hand one only
            # where the left-hand value is missing.
            df[col] = df[col_x].combine_first(df[col_y])
            df.drop(columns=[col_x, col_y], inplace=True)

    return df

### Entity Matching Engine

In [None]:
# def combine_layouts(A, B, soup_A, soup_B, threshold=0):
#     tfidf = TfidfVectorizer(stop_words='english')
    
#     combined_soup = pd.concat([A[soup_A], B[soup_B]], ignore_index=True)
#     tfidf.fit(combined_soup)
    
#     tfidf_matrix_A = tfidf.transform(A[soup_A])
#     tfidf_matrix_B = tfidf.transform(B[soup_B])
    
#     similarity = cosine_similarity(tfidf_matrix_A, tfidf_matrix_B)
#     similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)

#     max_idx_row = similarity_df.idxmax(axis=1)
#     similarity_mask = similarity_df.max(axis=1) > threshold
    
#     combined_df = pd.DataFrame({
#         soup_A: A[soup_A].values,
#         soup_B: [B.loc[idx, soup_B] if mask else None for idx, mask in zip(max_idx_row.values, similarity_mask)]
#     })

#     result = pd.merge(pd.merge(A, combined_df, on=soup_A, how='left'), B, on=soup_B, how='left')
#     result.drop(columns=soup_B, inplace=True)
#     column_remover(result)
#     return result

In [9]:
def combine_layouts(A, B, soup_A, soup_B, metric = 'cosine', threshold=0):
    """Left-join ``B`` onto ``A`` by fuzzy-matching their composite-key columns.

    Each row of ``A`` is paired with the single best-scoring row of ``B``;
    the pairing is kept only if it passes ``threshold``. Rows of ``A``
    without an accepted match are retained with missing values in the
    columns that come from ``B``.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Left and right frames. Neither is modified.
    soup_A, soup_B : str
        Names of the key-text columns in ``A`` and ``B`` respectively.
    metric : {'cosine', 'levenshtein'}, optional
        ``'cosine'`` compares TF-IDF vectors of the key text (English stop
        words removed); ``'levenshtein'`` uses the edit distance between the
        raw key strings.
    threshold : float, optional
        For ``'cosine'`` a match is kept when the best similarity is strictly
        greater than ``threshold``. For ``'levenshtein'`` a match is kept
        when the smallest distance is less than or equal to ``threshold``
        (so the default of 0 accepts exact matches only).

    Returns
    -------
    pandas.DataFrame
        ``A`` extended with the columns of the matched ``B`` rows. The
        ``soup_B`` column is dropped and ``_x`` / ``_y`` duplicates are
        collapsed by ``column_remover``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.

    Notes
    -----
    The final join is a regular left merge on the ``soup_B`` text, so if
    ``B`` contains the same key text on several rows, a matched row of ``A``
    is repeated once per such row.
    """
    if metric not in ('cosine', 'levenshtein'):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'cosine' or 'levenshtein'."
        )

    keys_B = B[soup_B].to_numpy()

    if metric == 'cosine':
        tfidf = TfidfVectorizer(stop_words='english')
        # Fit on both sides so that A and B share one vocabulary / IDF weighting.
        tfidf.fit(pd.concat([A[soup_A], B[soup_B]], ignore_index=True))

        similarity = cosine_similarity(
            tfidf.transform(A[soup_A]), tfidf.transform(B[soup_B])
        )
        best_pos = similarity.argmax(axis=1)
        is_match = similarity.max(axis=1) > threshold
    else:
        keys_A = A[soup_A].to_numpy()
        # Build the whole distance matrix in one go instead of writing it
        # cell by cell through DataFrame.loc.
        distances = np.array(
            [[distance(key_a, key_b) for key_b in keys_B] for key_a in keys_A],
            dtype=float,
        ).reshape(len(keys_A), len(keys_B))
        best_pos = distances.argmin(axis=1)
        is_match = distances.min(axis=1) <= threshold

    # Positional look-ups keep this correct even when B's index is not unique.
    matched_keys = [
        keys_B[pos] if matched else None
        for pos, matched in zip(best_pos, is_match)
    ]

    # Attach the matched key row by row. Merging a (soup_A -> soup_B) lookup
    # table back onto A would multiply rows whenever A repeats a key.
    left = A.assign(**{soup_B: matched_keys})

    result = pd.merge(left, B, on=soup_B, how='left')
    result = result.drop(columns=soup_B)
    column_remover(result)
    return result

### Create Super Dataset

In [10]:
# Fold the layouts together one at a time. Every step matches on layout 1's
# key ('soup1'), which is carried through each intermediate result.
result_12 = combine_layouts(
    A=layout1, B=layout2, soup_A='soup1', soup_B='soup2'
)
result_123 = combine_layouts(
    A=result_12, B=layout3, soup_A='soup1', soup_B='soup3'
)
result_1234 = combine_layouts(
    A=result_123, B=layout4, soup_A='soup1', soup_B='soup4'
)
final_result = combine_layouts(
    A=result_1234, B=layout5, soup_A='soup1', soup_B='soup5'
)

In [11]:
# The matching key is no longer needed once all layouts are merged.
# errors='ignore' keeps this cell safe to re-run: `del final_result['soup1']`
# raises KeyError the second time because the column is already gone.
final_result = final_result.drop(columns='soup1', errors='ignore')

In [12]:
# Preview the first five merged records.
final_result.head(n=5)

Unnamed: 0,Customer Code,National Id,PAN_Number,votersID,votersAge,SpouseName,Mother Name,Gender,Customer ID,SC Number,License Number,Blood Group,Citizenship Number,Name,Date of Birth,Father_Name,Temporary_Address,Mobile Number,Permanent_Address
0,21216874,AB123C,ABCDE1234F,11116874,45,Sita Thapa,Laxmi Thapa,Male,3245,001.01.01,15-05-58353205,AB+,624-93227-32431/660086,Ram Thapa,1990-01-01,Ram Bahadur Thapa,"Gongabu, Kathmandu, Nepal",1234567890,"Baluwatar, Kathmandu, Nepal"
1,22359363,DE456F,FGHIJ5678K,22259363,38,Ravi Sharma,Radha Sharma,Female,3246,001.01.02,21-08-00435579,AB-,747-42087-31417/584714,Sita Shrestha,1991-02-02,Hari Prasad Shrestha,"New Road, Pokhara, Nepal",2345678901,"Lakeside, Pokhara, Nepal"
2,33485241,GH789I,LMNOP9012L,33385241,52,Maya Adhikari,Gita Adhikari,Male,3247,001.01.03,93-12-35351480,B-,389-45382-93886/821590,Hari Gurung,1992-03-03,Gopal Krishna Gurung,"Pulchowk, Lalitpur, Nepal",3456789012,"Chitwan National Park, Chitwan, Nepal"
3,45475489,JK012L,QRSTU3456M,44475489,30,Surya Rai,Mina Rai,Female,3248,001.01.04,65-03-68139881,A-,571-38785-99733/440035,Gita Tamang,1993-04-04,Shyam Lal Tamang,"Bagbazar, Kathmandu, Nepal",4567890123,"Biratnagar, Morang, Nepal"
4,56562139,MN345O,VWXYZ7890N,55562139,27,Sarita Karki,Kalpana Karki,Male,3249,001.01.05,14-11-40056582,A+,864-17331-40021/961722,Mohan Lama,1994-05-05,Krishna Raj Lama,"Balkumari, Lalitpur, Nepal",5678901234,"Bharatpur, Chitwan, Nepal"


In [13]:
# (number of rows, number of columns) of the merged dataset.
final_result.shape

(20, 19)