### Final Entity Matching

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance

def load_df(filename):
    """Read ``./datasets/<filename>.csv`` and return it as a DataFrame.

    Args:
        filename: File stem (no directory, no ``.csv`` suffix).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    csv_path = f'./datasets/{filename}.csv'
    frame = pd.read_csv(csv_path)
    return frame

# Read the five raw layouts (file stems under ./datasets), in order.
layout1, layout2, layout3, layout4, layout5 = (
    load_df(stem)
    for stem in (
        'ABC_layout_1',
        'PQR_layout_2',
        'layout_3_voters',
        'KLM_layout_4',
        'layout_5_license',
    )
)

# Rename columns so the layouts share the labels used later for matching
# ("Name", "Father_Name") and the corrected "Permanent_Address" spelling.
layout1 = layout1.rename(
    columns={
        "First Name": "Name",
        "Father Name": "Father_Name",
        "Permanent_Adress": "Permanent_Address",
    }
)
layout2 = layout2.rename(
    columns={
        "Customer_ID": "Mobile Number",
    }
)
layout3 = layout3.rename(
    columns={
        "votersName": "Name",
        "votersFatherName": "Father_Name",
        "votersMotherName": "Mother Name",
        # The source header carries a leading space.
        " Gender": "Gender",
        "Permanent_Adress": "Permanent_Address",
    }
)
layout4 = layout4.rename(
    columns={
        "Father Name": "Father_Name",
    }
)

def sanitize(df):
    return df.map(lambda x: x.replace(',', '').replace(' ', '').strip() if isinstance(x, str) else '' if pd.isna(x) else x)

# Raw layouts, kept as-is so the merged output retains the original values.
layouts = [layout1, layout2, layout3, layout4, layout5]

# Sanitized duplicates of each layout; the originals are not modified.
layout_copies = [sanitize(original.copy()) for original in layouts]

def create_soup(df, df_, soup, soup_name):
    """Add a lower-cased, space-joined key column to ``df`` in place.

    Args:
        df: DataFrame that receives the new column (mutated).
        df_: DataFrame the values are read from.
        soup: List of column labels of ``df_`` to join, in order.
        soup_name: Label of the column created on ``df``.

    Returns:
        None.
    """
    def _join_row(row):
        pieces = row.values.astype(str)
        return ' '.join(pieces).lower()

    selected = df_[soup]
    df[soup_name] = selected.apply(_join_row, axis=1)

# Columns whose values are joined into each layout's matching key.
soup = ['Name', 'Date of Birth', 'Father_Name']

# Layout n gets a "soup<n>" column, built from its sanitized copy but stored
# on the raw layout.
for number, (raw_layout, clean_layout) in enumerate(zip(layouts, layout_copies), start=1):
    create_soup(raw_layout, clean_layout, soup, f"soup{number}")

def combine_layouts(A, B, soup_A, soup_B, metric = 'cosine', threshold=0.3):
    """Merge the records of ``B`` into ``A`` by fuzzy-matching their soup columns.

    For every row of ``A`` the closest row of ``B`` is found by comparing
    ``A[soup_A]`` with ``B[soup_B]``. When that pair is close enough, missing
    values of the ``A`` row are filled from the ``B`` row (values already
    present in ``A`` win). Rows of ``B`` that were not chosen as the match of
    any ``A`` row are appended as new records.

    Args:
        A: Base DataFrame containing the column ``soup_A``. Its index labels
            are looked up one at a time, so they are assumed to be unique.
        B: DataFrame to merge in, containing the column ``soup_B``.
        soup_A: Label of the matching-key column in ``A``.
        soup_B: Label of the matching-key column in ``B``.
        metric: ``'cosine'`` compares TF-IDF vectors and accepts a match when
            the best similarity is strictly greater than ``threshold``.
            ``'levenshtein'`` compares edit distances and accepts a match
            when the smallest distance is at most ``threshold``.
        threshold: Cut-off interpreted according to ``metric``. NOTE: edit
            distances are whole numbers, so the default of 0.3 only accepts
            identical soups with ``'levenshtein'``; pass a larger value to
            tolerate edits.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the merged ``A``
        rows followed by the unmatched ``B`` rows. Columns are those of ``A``
        followed by the columns only present in ``B``, minus ``soup_B``.

    Raises:
        ValueError: If ``metric`` is neither ``'cosine'`` nor
            ``'levenshtein'``.
    """
    if metric not in ('cosine', 'levenshtein'):
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'cosine' or 'levenshtein'"
        )

    # best_match: label of the closest B row for every A row.
    # is_match:   whether that closest row passes the threshold.
    if len(A) == 0 or len(B) == 0:
        # Nothing to compare; the vectorizer and idxmax/idxmin would fail on
        # an empty side, so treat every row as unmatched.
        best_match = pd.Series(index=A.index, dtype=object)
        is_match = pd.Series(False, index=A.index)
    elif metric == 'cosine':
        tfidf = TfidfVectorizer(stop_words='english')

        # Fit on both sides so A and B share one vocabulary and IDF weights.
        tfidf.fit(pd.concat([A[soup_A], B[soup_B]], ignore_index=True))

        similarity_df = pd.DataFrame(
            cosine_similarity(
                tfidf.transform(A[soup_A]),
                tfidf.transform(B[soup_B]),
            ),
            index=A.index,
            columns=B.index,
        )
        best_match = similarity_df.idxmax(axis=1)
        is_match = similarity_df.max(axis=1) > threshold
    else:
        keys_B = B[soup_B].tolist()
        distance_df = pd.DataFrame(
            [[distance(key_A, key_B) for key_B in keys_B] for key_A in A[soup_A]],
            index=A.index,
            columns=B.index,
            dtype=float,
        )
        # Smaller distance means more similar, hence idxmin / <=.
        best_match = distance_df.idxmin(axis=1)
        is_match = distance_df.min(axis=1) <= threshold

    # Deterministic column order: A's columns, then the ones only B has.
    combined_columns = list(A.columns) + [
        column for column in B.columns if column not in A.columns
    ]

    # Merge each A row with its match (if any); A's non-missing values win.
    merged_rows = []
    for idx_A in A.index:
        row = A.loc[idx_A]
        if is_match.loc[idx_A]:
            row = row.combine_first(B.loc[best_match.loc[idx_A]])
        merged_rows.append(row.to_frame().T)

    # Concatenate once instead of growing the frame row by row.
    if merged_rows:
        combined_data = pd.concat(merged_rows, ignore_index=True).reindex(
            columns=combined_columns
        )
    else:
        combined_data = pd.DataFrame(columns=combined_columns)

    # Append the B rows that no A row selected as an accepted match.
    new_records = B.loc[~B.index.isin(best_match[is_match].values)]
    result = pd.concat([combined_data, new_records], ignore_index=True)
    return result.drop(columns=soup_B)


# Fold the layouts together one at a time; each intermediate result keeps the
# 'soup1' key so it can be matched against the next layout.
result_12 = combine_layouts(A=layout1, B=layout2, soup_A='soup1', soup_B='soup2')
result_123 = combine_layouts(A=result_12, B=layout3, soup_A='soup1', soup_B='soup3')
result_1234 = combine_layouts(A=result_123, B=layout4, soup_A='soup1', soup_B='soup4')
final_result = combine_layouts(A=result_1234, B=layout5, soup_A='soup1', soup_B='soup5')

# The matching key is no longer needed in the final table.
final_result = final_result.drop(columns='soup1')
final_result

Unnamed: 0,Blood Group,Date of Birth,votersID,Permanent_Address,SpouseName,Citizenship Number,National Id,Mother Name,Gender,Customer ID,Name,Temporary_Address,PAN_Number,Customer Code,License Number,Father_Name,Mobile Number,votersAge,SC Number
0,AB+,1990-01-01,11116874,"Baluwatar, Kathmandu, Nepal",Sita Thapa,624-93227-32431/660086,AB123C,Laxmi Thapa,Male,3245,Ram Thapa,"Gongabu, Kathmandu, Nepal",ABCDE1234F,21216874,15-05-58353205,Ram Bahadur Thapa,1234567890,45,001.01.01
1,AB-,1991-02-02,22259363,"Lakeside, Pokhara, Nepal",Ravi Sharma,747-42087-31417/584714,DE456F,Radha Sharma,Female,3246,Sita Shrestha,"New Road, Pokhara, Nepal",FGHIJ5678K,22359363,21-08-00435579,Hari Prasad Shrestha,2345678901,38,001.01.02
2,B-,1992-03-03,33385241,"Chitwan National Park, Chitwan, Nepal",Maya Adhikari,389-45382-93886/821590,GH789I,Gita Adhikari,Male,3247,Hari Gurung,"Pulchowk, Lalitpur, Nepal",LMNOP9012L,33485241,93-12-35351480,Gopal Krishna Gurung,3456789012,52,001.01.03
3,A-,1993-04-04,44475489,"Biratnagar, Morang, Nepal",Surya Rai,571-38785-99733/440035,JK012L,Mina Rai,Female,3248,Gita Tamang,"Bagbazar, Kathmandu, Nepal",QRSTU3456M,45475489,65-03-68139881,Shyam Lal Tamang,4567890123,30,001.01.04
4,A+,1994-05-05,55562139,"Bharatpur, Chitwan, Nepal",Sarita Karki,864-17331-40021/961722,MN345O,Kalpana Karki,Male,3249,Mohan Lama,"Balkumari, Lalitpur, Nepal",VWXYZ7890N,56562139,14-11-40056582,Krishna Raj Lama,5678901234,27,001.01.05
5,B-,1995-06-06,66698214,"Butwal, Rupandehi, Nepal",Pemba Gurung,033-51347-62581/380746,PQ678R,Nima Gurung,Female,3250,Radha Magar,"Chabahil, Kathmandu, Nepal",ABCD1234PQ,67698214,82-09-81734599,Narayan Kumar Magar,6789012345,41,001.01.06
6,AB+,1996-07-07,77714635,"Hetauda, Makwanpur, Nepal",Laxmi Shrestha,068-37653-84341/852787,ST901U,Saru Shrestha,Male,3251,Krishna Rai,"Kumaripati, Lalitpur, Nepal",EFGH5678RS,78714635,26-03-50185868,Govinda Bahadur Rai,7890123456,36,001.01.07
7,A-,1997-08-08,88847326,"Janakpur, Dhanusa, Nepal",Raj Maharjan,323-37869-95909/623481,VW234X,Nanu Maharjan,Female,3252,Sarita Sherpa,"New Baneshwor, Kathmandu, Nepal",TUVW9012XY,89847326,98-03-72394228,Shiva Narayan Sherpa,8901234567,50,001.01.08
8,B-,1998-09-09,99953421,"Nepalgunj, Banke, Nepal",Pema Tamang,406-94259-70142/249811,YZ567A,Lhamu Tamang,Male,3253,Bikash Karki,"Boudha, Kathmandu, Nepal",ZABC3456DE,91953421,68-01-80388982,Bhagirath Bahadur Karki,9012345678,43,001.01.09
9,A-,1999-10-10,10103847,"Dharan, Sunsari, Nepal",Manish KC,272-28301-42325/881177,BC890D,Lila KC,Female,3254,Nisha Bhandari,"Kalanki, Kathmandu, Nepal",FGHI7890JK,10203847,42-08-02614125,Surya Bahadur Bhandari,123456789,34,001.01.10
