### Change-log

- No need to pass 'soup' as an argument to the function
- Added `source` and `modified_date` columns

### Import Libraries

In [1]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading Dataset and Data Preprocessing

In [2]:
def load_df(filename, data_dir='./dataset'):
    """Read one layout CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    data_dir : str, optional
        Directory that contains the CSV. Defaults to ``'./dataset'``, which is
        the location the notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from ``<data_dir>/<filename>.csv``.
    """
    return pd.read_csv(f'{data_dir}/{filename}.csv')

# Load the five layout CSVs (load_df resolves each name inside ./dataset).
layout1, layout2, layout3, layout4, layout5 = map(
    load_df,
    ('ABC_layout_1', 'PQR_layout_2', 'layout_3_voters', 'KLM_layout_4', 'layout_5_license'),
)

# Rename headers so the layouts share the labels Name, Father_Name and
# Permanent_Address wherever they carry those fields. layout5 is left as loaded.
layout1 = layout1.rename(columns={
    "First Name": "Name",
    "Father Name": "Father_Name",
    "Permanent_Adress": "Permanent_Address",
})
layout2 = layout2.rename(columns={"Customer_ID": "Mobile Number"})
layout3 = layout3.rename(columns={
    "votersName": "Name",
    "votersFatherName": "Father_Name",
    "votersMotherName": "Mother Name",
    " Gender": "Gender",
    "Permanent_Adress": "Permanent_Address",
})
layout4 = layout4.rename(columns={"Father Name": "Father_Name"})

layout_sources = ['bank', 'esewa', 'voter', 'electricity', 'license']
layouts = [layout1, layout2, layout3, layout4, layout5]

# Record on every row which layout it came from and when it was loaded.
for source, layout in zip(layout_sources, layouts):
    layout['source'] = source
    layout['modified_date'] = datetime.now()

def sanitize(df):
    """Return a cleaned copy of ``df`` suitable for building match keys.

    Each cell is processed independently:

    * strings lose every comma and space, then any remaining leading or
      trailing whitespace is stripped;
    * missing values (as judged by ``pd.isna``) become the empty string;
    * every other value is passed through unchanged.

    The input frame is not modified.
    """
    def _clean(value):
        if isinstance(value, str):
            return value.replace(',', '').replace(' ', '').strip()
        return '' if pd.isna(value) else value

    # DataFrame.map was introduced in pandas 2.1; older releases expose the
    # same element-wise operation as DataFrame.applymap.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    return elementwise(_clean)
    
def create_soup(df, df_, soup, soup_name):
    """Write a lower-cased, space-joined text key into ``df``.

    For every row of ``df_``, the values of the columns listed in ``soup`` are
    converted to strings, joined with single spaces and lower-cased. The
    result is assigned to ``df[soup_name]``, so ``df`` is modified in place
    and the function returns nothing.
    """
    def _row_to_text(row):
        pieces = row.values.astype(str)
        joined = ' '.join(pieces)
        return joined.lower()

    selected = df_[soup]
    df[soup_name] = selected.apply(_row_to_text, axis=1)

# Columns whose values are joined into each row's "soup" match key.
soup = ['Name', 'Date of Birth', 'Father_Name']

# The soup is computed from a sanitized copy so the original layouts keep
# their untouched values; only the new "soup" column is written back to them.
layout_copies = [frame.copy() for frame in layouts]

for layout, layout_copy in zip(layouts, layout_copies):
    layout_copy = sanitize(layout_copy)
    create_soup(layout, layout_copy, soup, "soup")

### Entity Matching

In [14]:
def combine_layouts(A, B, metric='levenshtein', threshold=20):
    """Merge two layouts, fusing the rows whose ``soup`` keys match.

    Every row of ``A`` is compared with every row of ``B`` through their
    ``soup`` strings. The best-scoring row of ``B`` is accepted as a match when
    it passes ``threshold``. A matched pair becomes one row in which ``A``'s
    non-missing values win, ``source`` lists both origins and ``modified_date``
    is set to the current time. Rows of ``B`` that no row of ``A`` matched are
    appended at the end.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Layouts with a ``soup`` column; ``source`` is read from both frames
        whenever a pair matches.
    metric : str, optional
        ``'cosine'`` scores with TF-IDF cosine similarity. Any other value
        falls back to Levenshtein edit distance.
    threshold : number, optional
        Levenshtein: a pair matches when ``distance <= threshold``.
        Cosine: a pair matches when ``similarity > threshold``. Similarity lies
        in [0, 1], so the default of 20 only makes sense for Levenshtein; pass
        a value below 1 together with ``metric='cosine'``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``A`` followed by the unmatched rows of ``B``, with
        a fresh 0..n-1 index. Columns are ``A``'s columns in their original
        order followed by the columns found only in ``B``.
    """
    def calculate_similarity(A, B, metric, threshold):
        """Return, per row of A, the label of its best row in B and a match flag."""
        if len(A) == 0 or len(B) == 0:
            # idxmin/idxmax cannot reduce over an empty axis, and with nothing
            # to compare against there can be no match anyway.
            no_match = pd.Series(False, index=A.index, dtype=bool)
            return pd.Series(index=A.index, dtype=object), no_match

        if metric == 'cosine':
            if threshold >= 1:
                warnings.warn(
                    f"cosine similarity lies in [0, 1]; threshold={threshold!r} "
                    "will match (almost) nothing",
                    stacklevel=3,
                )
            tfidf = TfidfVectorizer(stop_words='english')
            # Fit on both layouts so they share a single vocabulary.
            tfidf.fit(pd.concat([A['soup'], B['soup']], ignore_index=True))
            similarity = cosine_similarity(tfidf.transform(A['soup']), tfidf.transform(B['soup']))
            similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)
            return similarity_df.idxmax(axis=1), similarity_df.max(axis=1) > threshold

        distance_matrix = pd.DataFrame(
            [[distance(a, b) for b in B['soup']] for a in A['soup']],
            index=A.index,
            columns=B.index,
        )
        return distance_matrix.idxmin(axis=1), distance_matrix.min(axis=1) <= threshold

    def merge_data(A, B, idx_row, similarity_mask):
        """Build the merged frame from the match assignment."""
        rows = []
        for idx_A in A.index:
            row_A = A.loc[idx_A]
            if similarity_mask.loc[idx_A]:
                row_B = B.loc[idx_row.loc[idx_A]]
                # combine_first returns a new Series, so A itself is untouched.
                combined_row = row_A.combine_first(row_B)
                combined_row['source'] = f"{row_A['source']}, {row_B['source']}"
                combined_row['modified_date'] = datetime.now()
            else:
                combined_row = row_A
            rows.append(combined_row)

        # Ordered union (A first, then B-only labels, then anything a merged
        # row added) instead of a set, whose order changes between runs.
        label_groups = (A.columns, B.columns, *(row.index for row in rows))
        combined_columns = list(dict.fromkeys(
            label for labels in label_groups for label in labels
        ))

        # Assemble once rather than concatenating row by row; object dtype
        # keeps the mixed per-row values exactly as they were collected.
        combined_data = pd.DataFrame(rows, columns=combined_columns, dtype=object)

        new_records = B.loc[~B.index.isin(idx_row[similarity_mask].values)]
        return pd.concat([combined_data, new_records], ignore_index=True)

    idx_row, similarity_mask = calculate_similarity(A, B, metric, threshold)
    return merge_data(A, B, idx_row, similarity_mask)

# Fold the layouts together one at a time, left to right. Each call uses the
# default Levenshtein metric and threshold; the intermediate frames are kept
# so any stage can be inspected.
result_12 = combine_layouts(A=layout1, B=layout2)
result_123 = combine_layouts(A=result_12, B=layout3)
result_1234 = combine_layouts(A=result_123, B=layout4)
final_result = combine_layouts(A=result_1234, B=layout5)

# Display the fully merged table as the cell output.
final_result

Unnamed: 0,License Number,Temporary_Address,Gender,Mother Name,Citizenship Number,soup,Name,SpouseName,PAN_Number,votersAge,...,Permanent_Address,SC Number,Father_Name,National Id,Date of Birth,Blood Group,Customer ID,source,votersID,Customer Code
0,15-05-58353205,"Gongabu, Kathmandu, Nepal",Male,Laxmi Thapa,624-93227-32431/660086,ramthapa 1990-01-01 rambahadurthapa,Ram Thapa,Sita Thapa,ABCDE1234F,45.0,...,"Baluwatar, Kathmandu, Nepal",001.01.01,Ram Bahadur Thapa,AB123C,1990-01-01,AB+,3245.0,"bank, esewa, voter, electricity, license",11116874.0,21216874.0
1,21-08-00435579,"New Road, Pokhara, Nepal",Female,Radha Sharma,747-42087-31417/584714,sitashrestha 1991-02-02 hariprasadshrestha,Sita Shrestha,Ravi Sharma,FGHIJ5678K,38.0,...,"Lakeside, Pokhara, Nepal",001.01.02,Hari Prasad Shrestha,DE456F,1991-02-02,AB-,3246.0,"bank, esewa, voter, electricity, license",22259363.0,22359363.0
2,93-12-35351480,"Pulchowk, Lalitpur, Nepal",Male,Gita Adhikari,389-45382-93886/821590,harigurung 1992-03-03 gopalkrishnagurung,Hari Gurung,Maya Adhikari,LMNOP9012L,52.0,...,"Chitwan National Park, Chitwan, Nepal",001.01.03,Gopal Krishna Gurung,GH789I,1992-03-03,B-,3247.0,"bank, esewa, voter, electricity, license",33385241.0,33485241.0
3,65-03-68139881,"Bagbazar, Kathmandu, Nepal",Female,Mina Rai,571-38785-99733/440035,gitatamang 1993-04-04 shyamlaltamang,Gita Tamang,Surya Rai,QRSTU3456M,30.0,...,"Biratnagar, Morang, Nepal",001.01.04,Shyam Lal Tamang,JK012L,1993-04-04,A-,3248.0,"bank, esewa, voter, electricity, license",44475489.0,45475489.0
4,14-11-40056582,"Balkumari, Lalitpur, Nepal",Male,Kalpana Karki,864-17331-40021/961722,mohanlama 1994-05-05 krishnarajlama,Mohan Lama,Sarita Karki,VWXYZ7890N,27.0,...,"Bharatpur, Chitwan, Nepal",001.01.05,Krishna Raj Lama,MN345O,1994-05-05,A+,3249.0,"bank, esewa, voter, electricity, license",55562139.0,56562139.0
5,82-09-81734599,"Chabahil, Kathmandu, Nepal",Female,Nima Gurung,033-51347-62581/380746,radhamagar 1995-06-06 narayankumarmagar,Radha Magar,Pemba Gurung,ABCD1234PQ,41.0,...,"Butwal, Rupandehi, Nepal",001.01.06,Narayan Kumar Magar,PQ678R,1995-06-06,B-,3250.0,"bank, esewa, voter, electricity, license",66698214.0,67698214.0
6,,"Kumaripati, Lalitpur, Nepal",Male,Saru Shrestha,,krishnarai 1996-07-07 govindabahadurrai,Krishna Rai,Laxmi Shrestha,EFGH5678RS,36.0,...,"Hetauda, Makwanpur, Nepal",001.01.07,Govinda Bahadur Rai,ST901U,1996-07-07,,3251.0,"bank, esewa, voter, electricity",77714635.0,78714635.0
7,98-03-72394228,"New Baneshwor, Kathmandu, Nepal",Female,Nanu Maharjan,323-37869-95909/623481,saritasherpa 1997-08-08 shivanarayansherpa,Sarita Sherpa,Raj Maharjan,TUVW9012XY,50.0,...,"Janakpur, Dhanusa, Nepal",001.01.08,Shiva Narayan Sherpa,VW234X,1997-08-08,A-,3252.0,"bank, esewa, voter, electricity, license",88847326.0,89847326.0
8,68-01-80388982,"Boudha, Kathmandu, Nepal",Male,Lhamu Tamang,406-94259-70142/249811,bikashkarki 1998-09-09 bhagirathbahadurkarki,Bikash Karki,Pema Tamang,ZABC3456DE,43.0,...,"Nepalgunj, Banke, Nepal",001.01.09,Bhagirath Bahadur Karki,YZ567A,1998-09-09,B-,3253.0,"bank, esewa, voter, electricity, license",99953421.0,91953421.0
9,42-08-02614125,"Kalanki, Kathmandu, Nepal",Female,Lila KC,272-28301-42325/881177,nishabhandari 1999-10-10 suryabahadurbhandari,Nisha Bhandari,Manish KC,FGHI7890JK,34.0,...,"Dharan, Sunsari, Nepal",001.01.10,Surya Bahadur Bhandari,BC890D,1999-10-10,A-,3254.0,"bank, esewa, voter, electricity, license",10103847.0,10203847.0
