### Import Libraries

In [1]:
import pandas as pd

### Load Datasets

In [2]:

# Read the five source layouts; each CSV becomes its own DataFrame.
layout1, layout2, layout3, layout4, layout5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/ABC_layout_1.csv',
        './dataset/PQR_layout_2.csv',
        './dataset/layout_3_voters.csv',
        './dataset/KLM_layout_4.csv',
        './dataset/layout_5_license.csv',
    )
)

In [3]:
# Preview the first five rows of layout 1.
layout1.head(n=5)

Unnamed: 0,Customer Code,First Name,Date of Birth,National Id,Father Name,Mother Name,Permanent_Adress,Temporary_Address,Mobile Number
0,21216874,Ram Thapa,1990-01-01,AB123C,Ram Bahadur Thapa,Laxmi Thapa,"Baluwatar, Kathmandu, Nepal",,1234567890
1,22359363,Sita Shrestha,1991-02-02,DE456F,Hari Prasad Shrestha,Radha Sharma,"Lakeside, Pokhara, Nepal","New Road, Pokhara, Nepal",2345678901
2,33485241,Hari Gurung,1992-03-03,GH789I,Gopal Krishna Gurung,Gita Adhikari,"Chitwan National Park, Chitwan, Nepal","Pulchowk, Lalitpur, Nepal",3456789012
3,45475489,Gita Tamang,1993-04-04,JK012L,Shyam Lal Tamang,Mina Rai,"Biratnagar, Morang, Nepal","Bagbazar, Kathmandu, Nepal",4567890123
4,56562139,Mohan Lama,1994-05-05,MN345O,Krishna Raj Lama,Kalpana Karki,"Bharatpur, Chitwan, Nepal",,5678901234


In [4]:
# Preview the first five rows of layout 2.
layout2.head(n=5)

Unnamed: 0,Customer_ID,Name,Date of Birth,Father_Name,Gender,PAN_Number
0,1234567890,Ram Thapa,1990-01-01,Ram Bahadur Thapa,Male,ABCDE1234F
1,2345678901,Sita Shrestha,1991-02-02,Hari Prasad Shrestha,Female,FGHIJ5678K
2,3456789012,Hari Gurung,1992-03-03,Gopal Krishna Gurung,Male,LMNOP9012L
3,4567890123,Gita Tamang,1993-04-04,Shyam Lal Tamang,Female,QRSTU3456M
4,5678901234,Mohan Lama,1994-05-05,Krishna Raj Lama,Male,VWXYZ7890N


### Data Preprocessing

In [5]:
def sanitize(df):
    """Return a new DataFrame with commas and whitespace removed from string cells.

    Every cell that is a ``str`` has its commas and spaces deleted and any
    remaining leading/trailing whitespace (tabs, newlines, ...) stripped.
    Non-string cells (numbers, NaN, ...) are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame of the same shape with the cleaned values.
    """
    def _squash(value):
        if not isinstance(value, str):
            return value
        return value.replace(',', '').replace(' ', '').strip()

    # DataFrame.map was introduced in pandas 2.1; older releases expose the
    # same element-wise operation as DataFrame.applymap. Checking the class
    # (not the instance) avoids being fooled by a column literally named "map".
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    return elementwise(_squash)

In [6]:
# Build sanitized working copies (trailing underscore) so the raw frames keep
# their original, human-readable values. Every layout goes through the same
# cleaning step -- layout5_ was previously copied but left unsanitized.
layout1_ = sanitize(layout1.copy())
layout2_ = sanitize(layout2.copy())
layout3_ = sanitize(layout3.copy())
layout4_ = sanitize(layout4.copy())
layout5_ = sanitize(layout5.copy())

### Creating Composite Keys for Entity Matching

In [7]:
# Composite key for layout 1: sanitized name, date of birth and father's name,
# space-separated and lower-cased. The key is stored on the raw frame.
layout1['soup1'] = layout1_[['First Name', 'Date of Birth', 'Father Name']].apply(
    lambda row: ' '.join(str(value) for value in row.to_numpy()).lower(),
    axis=1,
)

In [8]:
# Composite key for layout 2: sanitized name, date of birth and father's name,
# space-separated and lower-cased. The key is stored on the raw frame.
layout2['soup2'] = layout2_[['Name', 'Date of Birth', 'Father_Name']].apply(
    lambda row: ' '.join(str(value) for value in row.to_numpy()).lower(),
    axis=1,
)

### Cosine Similarity

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a single vocabulary on the keys of both layouts so that the two
# TF-IDF matrices share the same feature space.
combined_soup = pd.concat([layout1['soup1'], layout2['soup2']], ignore_index=True)
tfidf = TfidfVectorizer(stop_words='english').fit(combined_soup)

tfidf_matrix_df1, tfidf_matrix_df2 = (
    tfidf.transform(soup) for soup in (layout1['soup1'], layout2['soup2'])
)

# Rows are layout1 records, columns are layout2 records.
similarity = cosine_similarity(tfidf_matrix_df1, tfidf_matrix_df2)
similarity_df = pd.DataFrame(
    similarity,
    index=layout1.index,
    columns=layout2.index,
)

similarity_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277779,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277779,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277779,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277779,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277779,0.0,0.0,0.0


### Find the Records That Refer to the Same Entity

In [10]:
# An earlier version of this cell paired every layout1 row with its
# highest-scoring layout2 row unconditionally. The version below keeps the
# best candidate only when its cosine similarity exceeds `threshold`;
# otherwise the row is left unmatched (soup2 is None).
threshold = 0.8
max_idx_row = similarity_df.idxmax(axis=1)
similarity_mask = similarity_df.max(axis=1).gt(threshold)

combined_df = pd.DataFrame({
    'soup1': layout1['soup1'].values,
    'soup2': [
        layout2.loc[best_idx, 'soup2'] if is_match else None
        for best_idx, is_match in zip(max_idx_row.values, similarity_mask)
    ],
})

combined_df.head()

Unnamed: 0,soup1,soup2
0,ramthapa 1990-01-01 rambahadurthapa,ramthapa 1990-01-01 rambahadurthapa
1,sitashrestha 1991-02-02 hariprasadshrestha,sitashrestha 1991-02-02 hariprasadshrestha
2,harigurung 1992-03-03 gopalkrishnagurung,harigurung 1992-03-03 gopalkrishnagurung
3,gitatamang 1993-04-04 shyamlaltamang,gitatamang 1993-04-04 shyamlaltamang
4,mohanlama 1994-05-05 krishnarajlama,mohanlama 1994-05-05 krishnarajlama


### Merge the Records of the Same Entity to Create a Super Database

In [11]:
# combined_df was built row-by-row from layout1, so its 'soup2' column is
# already aligned positionally with layout1. Attaching it directly avoids
# re-joining on 'soup1', which would multiply rows whenever two layout1
# records share the same composite key.
# The inner join on 'soup2' then drops layout1 rows that found no match
# (soup2 is None) and pulls in the matching layout2 columns.
result = (
    layout1
    .assign(soup2=combined_df['soup2'].values)
    .merge(layout2, on='soup2', how='inner')
    .drop(['soup1', 'soup2'], axis=1)
)
result.head()

Unnamed: 0,Customer Code,First Name,Date of Birth_x,National Id,Father Name,Mother Name,Permanent_Adress,Temporary_Address,Mobile Number,Customer_ID,Name,Date of Birth_y,Father_Name,Gender,PAN_Number
0,21216874,Ram Thapa,1990-01-01,AB123C,Ram Bahadur Thapa,Laxmi Thapa,"Baluwatar, Kathmandu, Nepal",,1234567890,1234567890,Ram Thapa,1990-01-01,Ram Bahadur Thapa,Male,ABCDE1234F
1,22359363,Sita Shrestha,1991-02-02,DE456F,Hari Prasad Shrestha,Radha Sharma,"Lakeside, Pokhara, Nepal","New Road, Pokhara, Nepal",2345678901,2345678901,Sita Shrestha,1991-02-02,Hari Prasad Shrestha,Female,FGHIJ5678K
2,33485241,Hari Gurung,1992-03-03,GH789I,Gopal Krishna Gurung,Gita Adhikari,"Chitwan National Park, Chitwan, Nepal","Pulchowk, Lalitpur, Nepal",3456789012,3456789012,Hari Gurung,1992-03-03,Gopal Krishna Gurung,Male,LMNOP9012L
3,45475489,Gita Tamang,1993-04-04,JK012L,Shyam Lal Tamang,Mina Rai,"Biratnagar, Morang, Nepal","Bagbazar, Kathmandu, Nepal",4567890123,4567890123,Gita Tamang,1993-04-04,Shyam Lal Tamang,Female,QRSTU3456M
4,56562139,Mohan Lama,1994-05-05,MN345O,Krishna Raj Lama,Kalpana Karki,"Bharatpur, Chitwan, Nepal",,5678901234,5678901234,Mohan Lama,1994-05-05,Krishna Raj Lama,Male,VWXYZ7890N


### Using Levenshtein Distance (Edit Distance) as the Similarity Measure

In [12]:
import pandas as pd
import numpy as np
from Levenshtein import distance

# Pairwise edit distance between every layout1 key (rows) and every layout2
# key (columns). The matrix is built in one pass from plain Python lists
# instead of writing each cell through .loc, which is far slower and
# misbehaves if an index contains duplicate labels.
soup1_values = layout1['soup1'].tolist()
soup2_values = layout2['soup2'].tolist()
distance_matrix = pd.DataFrame(
    np.array(
        [[distance(left, right) for right in soup2_values] for left in soup1_values],
        dtype=float,
    ).reshape(len(soup1_values), len(soup2_values)),
    index=layout1.index,
    columns=layout2.index,
)

# Closest layout2 record (and its distance) for each layout1 record.
min_idx_row = distance_matrix.idxmin(axis=1)
min_distance = distance_matrix.min(axis=1)

# NOTE(review): 20 edits is a large budget relative to the length of the keys
# shown above (roughly 35-45 characters) -- confirm this threshold is intended.
threshold = 20
distance_mask = min_distance <= threshold

combined_df = pd.DataFrame({
    'soup1': layout1['soup1'].values,
    'soup2': [
        layout2.loc[idx, 'soup2'] if mask else None
        for idx, mask in zip(min_idx_row.values, distance_mask)
    ],
})

# combined_df is row-aligned with layout1, so attach 'soup2' positionally
# rather than re-joining on 'soup1' (duplicate keys would multiply rows).
# The inner join drops unmatched rows (soup2 is None).
result = (
    layout1
    .assign(soup2=combined_df['soup2'].values)
    .merge(layout2, on='soup2', how='inner')
    .drop(['soup1', 'soup2'], axis=1)
)

result.head()

Unnamed: 0,Customer Code,First Name,Date of Birth_x,National Id,Father Name,Mother Name,Permanent_Adress,Temporary_Address,Mobile Number,Customer_ID,Name,Date of Birth_y,Father_Name,Gender,PAN_Number
0,21216874,Ram Thapa,1990-01-01,AB123C,Ram Bahadur Thapa,Laxmi Thapa,"Baluwatar, Kathmandu, Nepal",,1234567890,1234567890,Ram Thapa,1990-01-01,Ram Bahadur Thapa,Male,ABCDE1234F
1,22359363,Sita Shrestha,1991-02-02,DE456F,Hari Prasad Shrestha,Radha Sharma,"Lakeside, Pokhara, Nepal","New Road, Pokhara, Nepal",2345678901,2345678901,Sita Shrestha,1991-02-02,Hari Prasad Shrestha,Female,FGHIJ5678K
2,33485241,Hari Gurung,1992-03-03,GH789I,Gopal Krishna Gurung,Gita Adhikari,"Chitwan National Park, Chitwan, Nepal","Pulchowk, Lalitpur, Nepal",3456789012,3456789012,Hari Gurung,1992-03-03,Gopal Krishna Gurung,Male,LMNOP9012L
3,45475489,Gita Tamang,1993-04-04,JK012L,Shyam Lal Tamang,Mina Rai,"Biratnagar, Morang, Nepal","Bagbazar, Kathmandu, Nepal",4567890123,4567890123,Gita Tamang,1993-04-04,Shyam Lal Tamang,Female,QRSTU3456M
4,56562139,Mohan Lama,1994-05-05,MN345O,Krishna Raj Lama,Kalpana Karki,"Bharatpur, Chitwan, Nepal",,5678901234,5678901234,Mohan Lama,1994-05-05,Krishna Raj Lama,Male,VWXYZ7890N
