### Changelog

- Need to create dynamic data loading/preprocessing/saving (for any number of files)

### Import Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance
from datetime import datetime 
import os

### Loading Dataset 

In [11]:
def load_df_from_dir(dir_path):
    """Read every ``.csv`` file found directly inside ``dir_path``.

    Files are read in sorted file-name order. ``os.listdir`` gives no ordering
    guarantee, and the order of the returned list matters downstream: the first
    frame is used as the base of the merge and 'source' labels are joined in
    list order.

    Args:
        dir_path: Directory to scan. Only names ending in ".csv" (case-sensitive)
            are loaded; sub-directories are not searched.

    Returns:
        list[pandas.DataFrame]: One frame per CSV file, in sorted file-name
        order. Empty when the directory holds no CSV files.

    Raises:
        OSError: If ``dir_path`` cannot be listed (e.g. it does not exist).
    """
    csv_files = sorted(f for f in os.listdir(dir_path) if f.endswith(".csv"))
    layouts = []

    for csv_file in csv_files:
        file_path = os.path.join(dir_path, csv_file)
        df = pd.read_csv(file_path)
        # Tagging each frame with its file name is currently disabled:
        # df['source'] = os.path.splitext(csv_file)[0]
        layouts.append(df)

    return layouts

In [12]:
# layouts = load_df_from_dir('dataset_clean')
# Load every CSV found in 'results' as the list of layouts to merge.
# NOTE: the save cell further down writes its output into 'results' as well and
# then removes the other files there, so a successful run consumes these inputs.
layouts = load_df_from_dir('results')

### Save DF to CSV

In [13]:
# import os
# import pandas as pd

# def save_dfs_to_csv(layouts, output_dir):

#     os.makedirs(output_dir, exist_ok=True)
    
#     for i, df in enumerate(layouts, start=1):
#         file_name = f"layout{i}.csv"
#         file_path = os.path.join(output_dir, file_name)
#         df.to_csv(file_path, index=False)
#         print(f"Saved {file_path}")


### Data Preprocessing

In [14]:
# Stamp every loaded layout with the current time. Each frame gets its own
# datetime.now() call, and any existing 'last_modified_date' values are replaced.
for layout in layouts:
    layout['last_modified_date'] = datetime.now()

def sanitize(df):
    return df.map(lambda x: x.replace(',', '').replace(' ', '').strip() if isinstance(x, str) else '' if pd.isna(x) else x)
    
def create_soup(df, df_, soup, soup_name):
    """Add a lower-cased, space-joined text column to ``df`` (in place).

    Args:
        df: Frame that receives the new column.
        df_: Frame the text is built from (the caller passes a sanitized copy).
        soup: List of column names in ``df_`` whose values are joined, in order.
        soup_name: Name of the column written to ``df``.

    Returns:
        None. ``df`` is modified in place.
    """
    def _row_to_text(row):
        as_text = row.values.astype(str)
        return ' '.join(as_text).lower()

    selected = df_[soup]
    df[soup_name] = selected.apply(_row_to_text, axis=1)

# Columns whose sanitized values are joined into the matching key.
soup = ['Name', 'Date of Birth', 'Father_Name']

# Snapshot each layout; the soup is built from the sanitized snapshot and
# written back onto the original frame, which keeps its raw values.
layout_copies = [frame.copy() for frame in layouts]

for layout, layout_copy in zip(layouts, layout_copies):
    layout_copy = sanitize(layout_copy)
    create_soup(layout, layout_copy, soup, "soup")

### Entity Matching

In [15]:
def combine_layouts(A, B, metric='cosine', threshold=0.8):
    """Merge layout ``B`` into layout ``A`` by matching rows on their 'soup' text.

    For every row of ``A`` the single best-scoring row of ``B`` is looked up.
    When that score passes ``threshold`` the rows are merged: values from ``A``
    win, gaps are filled from ``B``, 'source' becomes "<A source>, <B source>"
    and 'last_modified_date' is set to the current time. Rows of ``A`` without
    a match are kept as they are, and rows of ``B`` that no matched row of
    ``A`` selected are appended at the end.

    Args:
        A: Base DataFrame; must contain a 'soup' column.
        B: DataFrame merged into ``A``; must contain a 'soup' column.
        metric: 'cosine' scores rows by TF-IDF cosine similarity. Any other
            value falls back to the imported ``distance`` function
            (Levenshtein edit distance).
        threshold: For 'cosine', a row matches when its best similarity is
            strictly greater than this value. For the distance metric, a row
            matches when its smallest distance is <= this value (so the
            default 0.8 presumably admits only identical soups, assuming
            ``distance`` returns whole numbers).

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index. Columns are ``A``'s columns
        in their original order followed by the columns only ``B`` has.

    Raises:
        KeyError: If 'soup' is missing, or if a match is found and either
            frame lacks a 'source' column.
    """
    # Ordered union: keeps the output column order stable from run to run,
    # unlike a set whose iteration order depends on string hashing.
    combined_columns = list(dict.fromkeys([*A.columns, *B.columns]))

    # With no rows on one side nothing can match (and idxmax/idxmin over zero
    # candidates would fail), so just hand back the side that has data.
    if len(A) == 0 or len(B) == 0:
        populated = B if len(A) == 0 else A
        return populated.reset_index(drop=True).reindex(columns=combined_columns)

    def calculate_similarity(A, B, metric):
        """Return (best B label per A row, boolean mask of A rows that match)."""
        if metric == 'cosine':
            tfidf = TfidfVectorizer(stop_words='english')
            # Fit on both sides so the two matrices share one vocabulary.
            combined_soup = pd.concat([A['soup'], B['soup']], ignore_index=True)
            tfidf.fit(combined_soup)
            tfidf_matrix_A = tfidf.transform(A['soup'])
            tfidf_matrix_B = tfidf.transform(B['soup'])
            similarity = cosine_similarity(tfidf_matrix_A, tfidf_matrix_B)
            similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)
            idx_row = similarity_df.idxmax(axis=1)
            similarity_mask = similarity_df.max(axis=1) > threshold
        else:
            distance_matrix = pd.DataFrame(
                [[distance(a, b) for b in B['soup']] for a in A['soup']],
                index=A.index,
                columns=B.index,
            )
            idx_row = distance_matrix.idxmin(axis=1)
            similarity_mask = distance_matrix.min(axis=1) <= threshold
        return idx_row, similarity_mask

    def merge_data(A, B, idx_row, similarity_mask):
        """Build the merged frame from the per-row match decisions."""
        row_frames = []
        for idx_A in A.index:
            row_A = A.loc[idx_A]
            if similarity_mask[idx_A]:
                row_B = B.loc[idx_row[idx_A]]
                # Object dtype lets the string/timestamp assignments below
                # succeed whatever dtype the original row happened to have.
                combined_row = row_A.astype(object).combine_first(row_B.astype(object))
                combined_row['source'] = f"{row_A['source']}, {row_B['source']}"
                combined_row['last_modified_date'] = datetime.now()
            else:
                combined_row = row_A
            row_frames.append(combined_row.to_frame().T)

        # One concat for all rows instead of growing a frame inside the loop.
        combined_data = (
            pd.concat(row_frames, ignore_index=True)
            .reindex(columns=combined_columns)
        )
        # B rows that no matched A row pointed at are new records.
        new_records = B.loc[~B.index.isin(idx_row[similarity_mask].values)]
        return pd.concat([combined_data, new_records], ignore_index=True)

    idx_row, similarity_mask = calculate_similarity(A, B, metric)
    return merge_data(A, B, idx_row, similarity_mask)


### Saving Results

#### Save Intermediate Results

In [16]:
# def save_layouts(layouts, save_path):
#     final_df = layouts[0]
#     results = [final_df] 

#     initial_part = "1"   
#     for i, df in enumerate(layouts[1:], start=2):
#         final_df = combine_layouts(final_df, df)
#         results.append(final_df)
        
#         initial_part += str(i)
#         final_df.to_csv(f"./{save_path}/result{initial_part}.csv", index=False)
    
#     return final_df, results

In [17]:
# final_df, results = save_layouts(layouts, 'results')

#### Save final result only

In [18]:
# def save_layouts(layouts, save_path):
#     final_df = layouts[0]

#     for df in layouts[1:]:
#         final_df = combine_layouts(final_df, df)
    
#     final_result_path = os.path.join(save_path, 'final_result.csv')
#     final_df.to_csv(final_result_path, index=False)
#     return final_df

In [19]:
# final_df = save_layouts(layouts, 'results')

#### Save final_result and delete source files if successful

In [20]:
import os
import glob

def save_layouts(layouts, save_path, save_filename):
    """Merge all layouts into one frame, save it, then clear the other files.

    The layouts are folded left to right with ``combine_layouts`` (the first
    frame is the base). The result is written to
    ``os.path.join(save_path, save_filename)``. Only after a successful write
    are the other regular files directly inside ``save_path`` deleted.
    Sub-directories are left alone, and the file that was just written is
    recognised by identity rather than by string comparison, so spellings such
    as './final_result.csv' cannot cause the result to delete itself.

    Args:
        layouts: Non-empty list of DataFrames to merge.
        save_path: Existing directory that receives the result and is cleaned.
        save_filename: File name of the CSV to write inside ``save_path``.

    Returns:
        The merged DataFrame, or None when writing the CSV failed (in which
        case nothing is deleted).

    Raises:
        IndexError: If ``layouts`` is empty.
        OSError: If one of the other files cannot be removed.
    """
    final_df = layouts[0]

    for df in layouts[1:]:
        final_df = combine_layouts(final_df, df)

    final_result_path = os.path.join(save_path, save_filename)

    try:
        final_df.to_csv(final_result_path, index=False)
    except Exception as e:
        print(f"Error saving final result: {e}")
        return None

    # If save is successful, delete all other files in save_path except final_result
    for candidate in glob.glob(os.path.join(save_path, '*')):
        if not os.path.isfile(candidate):
            # os.remove cannot delete directories; skip them instead of failing.
            continue
        if os.path.samefile(candidate, final_result_path):
            continue
        os.remove(candidate)

    return final_df

In [21]:
# Merge all loaded layouts, write results/final_result.csv and, if the write
# succeeds, remove the other files in 'results'. final_df is None on a failed write.
final_df = save_layouts(layouts, 'results', 'final_result.csv')

In [22]:
# Show the merged result.
final_df

Unnamed: 0,License Number,votersAge,Date of Birth,source,soup,Name,Customer Code,Blood Group,SC Number,Temporary_Address,...,Citizenship Number,Father_Name,Mother Name,National Id,Gender,Permanent_Address,last_modified_date,SpouseName,Customer ID,Mobile Number
0,15-05-58353205,45.0,1990-01-01,"layout1, layout2, layout3, layout4, layout5",ramthapa 1990-01-01 rambahadurthapa,Ram Thapa,21216874.0,AB+,001.01.01,"Gongabu, Kathmandu, Nepal",...,624-93227-32431/660086,Ram Bahadur Thapa,Laxmi Thapa,AB123C,Male,"Baluwatar, Kathmandu, Nepal",2024-05-30 15:12:55.479627,Sita Thapa,3245.0,1234567890
1,21-08-00435579,38.0,1991-02-02,"layout1, layout2, layout3, layout4, layout5",sitashrestha 1991-02-02 hariprasadshrestha,Sita Shrestha,22359363.0,AB-,001.01.02,"New Road, Pokhara, Nepal",...,747-42087-31417/584714,Hari Prasad Shrestha,Radha Sharma,DE456F,Female,"Lakeside, Pokhara, Nepal",2024-05-30 15:12:55.479627,Ravi Sharma,3246.0,2345678901
2,93-12-35351480,52.0,1992-03-03,"layout1, layout2, layout3, layout4, layout5",harigurung 1992-03-03 gopalkrishnagurung,Hari Gurung,33485241.0,B-,001.01.03,"Pulchowk, Lalitpur, Nepal",...,389-45382-93886/821590,Gopal Krishna Gurung,Gita Adhikari,GH789I,Male,"Chitwan National Park, Chitwan, Nepal",2024-05-30 15:12:55.479627,Maya Adhikari,3247.0,3456789012
3,65-03-68139881,30.0,1993-04-04,"layout1, layout2, layout3, layout4, layout5",gitatamang 1993-04-04 shyamlaltamang,Gita Tamang,45475489.0,A-,001.01.04,"Bagbazar, Kathmandu, Nepal",...,571-38785-99733/440035,Shyam Lal Tamang,Mina Rai,JK012L,Female,"Biratnagar, Morang, Nepal",2024-05-30 15:12:55.479627,Surya Rai,3248.0,4567890123
4,,27.0,1994-05-05,"layout1, layout2, layout3, layout4",mohanlama 1994-05-05 krishnarajlama,Mohan Lama,56562139.0,,001.01.05,,...,,Krishna Raj Lama,Kalpana Karki,MN345O,Male,"Bharatpur, Chitwan, Nepal",2024-05-30 15:12:55.437012,Sarita Karki,3249.0,5678901234
5,82-09-81734599,41.0,1995-06-06,"layout1, layout2, layout3, layout4, layout5",radhamagar 1995-06-06 narayankumarmagar,Radha Magar,67698214.0,B-,001.01.06,"Chabahil, Kathmandu, Nepal",...,033-51347-62581/380746,Narayan Kumar Magar,Nima Gurung,PQ678R,Female,"Butwal, Rupandehi, Nepal",2024-05-30 15:12:55.493584,Pemba Gurung,3250.0,6789012345
6,,36.0,1996-07-07,"layout1, layout2, layout3, layout4",krishnarai 1996-07-07 govindabahadurrai,Krishna Rai,78714635.0,,001.01.07,"Kumaripati, Lalitpur, Nepal",...,,Govinda Bahadur Rai,Saru Shrestha,ST901U,Male,"Hetauda, Makwanpur, Nepal",2024-05-30 15:12:55.437012,Laxmi Shrestha,3251.0,7890123456
7,98-03-72394228,50.0,1997-08-08,"layout1, layout2, layout3, layout4, layout5",saritasherpa 1997-08-08 shivanarayansherpa,Sarita Sherpa,89847326.0,A-,001.01.08,"New Baneshwor, Kathmandu, Nepal",...,323-37869-95909/623481,Shiva Narayan Sherpa,Nanu Maharjan,VW234X,Female,"Janakpur, Dhanusa, Nepal",2024-05-30 15:12:55.493584,Raj Maharjan,3252.0,8901234567
8,,43.0,1998-09-09,"layout1, layout2, layout3, layout4",bikashkarki 1998-09-09 bhagirathbahadurkarki,Bikash Karki,91953421.0,,001.01.09,"Boudha, Kathmandu, Nepal",...,,Bhagirath Bahadur Karki,Lhamu Tamang,YZ567A,Male,"Nepalgunj, Banke, Nepal",2024-05-30 15:12:55.445465,Pema Tamang,3253.0,9012345678
9,42-08-02614125,34.0,1999-10-10,"layout1, layout2, layout3, layout4, layout5",nishabhandari 1999-10-10 suryabahadurbhandari,Nisha Bhandari,10203847.0,A-,001.01.10,"Kalanki, Kathmandu, Nepal",...,272-28301-42325/881177,Surya Bahadur Bhandari,Lila KC,BC890D,Female,"Dharan, Sunsari, Nepal",2024-05-30 15:12:55.493584,Manish KC,3254.0,123456789
