### Changelog

- Need to create dynamic data loading/preprocessing/saving (for any number of files)

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance
from datetime import datetime 
import os

### Loading Dataset 

In [3]:
def load_df_from_dir(dir_path):
    """Load every ``.csv`` file directly inside *dir_path* into a DataFrame.

    Args:
        dir_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of DataFrames, one per CSV file, ordered by file name.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting list (and anything that depends on its order) reproducible.
    csv_files = sorted(f for f in os.listdir(dir_path) if f.endswith(".csv"))

    # NOTE: a per-file 'source' column was once derived here via
    # os.path.splitext(csv_file)[0]; that step is currently disabled, so the
    # frames carry only the columns present in the CSV files themselves.
    return [pd.read_csv(os.path.join(dir_path, csv_file)) for csv_file in csv_files]

In [4]:
# Read every CSV found in the 'data' directory into a list of DataFrames.
layouts = load_df_from_dir('data')

### Data Preprocessing

In [6]:
# Stamp every loaded layout with the current time, in place.
# datetime.now() is evaluated once per layout, so the stamps of different
# layouts can differ slightly.
for layout in layouts:
    layout['last_modified_date'] = datetime.now()

def sanitize(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Per cell:
      * strings lose every comma and space, then any remaining
        leading/trailing whitespace;
      * missing values (as judged by ``pd.isna``) become ``''``;
      * anything else is passed through unchanged.
    """
    def _clean(value):
        if isinstance(value, str):
            return value.replace(',', '').replace(' ', '').strip()
        return '' if pd.isna(value) else value

    # DataFrame.map only exists in pandas >= 2.1; older releases expose the
    # same element-wise operation under the name DataFrame.applymap.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    return elementwise(df, _clean)
    
def create_soup(df, df_, soup, soup_name):
    """Write a lower-cased, space-joined text column into *df*.

    Args:
        df: Frame that receives the new column (modified in place).
        df_: Frame the text is built from.
        soup: Column labels of *df_* whose values are joined, in this order.
        soup_name: Name of the column created (or overwritten) in *df*.
    """
    def _row_to_text(row):
        # Stringify the whole row at once, then glue the pieces together.
        pieces = row.values.astype(str)
        return ' '.join(pieces).lower()

    selected = df_[soup]
    df[soup_name] = selected.apply(_row_to_text, axis=1)

# Columns whose values are concatenated into each row's comparison string.
soup = ['Name', 'Date of Birth', 'Father_Name']

# Sanitising happens on copies; the original layouts only gain a 'soup' column.
layout_copies = [layout.copy() for layout in layouts]

for layout, layout_copy in zip(layouts, layout_copies):
    layout_copy = sanitize(layout_copy)
    create_soup(layout, layout_copy, soup, "soup")

### Entity Matching

In [7]:
def combine_layouts(A, B, metric='cosine', threshold=0.8):
    """Merge layout *B* into layout *A* by fuzzy-matching their 'soup' columns.

    Every row of A is compared with every row of B. If A's best match in B
    passes the threshold, the two rows are merged: A's values win, gaps are
    filled from B, 'source' is joined from both rows, and
    'last_modified_date' is refreshed. Otherwise the A row is kept as is.
    Rows of B that were not selected as a match are appended at the end.
    Several A rows may select the same B row.

    Args:
        A: Base DataFrame; must contain a 'soup' column.
        B: DataFrame to fold into A; must contain a 'soup' column.
        metric: 'cosine' compares TF-IDF vectors of the soups; any other
            value uses the Levenshtein edit distance between the soups.
        threshold: For 'cosine', the best similarity must be strictly
            greater than this. Otherwise the smallest edit distance must be
            less than or equal to this. Edit distances are whole numbers, so
            the default 0.8 only accepts identical soups.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Columns are A's columns
        followed by the columns that only B has.
    """
    # Deterministic column layout: A's order first, then B-only columns.
    ordered_columns = list(dict.fromkeys([*A.columns, *B.columns]))

    # With nothing to compare there is nothing to match; the vectoriser and
    # idxmax/idxmin cannot work on empty input, so short-circuit here.
    if A.empty or B.empty:
        populated = B if A.empty else A
        return populated.reindex(columns=ordered_columns).reset_index(drop=True)

    def calculate_similarity(A, B, metric):
        """Return (label of the best B match per A row, mask of accepted matches)."""
        if metric == 'cosine':
            tfidf = TfidfVectorizer(stop_words='english')
            # Fit on both soups so A and B are projected onto one vocabulary.
            combined_soup = pd.concat([A['soup'], B['soup']], ignore_index=True)
            tfidf.fit(combined_soup)
            tfidf_matrix_A = tfidf.transform(A['soup'])
            tfidf_matrix_B = tfidf.transform(B['soup'])
            similarity = cosine_similarity(tfidf_matrix_A, tfidf_matrix_B)
            similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)
            idx_row = similarity_df.idxmax(axis=1)
            similarity_mask = similarity_df.max(axis=1) > threshold
        else:
            distance_matrix = pd.DataFrame(
                [[distance(a, b) for b in B['soup']] for a in A['soup']],
                index=A.index,
                columns=B.index,
            )
            idx_row = distance_matrix.idxmin(axis=1)
            similarity_mask = distance_matrix.min(axis=1) <= threshold
        return idx_row, similarity_mask

    def merge_data(A, B, idx_row, similarity_mask):
        """Build the merged frame from the match decisions."""
        row_frames = []
        for idx_A in A.index:
            row_a = A.loc[idx_A]
            if similarity_mask.loc[idx_A]:
                row_b = B.loc[idx_row.loc[idx_A]]
                combined_row = row_a.combine_first(row_b)
                # Join whichever 'source' values exist; a layout without
                # that column must not abort the whole merge.
                sources = [f"{row['source']}" for row in (row_a, row_b) if 'source' in row.index]
                if sources:
                    combined_row['source'] = ', '.join(sources)
                combined_row['last_modified_date'] = datetime.now()
            else:
                combined_row = row_a
            row_frames.append(combined_row.to_frame().T)

        # One concat for all rows instead of re-copying the frame per row.
        combined_data = pd.concat(row_frames, ignore_index=True)
        # Keep columns that were only introduced while merging (for example
        # 'last_modified_date' when neither input carried it).
        extra_columns = [c for c in combined_data.columns if c not in ordered_columns]
        combined_data = combined_data.reindex(columns=ordered_columns + extra_columns)

        new_records = B.loc[~B.index.isin(idx_row[similarity_mask].values)]
        if new_records.empty:
            return combined_data
        return pd.concat([combined_data, new_records], ignore_index=True)

    idx_row, similarity_mask = calculate_similarity(A, B, metric)
    return merge_data(A, B, idx_row, similarity_mask)


### Saving Results

#### Save final result only

In [10]:
def char_to_digit(char):
    """Map a single character to a digit in the range 0-9.

    * Decimal digits map to their numeric value.
    * Letters map to their 1-based offset from 'a' (case-insensitive),
      modulo 10, so 'a' -> 1, 'j' -> 0, 'z' -> 6.
    * Everything else maps to 0.
    """
    # isdecimal(), not isdigit(): isdigit() is also true for characters such
    # as '²' that int() cannot parse, which would raise ValueError.
    if char.isdecimal():
        return int(char)
    elif char.isalpha():
        return (ord(char.lower()) - ord('a') + 1) % 10
    else:
        return 0

def string_to_digits(s):
    """Encode *s* as a digit string of exactly 13 characters.

    Each character is translated with ``char_to_digit``; the result is cut
    to its first 13 digits or right-padded with '0' up to 13.
    """
    width = 13
    encoded = ''.join(str(char_to_digit(symbol)) for symbol in s)
    # Slicing is a no-op for short strings and padding is a no-op for long
    # ones, so the two steps cover both cases.
    return encoded[:width].ljust(width, '0')

In [13]:
def save_layouts(layouts, save_path='.'):
    """Merge all layouts into one frame, add a 'uuid' column and save it.

    The first layout is the base; each following layout is folded in with
    ``combine_layouts``. The 'uuid' column is derived from each row's 'soup'
    via ``string_to_digits``.

    Args:
        layouts: Non-empty list of DataFrames, each with a 'soup' column.
        save_path: Directory that receives ``final_result.csv``; created if
            missing. Defaults to the current working directory.

    Returns:
        The merged DataFrame that was written to disk.

    Raises:
        IndexError: If *layouts* is empty.
    """
    # Copy so that adding 'uuid' never mutates the caller's first layout
    # (which would otherwise happen when only one layout is given).
    final_df = layouts[0].copy()

    for df in layouts[1:]:
        final_df = combine_layouts(final_df, df)

    final_df['uuid'] = final_df['soup'].apply(string_to_digits)

    output_dir = save_path or '.'
    os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(os.path.join(output_dir, 'final_result.csv'), index=False)
    return final_df

In [14]:
# save_layouts takes the output location as its second argument; pass the
# current directory explicitly so the call matches the signature.
final_df = save_layouts(layouts, '.')

### Save Each DataFrame in the List to CSV

In [None]:
# import os
# import pandas as pd

# def save_dfs_to_csv(layouts, output_dir):

#     os.makedirs(output_dir, exist_ok=True)
    
#     for i, df in enumerate(layouts, start=1):
#         file_name = f"layout{i}.csv"
#         file_path = os.path.join(output_dir, file_name)
#         df.to_csv(file_path, index=False)
#         print(f"Saved {file_path}")


### Different Ways to Save Files

#### Save Intermediate Results

In [8]:
# def save_layouts(layouts, save_path):
#     final_df = layouts[0]
#     results = [final_df] 

#     initial_part = "1"   
#     for i, df in enumerate(layouts[1:], start=2):
#         final_df = combine_layouts(final_df, df)
#         results.append(final_df)
        
#         initial_part += str(i)
#         final_df.to_csv(f"./{save_path}/result{initial_part}.csv", index=False)
    
#     return final_df, results

In [9]:
# final_df, results = save_layouts(layouts, 'results')

#### Save final_result and delete source files if successful

In [None]:
# import os
# import glob

# def save_layouts(layouts, save_path, save_filename):
#     final_df = layouts[0]

#     for df in layouts[1:]:
#         final_df = combine_layouts(final_df, df)
    
#     final_result_path = os.path.join(save_path, save_filename)
    
#     try:
#         final_df.to_csv(final_result_path, index=False)
#     except Exception as e:
#         print(f"Error saving final result: {e}")
#         return None

#     # If save is successful, delete all other files in save_path except final_result
#     files = glob.glob(os.path.join(save_path, '*'))
#     for f in files:
#         if f != final_result_path:
#             os.remove(f)

#     return final_df

# final_df = save_layouts(layouts, 'results', 'final_result.csv')