In [165]:
import pandas as pd
import numpy as np
import itertools
import re
from thefuzz import fuzz

In [166]:
# Load the raw customer list (relative path, resolved against the current working directory)
data = pd.read_excel('Customer_Name.xlsx')
# Print one raw name on its own so its original spacing can be inspected
print(data['Name'][2])
data

Mr.     Ram      Raj


Unnamed: 0,Name,Gender
0,Claus$adam,F
1,$/Klaus adam,F
2,Mr. Ram Raj,M
3,CA RamRaj,M
4,Ram-Raj,M
5,Ram Raj,M
6,Raam N Raaj,M
7,Raam Raaj,M
8,Rohit Khatri,M
9,Mohit Yadav,M


In [167]:
# Convert the names to a standard form

# Convert the names to lower case
data['Name'] = data['Name'].str.lower()

# Replace each run of non-letter characters with a single space
# [ '$$$$alex hail' -> ' alex hail', 'alex- $hail' -> 'alex hail' ]
def replace_non_alphabetic_characters(name):
    """Replace every run of non-letter characters in ``name`` with one space.

    Both lower- and upper-case ASCII letters are kept, so the result no longer
    depends on the name having been lower-cased first (for lower-case input
    the output is unchanged). Characters outside a-z / A-Z, including accented
    letters, are treated as separators.

    Parameters
    ----------
    name : str
        The name to clean.

    Returns
    -------
    str
        The cleaned name. A leading or trailing run of separators becomes a
        single space, so callers that need a trimmed value must strip it.
    """
    # '+' collapses consecutive separators into one space instead of several.
    return re.sub(r'[^a-zA-Z]+', ' ', name)

# Apply the cleaner to every name in the column
data['Name'] = data['Name'].apply(replace_non_alphabetic_characters)


# removing unnecessary spaces at the start and end of the name
data['Name'] = data['Name'].str.strip()
# Print the same row as in the loading cell to show the effect of the cleaning
print(data['Name'][2])
data

mr ram raj


Unnamed: 0,Name,Gender
0,claus adam,F
1,klaus adam,F
2,mr ram raj,M
3,ca ramraj,M
4,ram raj,M
5,ram raj,M
6,raam n raaj,M
7,raam raaj,M
8,rohit khatri,M
9,mohit yadav,M


In [168]:
# Removing titles from the name

# Function to remove titles from names
def remove_titles(name):
    """Remove honorifics and professional titles from a name.

    The name is split on whitespace and every word that equals one of the
    known titles is dropped. The comparison is case-insensitive, so 'Mr',
    'MR' and 'mr' are all removed; the remaining words keep their original
    casing. Titles followed by punctuation (e.g. 'mr.') are not recognised,
    so punctuation should be stripped beforehand.

    Parameters
    ----------
    name : str
        The name to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces. This is an empty string
        when the name consists only of titles.
    """
    # Common titles to be removed; stored lower-case and matched case-insensitively.
    titles = {"mr", "mrs", "miss", "ms", "dr", "prof", "sir", "shree", "sri",
              "smt", "srimati", "begum", "mam", "madam", "ca"}

    # Keep every word that is not a title.
    cleaned_parts = [part for part in name.split() if part.lower() not in titles]

    return " ".join(cleaned_parts)

# Apply the 'remove_titles' function to the entire "Name" column
# (a name made up only of title words becomes an empty string)
data['Name'] = data['Name'].apply(remove_titles)

data

Unnamed: 0,Name,Gender
0,claus adam,F
1,klaus adam,F
2,ram raj,M
3,ramraj,M
4,ram raj,M
5,ram raj,M
6,raam n raaj,M
7,raam raaj,M
8,rohit khatri,M
9,mohit yadav,M


In [169]:
# Order the rows by the number of words in each name, longest first, so that
# names with more words are processed before shorter ones (the grouping loop
# further down registers the word combinations of whichever unmatched name it sees first).

# Split each name into words and calculate the number of words
data['Name_Word_Count'] = [len(name.split()) for name in data['Name']]

# Sort the rows by the number of words in the name (descending order)
data.sort_values('Name_Word_Count',ascending = False,inplace = True)
# The count was only needed as a sort key, so the helper column is removed again
data.drop('Name_Word_Count', axis = 1, inplace = True)

# sorted_by_words_count
data

Unnamed: 0,Name,Gender
26,anish singh malhotra,M
6,raam n raaj,M
19,raam kumar anand,M
15,aaditya kumar,M
27,anish banerjee,M
1,klaus adam,F
25,anish malhotra,M
24,anisha malhotra,F
23,suresh pawaar,M
22,ramesh pawar,M


In [170]:
# general name format creation

def general_name_format(name):                            # 'jain aum'  -->  'aumjain'
    """Build an order-independent key for a name.

    The name is split on whitespace, its words are sorted alphabetically and
    then concatenated without any separator, so 'ram raj' and 'raj ram' both
    become 'rajram'.

    Parameters
    ----------
    name : str
        A whitespace-separated name.

    Returns
    -------
    str
        The sorted words joined together; '' for an empty or blank name.
    """
    words = name.split()
    words.sort()
    return "".join(words)

In [171]:
# getting all keys with the same value

def get_keys_with_value(input_dict, target_value):
    """Return every key of ``input_dict`` whose value equals ``target_value``.

    Parameters
    ----------
    input_dict : dict
        The dictionary to search.
    target_value : object
        The value to look for (compared with ``==``).

    Returns
    -------
    list
        Matching keys in the dictionary's iteration order; empty if none match.
    """
    return [key for key, value in input_dict.items() if value == target_value]

In [172]:
def name_words_all_combination(name):              # name: Darshan Raval
    """List every combination of two or more words taken from ``name``.

    Words keep their original relative order inside each combination. The
    result is ordered so that the combination containing all words comes
    first and the two-word combinations come last.

    Parameters
    ----------
    name : str
        A whitespace-separated name.

    Returns
    -------
    list of tuple of str
        All combinations of size 2..n; empty when the name has fewer than
        two words.
    """
    words = name.split()
    # Collect sizes 2..n in ascending order, then flip the whole list.
    combos = [
        combo
        for size in range(2, len(words) + 1)
        for combo in itertools.combinations(words, size)
    ]
    return combos[::-1]

def code_dictionary(name,code):
    """Map every 2+-word combination of ``name`` to the group code ``code + 1``.

    Each combination produced by ``name_words_all_combination`` is turned into
    a key by sorting its words alphabetically and concatenating them without
    a separator.

    Parameters
    ----------
    name : str
        A whitespace-separated name.
    code : int
        The last code already in use; the stored value is ``code + 1``.

    Returns
    -------
    dict
        Key string -> ``code + 1``, with the all-words key inserted first.
        Empty when the name has fewer than two words.
    """
    group_code = code + 1
    keyed = {}
    for word_combo in name_words_all_combination(name):
        keyed[''.join(sorted(word_combo))] = group_code
    return keyed

In [173]:
# Group names whose generalized forms are fuzzy-equal: every name in a group
# receives the Gen_Name of the first name that opened the group.
#
# The result is built as a name -> Gen_Name mapping and written to the frame
# with one column assignment. Calling `.replace(..., inplace=True)` on
# `data['Gen_Name']` is a chained in-place update, which pandas warns about and
# which has no effect once Copy-on-Write is active; replacing by value could
# also rewrite rows that were assigned the same string earlier.

name_directory = {}       # lookup key (sorted word combination) -> group code
threshold = 81.90         # fuzz.ratio score that a key must exceed to count as a match
code = 0                  # code of the most recently opened group
canonical_by_code = {}    # group code -> Gen_Name shared by the whole group
gen_name_by_name = {}     # cleaned name -> assigned Gen_Name

# A repeated name resolves to the same group as its first occurrence, so each
# distinct name is processed once, in order of first appearance.
for name in data['Name'].unique():

    gen_name = general_name_format(name)

    # The first registered key (insertion order) that scores above the threshold wins.
    matched_key = next(
        (key for key in name_directory if fuzz.ratio(gen_name, key) > threshold),
        None,
    )

    if matched_key is not None:
        gen_name_by_name[name] = canonical_by_code[name_directory[matched_key]]
        continue

    # No match: open a new group under the next code.
    if len(name.split()) == 1:
        # A single-word name is registered (and reported) as-is.
        group_name = name
        name_directory.update({name: code + 1})
    else:
        # code_dictionary stores `code + 1` for every 2+-word combination.
        group_name = gen_name
        name_directory.update(code_dictionary(name, code))
    code = code + 1
    canonical_by_code[code] = group_name
    gen_name_by_name[name] = group_name

data['Gen_Name'] = data['Name'].map(gen_name_by_name)

In [174]:
# Sort by Gen_Name so that rows sharing the same generalized name sit next to each other
data.sort_values('Gen_Name',inplace = True)
data

Unnamed: 0,Name,Gender,Gen_Name
15,aaditya kumar,M,aadityakumar
13,aditya kumar,M,aadityakumar
20,abhijit mishra,M,abhijithmisra
21,abhijith misra,M,abhijithmisra
1,klaus adam,F,adamklaus
0,claus adam,F,adamklaus
14,aditya jha,M,adityajha
49,alex,M,alexa
50,alexa,F,alexa
41,alexandra,M,alexandria


In [175]:
# Start 'Name Match' as a copy of the cleaned name; duplicate rows are overwritten below
data['Name Match'] = data['Name']

def replace_duplicates_with_first_occurrence(df):
    """Give every row the 'Name Match' of the first row in its group.

    Rows are grouped by the pair ('Gender', 'Gen_Name'). Within each group the
    'Name Match' value of the first row (in the frame's current order) is
    copied to all rows of the group; rows that are alone in their group keep
    their value. Frames without any duplicates are handled as a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Gender', 'Gen_Name' and 'Name Match'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its 'Name Match' column is modified in place.
    """
    # sort=False keeps groups in order of appearance; dropna=False keeps rows
    # with a missing key as a group of their own instead of discarding them.
    groups = df.groupby(['Gender', 'Gen_Name'], sort=False, dropna=False)

    # 'first' broadcasts each group's first non-null 'Name Match' to all its rows.
    df['Name Match'] = groups['Name Match'].transform('first')

    return df

# Fill 'Name Match' in place (the function also returns the frame, which is ignored here)
replace_duplicates_with_first_occurrence(data)
# Drop the two grouping columns so only the name and its match remain
data.drop(['Gender','Gen_Name'],axis=1,inplace = True)
data

Unnamed: 0,Name,Name Match
15,aaditya kumar,aaditya kumar
13,aditya kumar,aaditya kumar
20,abhijit mishra,abhijit mishra
21,abhijith misra,abhijit mishra
1,klaus adam,klaus adam
0,claus adam,klaus adam
14,aditya jha,aditya jha
49,alex,alex
50,alexa,alexa
41,alexandra,alexandra
