In [58]:
import re
from difflib import SequenceMatcher

# Function to normalize address
def normalize_address(address):
    address = re.sub(r'\b(Marg|Lane|Township)\b', '', address, flags=re.IGNORECASE)
    address = re.sub(r'\W+', ' ', address)
    return address.strip().lower()


In [59]:
# Function to check exact letter match
def exact_letter_match(name1, name2):
    return name1.lower() == name2.lower()


In [60]:
# Function to check abbreviated name match
def abbreviated_name_match(name1, name2):
    name1_parts = name1.split()
    name2_parts = name2.split()
    if len(name1_parts) == 2 and len(name2_parts) == 2:
        return (name1_parts[0][0].lower() == name2_parts[0][0].lower() and
                name1_parts[1].lower() == name2_parts[1].lower())
    return False


In [61]:
# Function to check ignoring middle names match
def ignore_middle_names(name1, name2):
    parts1 = name1.split()
    parts2 = name2.split()
    if len(parts1) == 2 and len(parts2) == 3:
        return parts1[0].lower() == parts2[0].lower() and parts1[1].lower() == parts2[2].lower()
    if len(parts1) == 3 and len(parts2) == 2:
        return parts1[0].lower() == parts2[0].lower() and parts1[2].lower() == parts2[1].lower()
    return False


In [62]:
# Function to check matching any part of the name
def match_any_part(name1, name2):
    parts1 = name1.lower().split()
    parts2 = name2.lower().split()
    return any(part in parts2 for part in parts1) or any(part in parts1 for part in parts2)


In [63]:
# Function to check circular match
def circular_match(name1, name2):
    return set(name1.lower().split()) == set(name2.lower().split())


In [64]:
# Function to check single letter abbreviation match
def single_letter_abbreviation(name1, name2):
    parts1 = name1.split()
    parts2 = name2.split()
    if len(parts1) == 2 and len(parts2) == 2:
        return (parts1[0][0].lower() == parts2[0][0].lower() and
                parts1[1].lower() == parts2[1].lower())
    return False


In [65]:
# Function to match names based on various rules
def name_match(input_name, extracted_name):
    match_score = 0
    if exact_letter_match(input_name, extracted_name):
        match_score += 20
    if abbreviated_name_match(input_name, extracted_name):
        match_score += 20
    if ignore_middle_names(input_name, extracted_name):
        match_score += 20
    if match_any_part(input_name, extracted_name):
        match_score += 20
    if circular_match(input_name, extracted_name):
        match_score += 20
    if single_letter_abbreviation(input_name, extracted_name):
        match_score += 20
    return min(match_score, 100)


In [66]:
# Function to calculate similarity ratio
def similarity_ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()



In [67]:
# Functions to match specific address components
def house_flat_match(input_address, extracted_address):
    # Extract house/flat number from addresses
    house_flat_input = re.search(r'\b\d+[a-zA-Z]?\b', input_address)
    house_flat_extracted = re.search(r'\b\d+[a-zA-Z]?\b', extracted_address)
    if house_flat_input and house_flat_extracted:
        return similarity_ratio(house_flat_input.group(), house_flat_extracted.group()) * 100
    return 0


In [68]:
def street_road_match(input_address, extracted_address):
    # Extract street/road name from addresses
    input_address = normalize_address(input_address)
    extracted_address = normalize_address(extracted_address)
    input_street = ' '.join([word for word in input_address.split() if not word.isdigit()])
    extracted_street = ' '.join([word for word in extracted_address.split() if not word.isdigit()])
    return similarity_ratio(input_street, extracted_street) * 100


In [69]:
def city_match(input_address, extracted_address):
    # Extract city from addresses
    city_input = re.search(r'\b[a-zA-Z]+\b', input_address, re.IGNORECASE)
    city_extracted = re.search(r'\b[a-zA-Z]+\b', extracted_address, re.IGNORECASE)
    if city_input and city_extracted:
        return similarity_ratio(city_input.group(), city_extracted.group()) * 100
    return 0


In [70]:
def floor_number_match(input_address, extracted_address):
    # Extract floor number from addresses
    floor_input = re.search(r'\b\d+[a-zA-Z]*\b', input_address)
    floor_extracted = re.search(r'\b\d+[a-zA-Z]*\b', extracted_address)
    if floor_input and floor_extracted:
        return similarity_ratio(floor_input.group(), floor_extracted.group()) * 100
    return 0


In [71]:
def pincode_match(input_address, extracted_address):
    # Extract pin code from addresses
    pincode_input = re.search(r'\b\d{6}\b', input_address)
    pincode_extracted = re.search(r'\b\d{6}\b', extracted_address)
    if pincode_input and pincode_extracted:
        return similarity_ratio(pincode_input.group(), pincode_extracted.group()) * 100
    return 0


In [72]:
def premise_building_match(input_address, extracted_address):
    # Extract premise/building from addresses
    premise_input = re.search(r'\b[a-zA-Z]+\b', input_address, re.IGNORECASE)
    premise_extracted = re.search(r'\b[a-zA-Z]+\b', extracted_address, re.IGNORECASE)
    if premise_input and premise_extracted:
        return similarity_ratio(premise_input.group(), premise_extracted.group()) * 100
    return 0


In [73]:
def landmark_match(input_address, extracted_address):
    # Extract landmark from addresses if any (e.g., near, beside)
    landmark_input = re.search(r'\bnear\b.*', input_address, re.IGNORECASE)
    landmark_extracted = re.search(r'\bnear\b.*', extracted_address, re.IGNORECASE)
    if landmark_input and landmark_extracted:
        return similarity_ratio(landmark_input.group(), landmark_extracted.group()) * 100
    return 0


In [74]:
def state_match(input_address, extracted_address):
    # Extract state from addresses
    state_input = re.search(r'\b[a-zA-Z]+\b', input_address, re.IGNORECASE)
    state_extracted = re.search(r'\b[a-zA-Z]+\b', extracted_address, re.IGNORECASE)
    if state_input and state_extracted:
        return similarity_ratio(state_input.group(), state_extracted.group()) * 100
    return 0


In [75]:
# Function to match addresses based on normalization and field-specific matching

def address_match(input_address, extracted_address, cutoff=70):
    # Calculate individual component scores
    house_flat_score = house_flat_match(input_address, extracted_address)
    street_road_score = street_road_match(input_address, extracted_address)
    city_score = city_match(input_address, extracted_address)
    floor_number_score = floor_number_match(input_address, extracted_address)
    pincode_score = pincode_match(input_address, extracted_address)
    premise_building_score = premise_building_match(input_address, extracted_address)
    landmark_score = landmark_match(input_address, extracted_address)
    state_score = state_match(input_address, extracted_address)

    # Assign weights to each component (example weights, adjust as needed)
    weights = {
        "house_flat": 0.15,
        "street_road": 0.15,
        "city": 0.15,
        "floor_number": 0.10,
        "pincode": 0.20,
        "premise_building": 0.10,
        "landmark": 0.10,
        "state": 0.05
    }

    # Compute weighted average of scores
    total_score = (
        (house_flat_score or 0) * weights["house_flat"] +
        (street_road_score or 0) * weights["street_road"] +
        (city_score or 0) * weights["city"] +
        (floor_number_score or 0) * weights["floor_number"] +
        (pincode_score or 0) * weights["pincode"] +
        (premise_building_score or 0) * weights["premise_building"] +
        (landmark_score or 0) * weights["landmark"] +
        (state_score or 0) * weights["state"]
    )

    # Normalize to 100
    normalized_score = total_score * 100

    # Return the score only if it meets the cutoff
    return normalized_score if normalized_score >= cutoff else 0


In [76]:
# Weighted sum of the individual scores
    
def calculate_final_score(house_flat_score, street_road_score, city_score, floor_number_score, 
                          pincode_score, premise_building_score, landmark_score, state_score, cutoff):
    final_score = (
        house_flat_score * 0.15 + 
        street_road_score * 0.15 + 
        city_score * 0.10 + 
        floor_number_score * 0.10 + 
        pincode_score * 0.15 + 
        premise_building_score * 0.10 + 
        landmark_score * 0.10 + 
        state_score * 0.15
    )

    if final_score >= cutoff:
        return final_score
    return final_score



In [77]:
# Function to check exact match for UID
def uid_match(input_uid, extracted_uid):
    return 100 if input_uid == extracted_uid else 0


In [78]:
# Function to match names based on various rules
def name_match(input_name, extracted_name):
    match_score = 0
    if exact_letter_match(input_name, extracted_name):
        match_score += 20
    if abbreviated_name_match(input_name, extracted_name):
        match_score += 20
    if ignore_middle_names(input_name, extracted_name):
        match_score += 20
    if match_any_part(input_name, extracted_name):
        match_score += 20
    if circular_match(input_name, extracted_name):
        match_score += 20
    if single_letter_abbreviation(input_name, extracted_name):
        match_score += 20
    return min(match_score, 100)


In [79]:
# Function to evaluate overall match

def overall_match(input_name, extracted_name, input_address, extracted_address, input_uid, extracted_uid):
    # Compute scores with fallback to 0 for None
    name_score = name_match(input_name, extracted_name) or 0
    address_score = address_match(input_address, extracted_address) or 0
    uid_score = uid_match(input_uid, extracted_uid) or 0

    # Calculate the weighted overall score
    overall_score = (name_score * 0.4) + (address_score * 0.4) + (uid_score * 0.2)  # 40% Name, 40% Address, 20% UID
    return overall_score



In [80]:
import pandas as pd

file_path = "/Users/admin/Documents/My_projects/adar/dataset/input file.xlsx"
df = pd.read_excel(file_path)
print(df.columns)  # Print column names to verify



Index(['SrNo', 'House Flat Number', 'House Flat Number Match Score', 'Town',
       'Street Road Name', 'Street Road Name Match Score',
       'Street Road Name Match Score.1', 'City', 'City Match Score',
       ' Floor Number', 'Floor Number Match Score', 'Country', 'PINCODE',
       'PINCODE Match Score', 'Premise Building Name',
       'Premise Building Name Match Score', 'Landmark', 'Landmark Match Score',
       'State', 'State Match Score', 'Name', 'Name extracted from OVD',
       'Name match percentage', 'Name Match Score', 'UID',
       'UID Extracted From OVD', 'UID Match Score',
       'Address Extracted From OVD', 'Final Address Match',
       'Final Address Match Score', 'Overall Match', 'Final Remarks',
       'Document Type'],
      dtype='object')


In [81]:
import pandas as pd

def run_tests_from_excel(file_path):
    # Load the Excel file
    df = pd.read_excel(file_path)
    
    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')  # Standardize column names

    # Check available columns
    print("Available columns:", df.columns)
    
    # Ensure required columns exist
    required_columns = ['name', 'uid', 'house_flat_number', 'street_road_name', 'city', 'state', 'pincode']
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}")
    
    # Create 'address' column by converting all elements to strings
    df['address'] = df[['house_flat_number', 'street_road_name', 'city', 'state', 'pincode']].fillna('').astype(str).agg(', '.join, axis=1)

    # Iterate through each row of the DataFrame
    for index, row in df.iterrows():
        input_name = row['name']
        extracted_name = row['name']  # Assuming same column for simplicity
        input_address = row['address']
        extracted_address = row['address']  # Assuming same column for simplicity
        input_uid = row['uid']
        extracted_uid = row['uid']  # Assuming same column for simplicity
        
        # Run your test functions (replace with actual function definitions)
        print(f"Testing: {input_name} vs {extracted_name}, {input_address} vs {extracted_address}, {input_uid} vs {extracted_uid}")
        print(f"Name Match Score: {name_match(input_name, extracted_name)}")
        print(f"Address Match Score: {address_match(input_address, extracted_address)}")
        print(f"UID Match Score: {uid_match(input_uid, extracted_uid)}")

# Example usage
file_path = "/Users/admin/Documents/My_projects/adar/dataset/input file.xlsx"
run_tests_from_excel(file_path)


Available columns: Index(['srno', 'house_flat_number', 'house_flat_number_match_score', 'town',
       'street_road_name', 'street_road_name_match_score',
       'street_road_name_match_score.1', 'city', 'city_match_score',
       'floor_number', 'floor_number_match_score', 'country', 'pincode',
       'pincode_match_score', 'premise_building_name',
       'premise_building_name_match_score', 'landmark', 'landmark_match_score',
       'state', 'state_match_score', 'name', 'name_extracted_from_ovd',
       'name_match_percentage', 'name_match_score', 'uid',
       'uid_extracted_from_ovd', 'uid_match_score',
       'address_extracted_from_ovd', 'final_address_match',
       'final_address_match_score', 'overall_match', 'final_remarks',
       'document_type'],
      dtype='object')
Testing: Nitish Kumar Sharma vs Nitish Kumar Sharma, Flat 404, Buddha Nagar, Noida, Uttarpradesh, 201301 vs Flat 404, Buddha Nagar, Noida, Uttarpradesh, 201301, 424831815689 vs 424831815689
Name Match Score: 