# In [None]:
import pandas as pd
import re
from difflib import SequenceMatcher

# Function to remove non-ASCII characters from a string
def remove_non_ascii(text):
    """Return *text* with every run of non-ASCII characters deleted.

    Characters in the range 0x00-0x7F are kept in their original order;
    anything outside that range is dropped without a replacement character.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def _has_pincode(parts):
    """Return True if any token in *parts* is made of exactly six digits."""
    return any(part.isdigit() and len(part) == 6 for part in parts)


def match_address(test_address, num_matches=1, similarity_threshold=0.9,
                  candidates=None):
    """Find the known addresses that most closely resemble *test_address*.

    Both addresses are split on whitespace. For every token of the test
    address the best ``SequenceMatcher`` ratio against any token of the
    candidate is taken, and the candidate's score is the average of those
    best ratios. Candidates scoring at least *similarity_threshold* are
    ranked by score (highest first). A ranked candidate is only returned if
    it contains a 6-digit token or shares at least one exact token with the
    test address.

    Args:
        test_address: Address string to look up. It must contain a
            whitespace-separated token of exactly six digits.
        num_matches: Maximum number of addresses to return.
        similarity_threshold: Minimum average token ratio (0.0-1.0) a
            candidate needs to be considered.
        candidates: Optional iterable of address strings to search. When
            omitted, the ``address_column`` column of the module-level
            ``train_data`` DataFrame is used. Entries that are not strings
            (for example NaN for a missing address) are skipped.

    Returns:
        A list of up to *num_matches* address strings, best match first, or
        ``None`` when the test address has no 6-digit token or no candidate
        qualifies.
    """
    # Reversed order is kept so the per-token ratios are summed in the same
    # order as before and the floating-point average is unchanged.
    test_parts = test_address.split()[::-1]

    if not _has_pincode(test_parts):
        return None

    if candidates is None:
        candidates = train_data[address_column]

    # SequenceMatcher caches its analysis of the second sequence, so build one
    # matcher per test token and only swap the first sequence inside the loop.
    matchers = [SequenceMatcher(None, "", part) for part in test_parts]
    test_tokens = set(test_parts)

    scored = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue  # missing / non-text address cannot be compared

        train_parts = candidate.split()

        total = 0
        for matcher in matchers:
            best_ratio = 0
            for train_part in train_parts:
                matcher.set_seq1(train_part)
                best_ratio = max(best_ratio, matcher.ratio())
            total += best_ratio

        ratio = total / len(test_parts)
        if ratio >= similarity_threshold:
            scored.append((ratio, candidate))

    # Highest ratio first; equal ratios fall back to the address text.
    scored.sort(reverse=True)

    # Apply the pincode / shared-token check to every candidate individually,
    # so an address that fails the check is never part of the result.
    accepted = []
    for _, candidate in scored:
        candidate_parts = candidate.split()
        if _has_pincode(candidate_parts) or not test_tokens.isdisjoint(candidate_parts):
            accepted.append(candidate)

    if not accepted:
        return None
    return accepted[:num_matches]



# Paths to the train and validation CSV files
train_csv_path = '/content/drive/MyDrive/Capstone/tenk_data_post_shuffled.csv'
validation_csv_path = '/content/drive/MyDrive/Capstone/tenk_data_post_shuffled_valid.csv'

# Name of the column that holds the address text in the CSV files
address_column = 'address'

# Load the train dataset and strip non-ASCII characters from the addresses.
# pandas reads an empty cell as NaN (a float), which remove_non_ascii cannot
# handle, so missing addresses are replaced with empty strings first.
train_data = pd.read_csv(train_csv_path)
train_data[address_column] = train_data[address_column].fillna('').apply(remove_non_ascii)

# Load the validation dataset and clean its address column the same way
validation_data = pd.read_csv(validation_csv_path)
validation_data[address_column] = validation_data[address_column].fillna('').apply(remove_non_ascii)

# Try the address matching function on a few sample addresses
test_data = [
    "A-203 reshma regency  behind jmi subhash chandra bose lane i.e.suryapet suryapet 508214",
    "3rd flat shiv dartshan society  near gurukul behind bhonsla military school vittal mallya road hinotia vidisha 464258",
    "rajyog society. flat. 19  shastri chowk  alandi road  bhosari  pune-411039  411039",
    # Ends in a 7-digit number and has no 6-digit token, so match_address returns None
    "room no.3 anand nagar malhar road behind pratyush hall kurad 4220000",
    # The trailing comma matters: without it, a string added below would be
    # silently concatenated onto this one instead of becoming a new entry.
    "murdanda bijapur(cgh) 494447 4a vineyard society indira chowk  off law college road",
    # Add more test addresses as needed
]

# Print the closest match (if any) for each test address
for test_address in test_data:
    closest_matches = match_address(test_address, num_matches=1, similarity_threshold=0.9)
    print(f"Test Address: {test_address}")
    if closest_matches:
        print("Closest Matches:")
        for match in closest_matches:
            print(match)
    else:
        # match_address returns None when nothing qualifies
        print("Closest Matches: None")
    print()
