In [1]:
from fuzzywuzzy import process
import re

# Whitespace cleanup helper
def remove_extra_spaces(text):
    """Return *text* with the ends trimmed and each whitespace run squeezed to one space.

    Any run of whitespace characters (spaces, tabs, newlines, ...) inside
    the string becomes a single ``' '``; leading and trailing whitespace
    is dropped entirely.
    """
    trimmed = text.strip()
    squeezed = re.sub(r'\s+', ' ', trimmed)
    return squeezed

# Text normalization helper
def normalize_text(text):
    """Return *text* lowercased with every character outside a-z, 0-9 and whitespace removed.

    Removed characters are deleted outright (not replaced by a space), so
    the words on either side of a punctuation mark are joined together.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Reference terms that every sentence is fuzzy-matched against
dataset_texts = ["shf", "settlement", "fidusia", "pinalty", "umk3"]

# Sample inputs: exact, spaced-out and misspelled variants of the reference terms
sentences = [
    "Pembayaran SHF",
    "Pembayaran S    H F",
    "Pembyaran S                                    H                        F",
    "settlement cabang jabotabek",
    "sett      tle ment telah dilakukan",
    "pembayaran setlement",
    "melakukan setelment",
    "pembayaran fiducia",
    "fiducia dari bu ima"
]

# Report the best-scoring reference term for each sentence
for sentence in sentences:
    # Squeeze whitespace first, then lowercase and drop punctuation before tokenizing
    cleaned_sentence = remove_extra_spaces(sentence)
    tokens = normalize_text(cleaned_sentence).split()

    # Score each token on its own; extractOne yields a (choice, score) pair
    candidates = [process.extractOne(token, dataset_texts) for token in tokens]

    # Keep the first candidate with the strictly highest score.
    # A score of 0 never beats the initial value, so the match stays None then.
    closest_match, highest_similarity_score = None, 0
    for candidate, score in candidates:
        if score > highest_similarity_score:
            closest_match, highest_similarity_score = candidate, score

    # Note: the "Preprocessed" line shows the whitespace-cleaned text,
    # not the lowercased/normalized form used for matching.
    report = (
        ("Original Sentence:", sentence),
        ("Preprocessed Sentence:", cleaned_sentence),
        ("Closest Match:", closest_match),
        ("Highest Similarity Score:", highest_similarity_score),
    )
    for label, value in report:
        print(label, value)
    print()


Original Sentence: Pembayaran SHF
Preprocessed Sentence: Pembayaran SHF
Closest Match: shf
Highest Similarity Score: 100

Original Sentence: Pembayaran S    H F
Preprocessed Sentence: Pembayaran S H F
Closest Match: shf
Highest Similarity Score: 90

Original Sentence: Pembyaran S                                    H                        F
Preprocessed Sentence: Pembyaran S H F
Closest Match: shf
Highest Similarity Score: 90

Original Sentence: settlement cabang jabotabek
Preprocessed Sentence: settlement cabang jabotabek
Closest Match: settlement
Highest Similarity Score: 100

Original Sentence: sett      tle ment telah dilakukan
Preprocessed Sentence: sett tle ment telah dilakukan
Closest Match: settlement
Highest Similarity Score: 90

Original Sentence: pembayaran setlement
Preprocessed Sentence: pembayaran setlement
Closest Match: settlement
Highest Similarity Score: 95

Original Sentence: melakukan setelment
Preprocessed Sentence: melakukan setelment
Closest Match: settlement
Hig

