In [2]:
import pandas as pd
from fuzzywuzzy import process
import re

# Function to remove extra spaces and newlines
def remove_extra_spaces(text):
    """Return *text* with surrounding whitespace removed and inner runs collapsed.

    Every run of whitespace characters (spaces, tabs, newlines, ...) between
    tokens becomes exactly one space.
    """
    # split() with no argument discards leading/trailing whitespace and treats
    # any run of whitespace as a single separator
    tokens = text.split()
    return ' '.join(tokens)

# Function to normalize text by converting to lowercase and removing non-alphanumeric characters
def normalize_text(text):
    """Lowercase *text* and strip everything except a-z, 0-9 and whitespace.

    Whitespace is kept as-is so the result can still be split into words.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Function to handle missing descriptions by replacing them with 0
def handle_missing_descriptions(text, placeholder=0):
    """Return *text* unchanged, or *placeholder* when *text* is falsy.

    Falsy covers the empty string and ``None``; the default placeholder is 0.
    """
    if not text:
        return placeholder
    return text

# Known verification categories that each description is matched against
dataset_texts = ["shf", "settlement", "fidusia", "pinalty", "umk3"]

# Map every category to a 1-based code. The empty string maps to 0 and is the
# code used when no word of a description is similar enough to any category.
text_to_number_mapping = {text: i for i, text in enumerate(dataset_texts, start=1)}
text_to_number_mapping[""] = 0

# Minimum fuzzy-match score (0-100) a word must reach for its category to be
# accepted; anything weaker is treated as "no match" and gets code 0.
# NOTE(review): 80 is a starting value - tune it against real descriptions.
MIN_SIMILARITY_SCORE = 80

# Nominal value attached to every example row
DEFAULT_NOMINAL = 100

# Example sentences
sentences = [
    "Pembayaran SHF",
    "Pembayaran S    H F",
    "Pembyaran S                                    H                        F",
    "settlement cabang jabotabek",
    "sett      tle ment telah dilakukan",
    "pembayaran setlement",
    "melakukan setelment",
    "pembayaran fiducia",
    "fiducia dari bu ima",
    ""  # Example of missing description
]

# Classify each sentence by the best-scoring word it contains
processed_data = []
for sentence in sentences:
    # Preprocess the sentence: remove extra spaces and normalize text
    cleaned_sentence = remove_extra_spaces(sentence)
    normalized_sentence = normalize_text(cleaned_sentence)

    # A missing (empty) description yields no words, so the loop below is
    # skipped and the sentence falls through to the placeholder code 0
    words = normalized_sentence.split()

    # Best score seen so far for this sentence and the category that produced it
    highest_similarity_score = 0
    closest_match = None

    for word in words:
        # extractOne returns the best (category, score) pair for this word
        closest_match_for_word, similarity_score = process.extractOne(word, dataset_texts)

        # Strict '>' keeps the earliest word on ties
        if similarity_score > highest_similarity_score:
            highest_similarity_score = similarity_score
            closest_match = closest_match_for_word

    # Reject weak matches: without this cut-off every non-empty description
    # would be forced onto its nearest category no matter how dissimilar
    if highest_similarity_score < MIN_SIMILARITY_SCORE:
        closest_match = ""

    # Translate the accepted category (or the "" placeholder) to its code
    closest_match_number = text_to_number_mapping.get(closest_match, 0)

    processed_data.append([sentence, DEFAULT_NOMINAL, closest_match_number])

# Convert processed_data to a DataFrame
df = pd.DataFrame(processed_data, columns=['Deskripsi', 'Nominal', 'Verifikasi'])

df

Unnamed: 0,Deskripsi,Nominal,Verifikasi
0,Pembayaran SHF,100,1
1,Pembayaran S H F,100,1
2,Pembyaran S ...,100,1
3,settlement cabang jabotabek,100,2
4,sett tle ment telah dilakukan,100,2
5,pembayaran setlement,100,2
6,melakukan setelment,100,2
7,pembayaran fiducia,100,3
8,fiducia dari bu ima,100,3
9,,100,0
