In [2]:
import pandas as pd
from fuzzywuzzy import process
import re

# Function to remove extra spaces and newlines
def remove_extra_spaces(text):
    # Replace consecutive spaces and newlines with a single space
    return re.sub(r'\s+', ' ', text.strip())

# Function to normalize text by converting to lowercase and removing non-alphanumeric characters
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Example dataset_texts
dataset_texts = ["shf", "settlement", "fidusia", "pinalty", "umk3"]

# Mapping dataset_texts to numbers
text_to_number_mapping = {text: i+1 for i, text in enumerate(dataset_texts)}
text_to_number_mapping[""] = 0  # Placeholder for cases with low similarity score

# Load the DataFrame
df = pd.read_csv('dummy.csv')

# Preprocess the Deskripsi column
df['Deskripsi'] = df['Deskripsi'].apply(remove_extra_spaces)
df['Deskripsi'] = df['Deskripsi'].apply(normalize_text)

# Process each sentence
closest_words = []
closest_words_num = []
for sentence in df['Deskripsi']:
    # Split normalized sentence into words
    words = sentence.split()
    
    # Initialize variables to store the highest similarity score and the closest match
    highest_similarity_score = 0
    closest_match = None
    
    # Process each word in the sentence
    for word in words:
        # Using process.extractOne to find the closest match for each word
        closest_match_for_word, similarity_score = process.extractOne(word, dataset_texts)
        
        # Update highest_similarity_score and closest_match
        if similarity_score > highest_similarity_score:
            highest_similarity_score = similarity_score
            closest_match = closest_match_for_word
    
    # Map closest_match to a number based on the text_to_number_mapping
    closest_match_number = text_to_number_mapping.get(closest_match, 0)
    
    closest_words.append(closest_match)
    closest_words_num.append(closest_match_number)

# Add the closest_words and closest_words_num to the DataFrame
df['closest_words'] = closest_words
df['closest_words_num'] = closest_words_num

# Save the DataFrame to a CSV file
df.to_csv('processed_data.csv', index=False)

In [3]:
df

Unnamed: 0,Deskripsi,Nominal,Verifikasi,closest_words,closest_words_num
0,pembayaran shf,200,SHF,shf,1
1,pembayaran s h f,200,SHF,shf,1
2,pembyaran s h f,200,SHF,shf,1
3,abcabcabc,200,SHF,fidusia,3
4,settlement cabang jabotabek,250,BUF/BUP,settlement,2
5,setlement,300,BUF/BUP,settlement,2
6,setlement,500,BUF/BUP,settlement,2
7,setelment,1000000,BUF/BUP,settlement,2
8,fiducia,2000000,fidusia,fidusia,3
9,fiducia,2001000,fidusia,fidusia,3
