In [2]:
import re
import textdistance

# Collapse whitespace runs (spaces, tabs, newlines) into single spaces
def remove_extra_spaces(text):
    """Return ``text`` trimmed at both ends, with each whitespace run replaced by one space."""
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Function to correct typos using fuzzy string matching
def correct_typos(text, reference_texts, threshold=0.8):
    """Replace ``text`` with its closest reference text, if one is close enough.

    Similarity is ``textdistance.jaccard`` evaluated on the lowercased strings.
    Every reference is scored and the best-scoring one is returned; on ties the
    earliest reference wins. (Returning the first reference that merely clears
    the threshold could pick a worse match than one appearing later in the
    list.)

    Args:
        text: The string to correct.
        reference_texts: Iterable of known-good strings to match against.
        threshold: A reference is only accepted when its similarity is
            strictly greater than this value.

    Returns:
        The best-matching reference (in its original casing), or ``text``
        unchanged when no reference scores above ``threshold``.
    """
    query = text.lower()
    # Starting the running best at the threshold enforces "strictly above".
    best_match, best_score = text, threshold
    for ref_text in reference_texts:
        score = textdistance.jaccard(query, ref_text.lower())
        if score > best_score:
            best_match, best_score = ref_text, score
    return best_match

# Lowercase the text and drop every character that is not a-z, 0-9 or whitespace
def normalize_text(text):
    """Return a lowercased copy of ``text`` keeping only a-z, 0-9 and whitespace."""
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

# Substitute a placeholder for an empty or missing description
def handle_missing_descriptions(text, placeholder="[MISSING]"):
    """Return ``text`` unless it is falsy (e.g. ``""`` or ``None``); then return ``placeholder``."""
    if text:
        return text
    return placeholder

# Example usage: push one description through every cleaning step in order
description = "Pembayaran S    H F"
reference_texts = ["fidusia", "fiducia", "fiducya"]  # candidates for the fuzzy match

cleaned_description = remove_extra_spaces(description)
corrected_description = correct_typos(cleaned_description, reference_texts)
normalized_description = normalize_text(corrected_description)
handled_description = handle_missing_descriptions(normalized_description)

# Report every intermediate result, one "label value" line per step
print("\n".join(
    f"{label} {value}"
    for label, value in (
        ("Original Description:", description),
        ("Cleaned Description:", cleaned_description),
        ("Corrected Description:", corrected_description),
        ("Normalized Description:", normalized_description),
        ("Handled Description:", handled_description),
    )
))

Original Description: Pembayaran S    H F
Cleaned Description: Pembayaran S H F
Corrected Description: Pembayaran S H F
Normalized Description: pembayaran s h f
Handled Description: pembayaran s h f


In [3]:
import pandas as pd
import re
import textdistance

# Function to remove extra spaces and newlines
def remove_extra_spaces(text):
    """Collapse whitespace runs in ``text`` into single spaces and trim both ends.

    Missing values are mapped to an empty string instead of raising, so the
    function can be applied to a DataFrame column that has gaps (a float NaN
    or ``None`` has no ``strip`` method). Any other non-string value is
    converted with ``str`` before cleaning.

    Args:
        text: The raw description; normally a ``str``, but ``None`` and NaN
            are tolerated.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is ``None`` or NaN.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Replace consecutive spaces and newlines with a single space
    return re.sub(r'\s+', ' ', str(text).strip())

# Function to correct typos word by word using fuzzy string matching
def correct_typos(text, dataset_texts, min_similarity=0.8):
    """Replace each word of ``text`` with its closest vocabulary word, if close enough.

    Closeness is ``textdistance.levenshtein.normalized_similarity`` computed on
    the lowercased strings. A word is only replaced when the best candidate
    scores strictly above ``min_similarity``; without such a floor, unrelated
    fragments get rewritten to whatever happens to score highest (for example
    "fi" becoming "F" and "du" becoming "u").

    NOTE(review): when the vocabulary is derived from the very column being
    corrected, every word is expected to match itself (or a case variant) with
    the top score, so the correction would leave those rows essentially
    unchanged. Confirm whether a curated vocabulary is intended.

    Args:
        text: Whitespace-separated text to correct.
        dataset_texts: Iterable of known words to match against.
        min_similarity: Score a candidate must exceed to replace a word.
            Pass ``0.0`` to always take the best candidate with a non-zero
            score.

    Returns:
        str: The words of ``text`` (corrected where a match was accepted)
        joined by single spaces.
    """
    # Lowercase the vocabulary once instead of once per (word, candidate) pair.
    vocabulary = [(candidate, candidate.lower()) for candidate in dataset_texts]

    corrected_words = []
    for word in text.split():
        needle = word.lower()
        # Starting the running best at the floor enforces "strictly above".
        corrected_word, max_similarity = word, min_similarity
        for candidate, candidate_lower in vocabulary:
            similarity = textdistance.levenshtein.normalized_similarity(needle, candidate_lower)
            if similarity > max_similarity:
                corrected_word, max_similarity = candidate, similarity
        corrected_words.append(corrected_word)
    return ' '.join(corrected_words)

# Normalization: lowercase, then keep only a-z, 0-9 and whitespace characters
def normalize_text(text):
    """Lowercase ``text`` and delete every character outside a-z, 0-9 and whitespace."""
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Function to handle missing descriptions by replacing them with a placeholder
def handle_missing_descriptions(text, placeholder="[MISSING]"):
    """Return ``placeholder`` for a missing description, otherwise ``text`` unchanged.

    A description counts as missing when it is falsy (``""``, ``None``),
    consists only of whitespace (which is what remains after punctuation-only
    text such as "! @" has its symbols stripped), or is a float NaN. NaN and
    whitespace-only strings are truthy, so a plain truthiness check would let
    them through.

    Args:
        text: The description to check.
        placeholder: Value returned in place of a missing description.

    Returns:
        ``text`` as given (not stripped) when it has content, else
        ``placeholder``.
    """
    if isinstance(text, str):
        return text if text.strip() else placeholder
    # NaN is the only float that compares unequal to itself.
    if isinstance(text, float) and text != text:
        return placeholder
    return text if text else placeholder

# Load the dataset
df = pd.read_csv('dummy.csv')

# Step 1: collapse stray whitespace in the "Deskripsi" column
df['Deskripsi'] = df['Deskripsi'].apply(remove_extra_spaces)

# Build the vocabulary: every distinct whitespace-separated word of the cleaned column
all_words = ' '.join(df['Deskripsi']).split()
dataset_texts = list(set(all_words))

# Steps 2-4: correct typos against the vocabulary, normalize the text, then
# substitute the placeholder for descriptions that ended up empty
df['Deskripsi'] = (
    df['Deskripsi']
    .apply(lambda x: correct_typos(x, dataset_texts))
    .apply(normalize_text)
    .apply(handle_missing_descriptions)
)

# Step 5: pull out the text feature, the numeric feature and the label as arrays
X_text = df['Deskripsi'].values
X_numeric = df['Nominal'].values
y = df['Verifikasi'].values

# Your remaining code for model training and evaluation goes here


In [6]:
import pandas as pd
import re
import textdistance

# Function to remove extra spaces and newlines
def remove_extra_spaces(text):
    """Collapse whitespace runs in ``text`` into single spaces and trim both ends.

    Missing values are mapped to an empty string instead of raising, so the
    function can be applied to a DataFrame column that has gaps (a float NaN
    or ``None`` has no ``strip`` method). Any other non-string value is
    converted with ``str`` before cleaning.

    Args:
        text: The raw description; normally a ``str``, but ``None`` and NaN
            are tolerated.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is ``None`` or NaN.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Replace consecutive spaces and newlines with a single space
    return re.sub(r'\s+', ' ', str(text).strip())

# Word-by-word fuzzy replacement against a vocabulary of known words
def correct_typos(text, dataset_texts):
    """Swap each word of ``text`` for its closest match in ``dataset_texts``.

    Closeness is ``textdistance.levenshtein.normalized_similarity`` on the
    lowercased strings. The highest-scoring candidate wins and the earliest
    candidate is kept on ties. There is no minimum score: any candidate
    scoring above zero replaces the word, and a word is left untouched only
    when no candidate scores above zero. The resulting words are joined with
    single spaces.
    """
    def closest_match(token):
        # Track the best candidate seen so far; the token itself is the fallback.
        needle = token.lower()
        winner, top_score = token, 0
        for candidate in dataset_texts:
            score = textdistance.levenshtein.normalized_similarity(needle, candidate.lower())
            if score > top_score:
                winner, top_score = candidate, score
        return winner

    return ' '.join(closest_match(token) for token in text.split())

# Normalize text: lowercase it and strip everything except a-z, 0-9 and whitespace
def normalize_text(text):
    """Return ``text`` lowercased with all characters other than a-z, 0-9 and whitespace removed."""
    disallowed = re.compile(r'[^a-z0-9\s]')
    return disallowed.sub('', text.lower())

# Function to handle missing descriptions by replacing them with a placeholder
def handle_missing_descriptions(text, placeholder="[MISSING]"):
    """Return ``placeholder`` for a missing description, otherwise ``text`` unchanged.

    A description counts as missing when it is falsy (``""``, ``None``),
    consists only of whitespace (which is what remains after punctuation-only
    text such as "! @" has its symbols stripped), or is a float NaN. NaN and
    whitespace-only strings are truthy, so a plain truthiness check would let
    them through.

    Args:
        text: The description to check.
        placeholder: Value returned in place of a missing description.

    Returns:
        ``text`` as given (not stripped) when it has content, else
        ``placeholder``.
    """
    if isinstance(text, str):
        return text if text.strip() else placeholder
    # NaN is the only float that compares unequal to itself.
    if isinstance(text, float) and text != text:
        return placeholder
    return text if text else placeholder

# Load the dataset
df = pd.read_csv('dummy.csv')

# Step 1: collapse stray whitespace in the "Deskripsi" column
df['Deskripsi'] = df['Deskripsi'].apply(remove_extra_spaces)

# Build the vocabulary: every distinct whitespace-separated word of the cleaned column
all_words = ' '.join(df['Deskripsi']).split()
dataset_texts = list(set(all_words))

# Steps 2-4: correct typos against the vocabulary, normalize the text, then
# substitute the placeholder for descriptions that ended up empty
df['Deskripsi'] = (
    df['Deskripsi']
    .apply(lambda x: correct_typos(x, dataset_texts))
    .apply(normalize_text)
    .apply(handle_missing_descriptions)
)

# Example usage: push one hand-written description through the same steps
description = "fi    du   sia"
cleaned_description = remove_extra_spaces(description)
corrected_description = correct_typos(cleaned_description, dataset_texts)
normalized_description = normalize_text(corrected_description)
handled_description = handle_missing_descriptions(normalized_description)

# Report every intermediate result, one "label value" line per step
print("\n".join(
    f"{label} {value}"
    for label, value in (
        ("Original Description:", description),
        ("Cleaned Description:", cleaned_description),
        ("Corrected Description:", corrected_description),
        ("Normalized Description:", normalized_description),
        ("Handled Description:", handled_description),
    )
))


Original Description: fi    du   sia
Cleaned Description: fi du sia
Corrected Description: F u fidusia
Normalized Description: f u fidusia
Handled Description: f u fidusia


In [7]:
# Inspect the vocabulary of unique words collected from the "Deskripsi" column.
# It was built with list(set(...)), so the order shown here is arbitrary.
dataset_texts

['u',
 'pinalty',
 'settlement',
 'Pembayaran',
 'm',
 'setelment',
 'SHF',
 'fidusia',
 'ty',
 'fiducia',
 'abcaba',
 '!@#',
 'Pembyaran',
 'jabotabek',
 'F',
 'k',
 'setlement',
 'abc',
 'umk',
 'S',
 'pinalti',
 'pi',
 'abcabafidusia',
 '3',
 'abca',
 'abcabcabc',
 'H',
 'nal',
 'cabang']