<a href="https://colab.research.google.com/github/LashawnFofung/Python-Document-Preparation-and-Extraction/blob/main/Perform_Text_Cleaning_and_Standardization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Perform Text Cleaning & Standardization

Data: *this.txt_file*

In [None]:
import re
import unicodedata
import string

# Load the messy text file
print("ðŸ“„ Loading messy text file...")
with open('text_sample.txt', 'r', encoding='utf-8') as file:
    original_text = file.read()

print("Original text:")
print(f"'{original_text}'")
print(f"Length: {len(original_text)} characters")


ðŸ“„ Loading messy text file...
Original text:
'ThÃ­s Ã­s Ã  prÃ³blemÃ¡tic tÃ©xt fÃ­le!! It contains    **extra spaces** ,,,,, special characters!!!ðŸ’¥ðŸ”¥ðŸš€
Some words are misspelleddd,   and encoding  issues   likÃ© thÃ­s cÃ¤usÃ© problÃ«ms.    
        
Prices are inconsistent:  $29.99, 29.99 USD, 29,99$.

Emails & phone numbers may be embedded: contact@domain.com, (123)-456-7890.

Repeated punctuations!!!!! should be removed, along with **random symbols** like @@,##.

stopwords like "the", "is", and "a" appear often.

HTML tags might be present: <div>This is inside a div</div>

And sometimes, contractions won't expand: "can't", "won't", "shouldn't".

Random numeric values: 123456, 98765, 2024.'
Length: 639 characters


In [None]:
# Initialize cleaned text
cleaned_text = original_text

print("\nðŸ§¹ CLEANING PROCESS")
print("=" * 40)



ðŸ§¹ CLEANING PROCESS


In [None]:
# 1. Fix encoding issues and remove accents
print("1. Fixing encoding issues...")
def remove_accents(text):
    # Normalize to decomposed form and remove combining characters
    nfd = unicodedata.normalize('NFD', text)
    without_accents = ''.join(char for char in nfd if unicodedata.category(char) != 'Mn')
    return unicodedata.normalize('NFC', without_accents)

cleaned_text = remove_accents(cleaned_text)


1. Fixing encoding issues...


In [None]:
# 2. Remove HTML tags
print("2. Removing HTML tags...")
cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)


2. Removing HTML tags...


In [None]:
# 3. Expand contractions
print("3. Expanding contractions...")
contractions = {
    "can't": "cannot", "won't": "will not", "shouldn't": "should not",
    "couldn't": "could not", "wouldn't": "would not", "don't": "do not",
    "doesn't": "does not", "didn't": "did not", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not"
}

for contraction, expansion in contractions.items():
    cleaned_text = re.sub(re.escape(contraction), expansion, cleaned_text, flags=re.IGNORECASE)



3. Expanding contractions...


In [None]:
# 4. Standardize prices
print("4. Standardizing prices...")
# $29.99 format is already good
cleaned_text = re.sub(r'(\d+\.?\d*)\s*USD', r'$\1', cleaned_text)  # 29.99 USD -> $29.99
cleaned_text = re.sub(r'(\d+),(\d{2})\$', r'$\1.\2', cleaned_text)  # 29,99$ -> $29.99


4. Standardizing prices...


In [None]:
# 5. Clean phone numbers
print("5. Cleaning phone numbers...")
cleaned_text = re.sub(r'\((\d{3})\)-(\d{3})-(\d{4})', r'\1-\2-\3', cleaned_text)



5. Cleaning phone numbers...


In [None]:
# 6. Remove emojis and excessive symbols
print("6. Removing emojis and symbols...")
# Remove emojis
emoji_pattern = re.compile("["
                         u"\U0001F600-\U0001F64F"  # emoticons
                         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                         u"\U0001F680-\U0001F6FF"  # transport & map symbols
                         u"\U0001F1E0-\U0001F1FF"  # flags
                         "]+", flags=re.UNICODE)
cleaned_text = emoji_pattern.sub('', cleaned_text)

# Remove excessive symbols
cleaned_text = re.sub(r'[*@#]{2,}', '', cleaned_text)
cleaned_text = re.sub(r'[&]', 'and', cleaned_text)



6. Removing emojis and symbols...


In [None]:
# 7. Fix repeated punctuation
print("7. Fixing repeated punctuation...")
cleaned_text = re.sub(r'[!]{2,}', '!', cleaned_text)
cleaned_text = re.sub(r'[?]{2,}', '?', cleaned_text)
cleaned_text = re.sub(r'[.]{3,}', '...', cleaned_text)
cleaned_text = re.sub(r'[,]{2,}', ',', cleaned_text)


7. Fixing repeated punctuation...


In [None]:
# 8. Fix basic spelling errors
print("8. Fixing spelling errors...")
spelling_fixes = {
    'misspelleddd': 'misspelled',
    'cause': 'cause',
    'problems': 'problems',
    'like': 'like'
}

for error, correct in spelling_fixes.items():
    cleaned_text = re.sub(rf'\b{re.escape(error)}\b', correct, cleaned_text, flags=re.IGNORECASE)


8. Fixing spelling errors...


In [None]:
# 9. Normalize whitespace
print("9. Normalizing whitespace...")
# Replace multiple spaces with single space
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
# Remove leading/trailing whitespace
cleaned_text = cleaned_text.strip()


9. Normalizing whitespace...


In [None]:
# 10. Capitalize sentences properly
print("10. Fixing capitalization...")
# Split by sentence-ending punctuation
sentences = re.split(r'([.!?]+)', cleaned_text)
result = []

for i in range(0, len(sentences), 2):
    if i < len(sentences):
        sentence = sentences[i].strip()
        if sentence:
            # Capitalize first letter
            sentence = sentence[0].upper() + sentence[1:].lower() if len(sentence) > 1 else sentence.upper()
            result.append(sentence)

        # Add punctuation back
        if i + 1 < len(sentences):
            result.append(sentences[i + 1])

cleaned_text = ''.join(result)


10. Fixing capitalization...


In [None]:
# Final cleanup
cleaned_text = cleaned_text.strip()

print("\nâœ¨ RESULTS")
print("=" * 40)

print("CLEANED TEXT:")
print(f"'{cleaned_text}'")

print(f"\nSTATISTICS:")
print(f"Original length: {len(original_text)} characters")
print(f"Cleaned length: {len(cleaned_text)} characters")
print(f"Reduction: {len(original_text) - len(cleaned_text)} characters")
print(f"Percentage reduction: {((len(original_text) - len(cleaned_text)) / len(original_text) * 100):.1f}%")




âœ¨ RESULTS
CLEANED TEXT:
'This is a problematic text file!It contains extra spaces , special characters!Some words are misspelled, and encoding issues like this cause problems.Prices are inconsistent: $29.99, $29.99, $29.99.Emails and phone numbers may be embedded: contact@domain.Com, 123-456-7890.Repeated punctuations!Should be removed, along with random symbols like ,.Stopwords like "the", "is", and "a" appear often.Html tags might be present: this is inside a div and sometimes, contractions will not expand: "cannot", "will not", "should not".Random numeric values: 123456, 98765, 2024.'

STATISTICS:
Original length: 639 characters
Cleaned length: 568 characters
Reduction: 71 characters
Percentage reduction: 11.1%


In [None]:
# Extract useful information
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', cleaned_text)
phones = re.findall(r'\b\d{3}-\d{3}-\d{4}\b', cleaned_text)
prices = re.findall(r'\$\d+\.?\d*', cleaned_text)

print(f"\nEXTRACTED INFORMATION:")
print(f"Emails found: {emails}")
print(f"Phone numbers found: {phones}")
print(f"Prices found: {prices}")



EXTRACTED INFORMATION:
Emails found: ['contact@domain.Com']
Phone numbers found: ['123-456-7890']
Prices found: ['$29.99', '$29.99', '$29.99']


In [None]:
# Save cleaned text
with open('text_sample_cleaned.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print(f"\nðŸ’¾ Cleaned text saved as 'text_sample_cleaned.txt'")

print(f"\nðŸŽ‰ Text cleaning completed successfully!")



ðŸ’¾ Cleaned text saved as 'text_sample_cleaned.txt'

ðŸŽ‰ Text cleaning completed successfully!


In [None]:
# Show before/after comparison
print(f"\nðŸ“Š BEFORE vs AFTER COMPARISON:")
print("=" * 50)
print("BEFORE:")
print(original_text[:200] + "..." if len(original_text) > 200 else original_text)
print("\nAFTER:")
print(cleaned_text[:200] + "..." if len(cleaned_text) > 200 else cleaned_text)



ðŸ“Š BEFORE vs AFTER COMPARISON:
BEFORE:
ThÃ­s Ã­s Ã  prÃ³blemÃ¡tic tÃ©xt fÃ­le!! It contains    **extra spaces** ,,,,, special characters!!!ðŸ’¥ðŸ”¥ðŸš€
Some words are misspelleddd,   and encoding  issues   likÃ© thÃ­s cÃ¤usÃ© problÃ«ms.    
        
Prices are...

AFTER:
This is a problematic text file!It contains extra spaces , special characters!Some words are misspelled, and encoding issues like this cause problems.Prices are inconsistent: $29.99, $29.99, $29.99.Em...
