In [4]:
import re

# Sample input: a subject section and a body section, each closed by <END>
formatted_text = "<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,\n1st one is pairoll, 2nd is healtcare!<END>"

# Locate each tagged section; DOTALL lets the body capture run across newlines
subject_match = re.search(r'<SUBJECT LINE>(.*?)<END>', formatted_text)
body_match = re.search(r'<BODY TEXT>(.*?)<END>', formatted_text, re.DOTALL)

# Subject: trimmed capture, or an empty string when the section is absent
if subject_match:
    subject_content = subject_match[1].strip()
else:
    subject_content = ''

# Body: trimmed capture with embedded newlines turned into spaces,
# or an empty string when the section is absent
if body_match:
    body_content = body_match[1].strip().replace('\n', ' ')
else:
    body_content = ''

# Join the two parts as "<subject>. <body>"
decoded_text = subject_content + ". " + body_content

# Display the result
print(decoded_text)

Employees details. Attached are 2 files, 1st one is pairoll, 2nd is healtcare!


In [19]:
import re

def decode_tagged_text(tagged_text):
    """Strip ``<...>`` tags from a tagged message and return it as a single line.

    A closing ``<END>`` that is directly followed by another opening tag marks
    the boundary between two sections. That boundary is turned into a sentence
    break (". ") so the sections do not run together once the tags are removed.
    If the section already ends in ``.``, ``!`` or ``?`` no extra period is
    added, and an empty section produces no break at all.

    Parameters
    ----------
    tagged_text : str
        Text containing ``<TAG>`` ... ``<END>`` sections.

    Returns
    -------
    str
        The text with tags removed, all whitespace runs (including newlines)
        collapsed to single spaces, and leading/trailing whitespace stripped.
    """
    def _sentence_break(match):
        last_char = match.group(1)
        # Keep existing terminal punctuation instead of producing "!." etc.
        terminator = "" if last_char in ".!?" else "."
        return last_char + terminator + " "

    # group(1) is the last real character of the section: it must be neither
    # whitespace nor ">" (the end of a tag), so empty sections are skipped.
    # The lookahead requires another tag to follow, so the final <END> is ignored.
    with_breaks = re.sub(r"([^\s>])\s*<END>\s*(?=<)", _sentence_break, tagged_text)

    # Remove every remaining tag
    without_tags = re.sub(r"<.*?>", "", with_breaks)

    # \s+ already covers newlines, so one pass normalises all whitespace
    return re.sub(r"\s+", " ", without_tags).strip()


input_text = "<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,\n1st one is pairoll, 2nd is healtcare!<END>"
decoded_text = decode_tagged_text(input_text)

print("Step 1 Output:\n", decoded_text)

Step 1 Output:
 Employees details. Attached are 2 files, 1st one is pairoll, 2nd is healtcare!


In [6]:
# Step 2: lower-case the decoded text produced by the previous cell
lowercased_text = str.lower(decoded_text)

print("Step 2 Output:\n", lowercased_text)

Step 2 Output:
 employees details. attached are 2 files, 1st one is pairoll, 2nd is healtcare!


In [27]:
# Single digits and the word each one is spelled out as
digit_to_word = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three',
    '4': 'four', '5': 'five', '6': 'six', '7': 'seven',
    '8': 'eight', '9': 'nine'
}

# Single-digit ordinals and their word forms
ordinal_to_word = {
    '1st': 'first', '2nd': 'second', '3rd': 'third',
    '4th': 'fourth', '5th': 'fifth', '6th': 'sixth',
    '7th': 'seventh', '8th': 'eighth', '9th': 'ninth'
}

# Replace ordinals first: the per-digit pass below would otherwise turn
# "1st" into "onest" and "2nd" into "twond". The result goes into a new
# variable so re-running this cell never changes lowercased_text.
# Only standalone single-digit ordinals are handled; the \b anchors mean
# e.g. "21st" is left for the digit pass.
ordinal_converted_text = re.sub(
    r'\b(?:1st|2nd|3rd|[4-9]th)\b',
    lambda match: ordinal_to_word[match.group()],
    lowercased_text,
)

# Replace each remaining digit with its word (digit by digit, so "10" -> "onezero")
digit_converted_text = re.sub(r'\d', lambda x: digit_to_word[x.group()], ordinal_converted_text)

print("Step 3 Output:\n", digit_converted_text)

Step 3 Output:
 employees details. attached are two files, first one is pairoll, second is healtcare!


In [8]:
# Step 4: delete every character that is neither a word character nor whitespace.
# \w includes the underscore, so underscores are kept.
punctuation_pattern = re.compile(r'[^\w\s]')
no_punctuation_text = punctuation_pattern.sub('', digit_converted_text)

print("Step 4 Output:\n", no_punctuation_text)

Step 4 Output:
 employees details attached are two files first one is pairoll second is healtcare


In [28]:
from textblob import TextBlob

# Step 5: run TextBlob's spelling correction over the punctuation-free text
# and convert the corrected blob back to a plain string
text_blob = TextBlob(no_punctuation_text)
corrected_blob = text_blob.correct()
spelling_corrected_text = str(corrected_blob)

print("Step 5 Output:\n", spelling_corrected_text)

Step 5 Output:
 employees details attached are two files first one is patrol second is healtcare


In [20]:
from nltk.corpus import stopwords

# Step 6: drop English stop words.
# NOTE(review): the saved output of this cell shows "payroll"/"healthcare",
# but the saved Step 5 output shows "patrol"/"healtcare" -- the stored outputs
# look like they come from different runs; re-run from a fresh kernel to confirm.

# A set gives fast membership tests for the filter below
stop_words = {*stopwords.words('english')}

# Keep only the whitespace-separated tokens that are not stop words
filtered_words = list(
    filter(lambda token: token not in stop_words, spelling_corrected_text.split())
)
no_stopwords_text = ' '.join(filtered_words)

print("Step 6 Output:\n", no_stopwords_text)

Step 6 Output:
 employees details attached two files first one payroll second healthcare


In [21]:
from nltk.stem import PorterStemmer

# Step 7: reduce each remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, no_stopwords_text.split()))
stemmed_text = ' '.join(stemmed_words)

print("Step 7 Output:\n", stemmed_text)

Step 7 Output:
 employe detail attach two file first one payrol second healthcar


In [22]:
from nltk.stem import WordNetLemmatizer

# Step 8: lemmatize each token of the stemmed text with WordNet.
# NOTE(review): in the saved run this output is identical to the Step 7 output,
# i.e. lemmatizing the already-stemmed tokens changed nothing. If that is not
# intended, consider lemmatizing no_stopwords_text instead -- confirm the
# intended pipeline order.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, stemmed_text.split()))
lemmatized_text = ' '.join(lemmatized_words)

print("Step 8 Output:\n", lemmatized_text)

Step 8 Output:
 employe detail attach two file first one payrol second healthcar
