In [9]:
# Handling Noisy Text Data

# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 2: Sample Noisy Text Data
data = {
    'text': [
        "I luvvv this product!!! It's soooo good 😊😊",
        "Totally w0rth the price... would buy again!!!",
        "Terrible service!! Waited 4ever 😡",
        "Best. Purchase. Ever. Highly recommend!!",
        "meh... it's ok, not gr8 but not bad either.",
        "I h8 when this happens!!!",
        "OMG!!! This is the best thing I've ever bought!!!",
        "Worst experience ever!!! Do not buy!!!",
        "Soooooo happy with this!!!",
        "Not what I expected... kinda disappointed."
    ]
}
df = pd.DataFrame(data)

# Step 3: Define Text Cleaning Function
def clean_text(text, correct_spelling=False):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)

    # Remove emojis and non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Optional: Correct spelling
    if correct_spelling:
        tokens = [str(TextBlob(word).correct()) for word in tokens]

    # Join tokens back to string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

# Step 4: Apply Cleaning Function to Data
df['cleaned_text'] = df['text'].apply(lambda x: clean_text(x, correct_spelling=False))

# Step 5: Display Original and Cleaned Text
print("Original vs. Cleaned Text:")
for original, cleaned in zip(df['text'], df['cleaned_text']):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}\n")

# Step 6: Visualize Word Frequency Before and After Cleaning
from collections import Counter

# Tokenize original and cleaned text
original_tokens = [word for text in df['text'] for word in nltk.word_tokenize(text.lower())]
cleaned_tokens = [word for text in df['cleaned_text'] for word in nltk.word_tokenize(text.lower())]

# Get word frequencies
original_freq = Counter(original_tokens)
cleaned_freq = Counter(cleaned_tokens)

# Plot top 10 words before cleaning
original_common = original_freq.most_common(10)
words_orig, counts_orig = zip(*original_common)
plt.figure(figsize=(10,5))
plt.bar(words_orig, counts_orig)
plt.title('Top 10 Words Before Cleaning')
plt.show()

# Plot top 10 words after cleaning
cleaned_common = cleaned_freq.most_common(10)
words_clean, counts_clean = zip(*cleaned_common)
plt.figure(figsize=(10,5))
plt.bar(words_clean, counts_clean)
plt.title('Top 10 Words After Cleaning')
plt.show()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
