In [1]:
# Install TextBlob into the kernel's own environment. The explicit %pip magic
# does not depend on IPython automagic being enabled, and the version is pinned
# to the release this notebook was last run with so a re-run is reproducible.
%pip install textblob==0.19.0

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 2.3 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import re
import emoji
import string
import nltk

from bs4 import BeautifulSoup
from textblob import TextBlob  # Alternative for spelling correction
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK resources
# NLTK 3.9+ looks up 'punkt_tab' and 'averaged_perceptron_tagger_eng' where
# older releases used 'punkt' and 'averaged_perceptron_tagger'. Both generations
# are requested so word_tokenize / pos_tag find their data on either version.
for _nltk_resource in (
    'stopwords',                       # Stop-word lists
    'wordnet',                         # For lemmatization
    'omw-1.4',                         # Open Multilingual Wordnet data
    'averaged_perceptron_tagger',      # For POS tagging (older NLTK)
    'averaged_perceptron_tagger_eng',  # For POS tagging (NLTK 3.9+)
    'punkt',                           # For tokenization (older NLTK)
    'punkt_tab',                       # For tokenization (NLTK 3.9+)
):
    nltk.download(_nltk_resource)

# Initialize tools
# A set gives constant-time membership checks in remove_stopwords.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Dictionary of slang words and their replacements
# Lookup table used by replace_slang: each key is a slang abbreviation and each
# value is the phrase substituted for it.
# Keys are written in lowercase because replace_slang lowercases the matched
# text before looking it up here.
# Some values contain a capital "I" or a contraction ("I don't know"); inside
# preprocess_text the contraction is expanded by the replace_contractions step
# that runs right after slang replacement.
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

# Contractions dictionary
# Lookup table used by replace_contractions: lowercase contraction -> expansion.
# Keys must stay lowercase because the matched text is lowercased before lookup.
# "'s" forms are always expanded as "... is" and "'d" forms as "... would";
# the alternative readings ("has", "had") are not distinguished.
contractions_dict = {
    # Negations
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "mustn't": "must not",
    # Pronoun + be
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    # Pronoun / modal + have
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "could've": "could have",
    "would've": "would have",
    "should've": "should have",
    # Pronoun + would
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "it'd": "it would",
    "we'd": "we would",
    "they'd": "they would",
    # Pronoun + will
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "we'll": "we will",
    "they'll": "they will",
    # Miscellaneous
    "let's": "let us",
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is"
}

# Function to remove URLs
def remove_urls(text):
    """Delete web addresses from ``text``.

    A URL is recognised as ``http://`` or ``https://`` followed by
    non-whitespace characters, or as a word starting with ``www.`` followed by
    non-whitespace characters. Requiring the ``://`` / ``.`` keeps ordinary
    words that merely begin with "http" or "www" (e.g. "httpd") intact.
    Matching is case-insensitive.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every recognised URL replaced by the empty string.
        Surrounding whitespace is left as it was.
    """
    return re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)
    
# Function to remove HTML tags
def remove_html(text):
    """Return the text content of ``text`` with HTML markup parsed away.

    The input is parsed by BeautifulSoup using the "html.parser" backend and
    the document's text is extracted with ``get_text()``.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Function to remove emojis
def remove_emojis(text):
    """Return ``text`` with each emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

# Function to replace slang words
def replace_slang(text):
    """Replace slang abbreviations in ``text`` with their full phrases.

    Every key of ``slang_dict`` is matched as a whole word, ignoring case, and
    substituted with the corresponding dictionary value.
    """
    alternatives = '|'.join(re.escape(abbreviation) for abbreviation in slang_dict)
    matcher = re.compile(r'\b(' + alternatives + r')\b', flags=re.IGNORECASE)

    def expand(found):
        # Dictionary keys are lowercase, so normalise the matched text first.
        return slang_dict[found.group(0).lower()]

    return matcher.sub(expand, text)

# Function to replace contractions
def replace_contractions(text):
    """Expand English contractions in ``text`` using ``contractions_dict``.

    Every key of ``contractions_dict`` is matched as a whole word, ignoring
    case, and replaced by its expansion.

    Args:
        text: The string to process.

    Returns:
        ``text`` with known contractions expanded. Any typographic apostrophe
        (U+2019) in the input is returned as an ASCII apostrophe.
    """
    # The dictionary keys use the ASCII apostrophe. The typographic apostrophe
    # (U+2019) is a different code point, so without this normalisation a
    # contraction written as "don’t" would pass through unexpanded.
    text = text.replace('\u2019', "'")
    contractions_pattern = r'\b(' + '|'.join(map(re.escape, contractions_dict.keys())) + r')\b'
    return re.sub(
        contractions_pattern,
        # Keys are lowercase, so lowercase the matched text before the lookup.
        lambda match: contractions_dict[match.group(0).lower()],
        text,
        flags=re.IGNORECASE,
    )

# Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Characters are deleted, not replaced by a space, so words separated only
    by punctuation are joined together. Non-ASCII punctuation is not affected.
    """
    return "".join(char for char in text if char not in string.punctuation)

# Function to remove numbers
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Function to correct spelling using TextBlob
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``.

    NOTE(review): the output recorded in this notebook shows the review
    "The laptop works perfectly fine." ending up with the token "lawton",
    presumably introduced by this step. Corrections can therefore replace
    valid words; confirm this is acceptable for the dataset.
    """
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# Function to remove stopwords
def remove_stopwords(text):
    """Drop stop words from ``text`` and return the rest joined by single spaces.

    The text is split on whitespace. A word is dropped when its lowercase form
    is in the module-level ``stop_words`` set; kept words retain their
    original casing.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Function to get WordNet POS tags
def get_wordnet_pos(nltk_tag):
    """Translate a POS tag from ``pos_tag`` into a WordNet POS constant.

    The first letter of the tag decides the result: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (not indexing) keeps an empty tag on the default path.
    return prefix_to_pos.get(nltk_tag[:1], wordnet.NOUN)  # Default: noun

# Function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and return them joined by spaces.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Each tag is mapped through ``get_wordnet_pos`` so the module-level
    ``lemmatizer`` receives a part of speech for each word.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

# Function to tokenize text
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Function to preprocess text
def preprocess_text(text):
    """Run the full cleaning pipeline on one review and return its tokens.

    Steps, in order: lowercase, strip HTML, remove URLs, remove emojis, expand
    slang, expand contractions, remove punctuation, remove numbers, correct
    spelling, remove stop words, lemmatize, tokenize.

    Args:
        text: The raw review. Non-string values are treated as having no
            content.

    Returns:
        A list of token strings; an empty list when ``text`` is not a string.
    """
    # A missing review is not a str (e.g. the NaN pandas uses for an empty
    # cell, or None) and has no .lower(); return "no tokens" instead of raising.
    if not isinstance(text, str):
        return []

    text = text.lower()                # Step 1: Convert to lowercase
    # HTML is stripped before URLs: remove_urls deletes a whole run of
    # non-whitespace characters, so on markup like <a href="http://x">word</a>
    # it would also swallow the rest of the tag and the text glued to it.
    text = remove_html(text)           # Step 2: Remove HTML
    text = remove_urls(text)           # Step 3: Remove URLs
    text = remove_emojis(text)         # Step 4: Remove Emojis
    # Slang goes before contractions because some slang expansions contain a
    # contraction themselves (e.g. "idk" -> "I don't know").
    text = replace_slang(text)         # Step 5: Replace Slang
    text = replace_contractions(text)  # Step 6: Expand Contractions
    text = remove_punctuation(text)    # Step 7: Remove Punctuation
    text = remove_numbers(text)        # Step 8: Remove Numbers
    text = correct_spelling(text)      # Step 9: Correct Spelling
    text = remove_stopwords(text)      # Step 10: Remove Stopwords
    text = lemmatize_text(text)        # Step 11: Lemmatization
    return tokenize_text(text)         # Step 12: Tokenization

# Load dataset
df = pd.read_csv("Review.csv")  # Replace with your file

# Apply preprocessing: each review becomes a list of cleaned tokens
processed_tokens = df["Review"].apply(preprocess_text)
df["processed"] = processed_tokens

# Save the processed dataset (row index is not written)
df.to_csv("Processed_Reviews.csv", index=False)

# Display first few rows: raw review next to its processed tokens
preview = df[["Review", "processed"]].head()
print(preview)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                              Review  \
0  The product arrived on time. Packaging was gre...   
1           THIS PRODUCT IS JUST AMAZING! I LOVE IT.   
2  I bought this phone for $799, and it has a 120...   
3  Wow!!! This product is awesome... but a bit ex...   
4                The laptop works perfectly fine.      

                                           processed  
0  [product, arrive, time, pack, great, quality, ...  
1                             [product, amaze, love]  
2              [buy, phone, display, totally, worth]  
3                 [product, awesome, bit, expensive]  
4                    [lawton, work, perfectly, fine]  
