In [1]:
pip install emoji

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install autocorrect

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [3]:
# =========================
# STEP 1: Read the source data
# =========================
import pandas as pd

path = "Review.csv"
df = pd.read_csv(path, encoding="latin1")   # same as ISO-8859-1
print(df.head())

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df)


# =========================
# STEP 2: Perform Text Pre-Processing
# =========================

# a) Convert text to lowercase
def convert_to_lowercase(text):
    return text.lower()

df["lowercased"] = df["Review"].apply(convert_to_lowercase)
pd.set_option('display.max_colwidth', None)
print(df["lowercased"])


# b) Remove URLs
import re

def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

df["urls_removed"] = df["lowercased"].apply(remove_urls)
pd.set_option('display.max_colwidth', None)
print(df["urls_removed"])


# c) Remove HTML tags
from bs4 import BeautifulSoup

def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["html_removed"] = df["urls_removed"].apply(remove_html_tags)
pd.set_option('display.max_colwidth', None)
print(df["html_removed"])


# d) Remove emojis
import emoji

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df["emojis_removed"] = df["html_removed"].apply(remove_emojis)
pd.set_option('display.max_colwidth', None)
print(df["emojis_removed"])


# e) Replace internet slang/chat words
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

def replace_slang(text):
    escaped_slang_words = []
    for word in slang_dict.keys():
        escaped_word = re.escape(word)
        escaped_slang_words.append(escaped_word)

    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'

    def replace_match(match):
        slang_word = match.group(0)
        return slang_dict[slang_word.lower()]

    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)
    return replaced_text

df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang)
pd.set_option('display.max_colwidth', None)
print(df["slangs_replaced"])


# f) Replace contractions
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

escaped_contractions = []
for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)
    escaped_contractions.append(escaped_contraction)

joined_contractions = "|".join(escaped_contractions)
contractions_pattern = r'\b(' + joined_contractions + r')\b'
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

def replace_contractions(text):
    def replace_match(match):
        matched_word = match.group(0)
        lower_matched_word = matched_word.lower()
        expanded_form = contractions_dict[lower_matched_word]
        return expanded_form

    expanded_text = compiled_pattern.sub(replace_match, text)
    return expanded_text

df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)
pd.set_option('display.max_colwidth', None)
print(df["contractions_replaced"])


# g) Remove punctuations and special characters
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)
pd.set_option('display.max_colwidth', None)
print(df["punctuations_removed"])


# h) Remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)
pd.set_option('display.max_colwidth', None)
print(df["numbers_removed"])


# i) Correct spelling mistakes
from autocorrect import Speller

spell = Speller(lang='en')

def correct_spelling(text):
    return spell(text)

df["spelling_corrected"] = df["numbers_removed"].apply(correct_spelling)
pd.set_option('display.max_colwidth', None)
print(df["spelling_corrected"])


# j) Remove stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = []
    for word in words:
        lower_word = word.lower()
        if lower_word not in stop_words:
            filtered_words.append(word)
    return " ".join(filtered_words)

df["stopwords_removed"] = df["spelling_corrected"].apply(remove_stopwords)
pd.set_option('display.max_colwidth', None)
print(df["stopwords_removed"])


# k) Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    if not isinstance(text, str):
        return ""
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)
pd.set_option('display.max_colwidth', None)
print(df["stemmed_words"])


# l) Lemmatization (with POS tagging)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words)

df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)
pd.set_option('display.max_colwidth', None)
print(df["lemmatized"])


# =========================
# STEP 3: Save the result to a file
# =========================
df.to_csv("Processed_Reviews.csv", index=False)

                                              Review
0  The product arrived on time. Packaging was gre...
1           THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2  I bought this phone for $799, and it has a 120...
3  Wow!!! This product is awesome... but a bit ex...
4                The laptop works perfectly fine.   
                                                                           Review
0   The product arrived on time. Packaging was great, and the quality is amazing!
1                                        THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2     I bought this phone for $799, and it has a 120Hz display. Totally worth it!
3                         Wow!!! This product is awesome... but a bit expensive??
4                                             The laptop works perfectly fine.   
5    Check out the full product details here: https://example.com/product-details
6         <div><h2>Great Purchase!</h2><p>I am happy with this product.</p></div>
7                The battr

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0          product arrived time packaging great quality amazing
1                                          product amazing love
2                         bought phone hz display totally worth
3                             wow product awesome bit expensive
4                                   laptop works perfectly fine
5                                    check full product details
6                                 great purchased happy product
7                   battery life excellent charging cable short
8                            cannot believe good expect quality
9                    love product fast delivery amazing quality
10                  honest wasnt expecting much oh god awesome
11                                  best product ever used life
12    shoes comfortable fitting nicely worked perfectly jogging
Name: stopwords_removed, dtype: object
0     product arriv time packag great qualiti amaz
1                                product amaz love
2              bought phone