In [1]:
# Load dataset
import pandas as pd

In [2]:
import pandas as pd

file_path = "Review.csv"

try:
    df = pd.read_csv(file_path, encoding='utf-8')
    print(df.head()) # Shows the first few rows
except UnicodeDecodeError:
    print(f"Error: Could not read {file_path}. Ensure it's UTF-8 encoded.")

                                              Review
0  The product arrived on time. Packaging was gre...
1           THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2  I bought this phone for $799, and it has a 120...
3  Wow!!! This product is awesome... but a bit ex...
4                The laptop works perfectly fine.   


In [3]:
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df)

                                                                           Review
0   The product arrived on time. Packaging was great, and the quality is amazing!
1                                        THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2     I bought this phone for $799, and it has a 120Hz display. Totally worth it!
3                         Wow!!! This product is awesome... but a bit expensive??
4                                             The laptop works perfectly fine.   
5    Check out the full product details here: https://example.com/product-details
6         <div><h2>Great Purchase!</h2><p>I am happy with this product.</p></div>
7                The battry life is excelent, but the chargin cable is too short.
8                       I can't believe it's so good! Didn't expect such quality.
9                   Love this product! ???? Fast delivery ??, amazing quality! ??
10                       TBH, I wasn't expecting much, but OMG, this is awesome!!
11              

In [5]:
# Lowercase conversion
def convert_to_lowercase(text):
 return text.lower()
df["lowercased"] = df["Review"].apply(convert_to_lowercase)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["lowercased"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5      check out the full product details here: https://example.com/product-details
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [6]:
# Removal of URLs
import re

In [7]:
# remove any URLs that start with "http" or "www" from the text
def remove_urls(text):
 return re.sub(r'http\S+|www\S+', '', text)
df["urls_removed"] = df["lowercased"].apply(remove_urls)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["urls_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [8]:
# Removal of HTML tags
from bs4 import BeautifulSoup
# extracts only the text, removing all HTML tags
def remove_html_tags(text):
 return BeautifulSoup(text, "html.parser").get_text()
df["html_removed"] = df["urls_removed"].apply(remove_html_tags)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["html_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [9]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [10]:
import emoji

def remove_emojis(text):
 return emoji.replace_emoji(text, replace='')
df["emojis_removed"] = df["html_removed"].apply(remove_emojis)

pd.set_option('display.max_colwidth', None)
print(df["emojis_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasn't expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [11]:
slang_dict = {
"tbh": "to be honest",
"omg": "oh my god",
"lol": "laugh out loud",
"idk": "I don't know",
"brb": "be right back",
"btw": "by the way",
"imo": "in my opinion",
"smh": "shaking my head",
"fyi": "for your information",
"np": "no problem",
"ikr": "I know right",
"asap": "as soon as possible",
"bff": "best friend forever",
"gg": "good game",
"hmu": "hit me up",
"rofl": "rolling on the floor laughing"
}

In [12]:
def replace_slang(text):
 escaped_slang_words = [] 

 for word in slang_dict.keys():
  escaped_word = re.escape(word) 
  escaped_slang_words.append(escaped_word) 

  slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'

 def replace_match(match):
  slang_word = match.group(0) 
  return slang_dict[slang_word.lower()] 

 replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)

 return replaced_text

df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang)

pd.set_option('display.max_colwidth', None) 
print(df["slangs_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10          to be honest, i wasn't expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [13]:
contractions_dict = {
"wasn't": "was not",
"isn't": "is not",
"aren't": "are not",
"weren't": "were not",
"doesn't": "does not",
"don't": "do not",
"didn't": "did not",
"can't": "cannot",
"couldn't": "could not",
"shouldn't": "should not",
"wouldn't": "would not",
"won't": "will not",
"haven't": "have not",
"hasn't": "has not",
"hadn't": "had not",
"i'm": "i am",
"you're": "you are",
"he's": "he is",
"she's": "she is",
"it's": "it is",
"we're": "we are",
"they're": "they are",
"i've": "i have",
"you've": "you have",
"we've": "we have",
"they've": "they have",
"i'd": "i would",
"you'd": "you would",
"he'd": "he would",
"she'd": "she would",
"we'd": "we would",
"they'd": "they would",
"i'll": "i will",
"you'll": "you will",
"he'll": "he will",
"she'll": "she will",
"we'll": "we will",
"they'll": "they will",
"let's": "let us",
"that's": "that is",
"who's": "who is",
"what's": "what is",
"where's": "where is",
"when's": "when is",
"why's": "why is"
}

In [14]:
escaped_contractions = []
for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)
    escaped_contractions.append(escaped_contraction)

joined_contractions = "|".join(escaped_contractions)

contractions_pattern = r'\b(' + joined_contractions + r')\b'

compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

def replace_contractions(text):
    def replace_match(match):
        matched_word = match.group(0)
        lower_matched_word = matched_word.lower()
        expanded_form = contractions_dict.get(lower_matched_word) #Use .get() to avoid key errors.
        if expanded_form:
            return expanded_form
        else:
            return matched_word #Return the original word if not found in the dictionary.

    expanded_text = compiled_pattern.sub(replace_match, text)
    return expanded_text

df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)

pd.set_option('display.max_colwidth', None)
print(df["contractions_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                      i cannot believe it is so good! did not expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10         to be honest, i was not expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [15]:
# Remove punctuations and special characters
import string
# Function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
# Apply the function to the column
df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["punctuations_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2        i bought this phone for 799 and it has a 120hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [16]:
# Remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text) # Removes all numeric characters
# Apply the function to the column
df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["numbers_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2              i bought this phone for  and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [17]:
pip install pyspellchecker




In [18]:
from spellchecker import SpellChecker
# Initialize spell checker
spell = SpellChecker()

def correct_spelling(text):
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]
    return " ".join(corrected_words)  # Apply correction

df["spelling_corrected"] = df["numbers_removed"].apply(correct_spelling)
pd.set_option('display.max_colwidth', None)
print(df["spelling_corrected"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2               i bought this phone for and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                                the laptop works perfectly fine
5                                        check out the full product details here
6                                      great purchase am happy with this product
7              the battery life is excellent but the charging cable is too short
8                     i cannot believe it is so good did not expect such quality
9                                love this product fast delivery amazing quality
10           to be honest i was not expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [19]:
import nltk
from nltk.corpus import stopwords
import pandas as pd  # Import pandas if it's not already imported

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define stopwords list
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    if isinstance(text, str):  # Check if the input is a string
        words = text.split()  # Split text into words
        filtered_words = []  # Create an empty list to store words after stopword removal
        for word in words:  # Loop through each word in the list of words
            lower_word = word.lower()  # Convert the word to lowercase for uniform comparison
            if lower_word not in stop_words:  # Check if the lowercase word is NOT in the stopwords list
                filtered_words.append(word)  # If it's not a stopword, add it to the filtered list
        return " ".join(filtered_words)  # Join words back into a sentence
    else:
        return "" #Return empty string if not a string.

# Assuming you have a DataFrame 'df' with a column 'spelling_corrected'
# If you don't have it, create a dummy DataFrame for testing:
data = {'spelling_corrected': ["This is a test sentence with some stopwords.", None, "Another example sentence."]}
df = pd.DataFrame(data)

# Apply the function to the column
df["stopwords_removed"] = df["spelling_corrected"].apply(remove_stopwords)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["stopwords_removed"])

0     test sentence stopwords.
1                             
2    Another example sentence.
Name: stopwords_removed, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
import nltk
from nltk.stem import PorterStemmer
import pandas as pd  # Ensure pandas is imported

# Initialize the stemmer
stemmer = PorterStemmer()

# Function to apply stemming
def stem_text(text):
    if not isinstance(text, str):
        return ""  # Return empty string for non-string inputs
    
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming
    return " ".join(stemmed_words)

# Assuming you have a DataFrame 'df' with a column 'stopwords_removed'
# If you don't have it, create a dummy DataFrame for testing:
data = {'stopwords_removed': ["running quickly", None, "The cats are playing."]}
df = pd.DataFrame(data)

# Apply the function
df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["stemmed_words"])

0             run quickli
1                        
2    the cat are playing.
Name: stemmed_words, dtype: object


In [12]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import pandas as pd  # Ensure pandas is imported

# Download the required resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

# Function to lemmatize text with POS tagging
def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    words = word_tokenize(text)
    try:
        pos_tags = pos_tag(words)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
        pos_tags = pos_tag(words)

    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words)

# Assuming you have a DataFrame 'df' with a column 'stopwords_removed'
# If you don't have it, create a dummy DataFrame for testing:
data = {'stopwords_removed': ["The running cats are playing.", None, "He is a good runner"]}
df = pd.DataFrame(data)

# Apply the function to the column
df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)
print(df["lemmatized"])

0    The running cat be play .
1                             
2          He be a good runner
Name: lemmatized, dtype: object


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
df.to_csv("Processed_Reviews.csv", index=False)