In [2]:
import pandas as pd

In [3]:
data = pd.read_excel(r"data/sentiment_preprocessing.xlsx")
data.head(5)

Unnamed: 0,Review,Sentiment
0,Wow!!! This product is absolutely amazing! I l...,Positive
1,"Ugh, such a horrible experience. Never buying ...",Negative
2,"Not bad, but not great either. It’s okay for t...",Neutral
3,Can't believe this is happening!!! 😱 Ordered a...,Negative
4,Thank you for your fast delivery! 🚚 I'm super ...,Positive


In [5]:
data.duplicated().sum()

0

In [6]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     57 non-null     object
 1   Sentiment  57 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


In [9]:
data['Sentiment'].value_counts()

Sentiment
Positive    29
Negative    25
Neutral      3
Name: count, dtype: int64

## Text Preprocessing

### 1. Lower casing

In [12]:
data['Review'] = data['Review'].str.lower()
data.Review.head()

0    wow!!! this product is absolutely amazing! i l...
1    ugh, such a horrible experience. never buying ...
2    not bad, but not great either. it’s okay for t...
3    can't believe this is happening!!! 😱 ordered a...
4    thank you for your fast delivery! 🚚 i'm super ...
Name: Review, dtype: object

### 2. Removing HTML tags

In [15]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r"",text)

In [16]:
data['Review'] = data['Review'].apply(remove_html_tags)
data.Review.head()

0    wow!!! this product is absolutely amazing! i l...
1    ugh, such a horrible experience. never buying ...
2    not bad, but not great either. it’s okay for t...
3    can't believe this is happening!!! 😱 ordered a...
4    thank you for your fast delivery! 🚚 i'm super ...
Name: Review, dtype: object

### 3. Removing URL's

In [17]:
import re
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r"", text)

In [18]:
data['Review'] = data['Review'].apply(remove_urls)

### 4. Removing Punctuation

In [19]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
exclude = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('','',exclude))

In [21]:
data['Review'] =  data['Review'].apply(remove_punctuation)
data['Review'].head()

0    wow this product is absolutely amazing i love ...
1    ugh such a horrible experience never buying fr...
2    not bad but not great either it’s okay for the...
3    cant believe this is happening 😱 ordered a pac...
4    thank you for your fast delivery 🚚 im super im...
Name: Review, dtype: object

1. **`str.maketrans('', '', exclude)`**:
   - `str.maketrans()` is a method that creates a translation table for character substitution or removal.
   - The three arguments are:
     - **First argument (`''`)**: Specifies the characters to replace. Since this is empty, no replacement is done.
     - **Second argument (`''`)**: Specifies the characters to replace them with. Since this is empty, no replacement happens.
     - **Third argument (`exclude`)**: Contains the characters to be removed.
   - This means that the `exclude` characters will be mapped for removal.

2. **`text.translate(...)`**:
   - The `translate()` method applies the translation table generated by `str.maketrans()` to the string `text`.
   - Any character found in `exclude` is removed from the `text`.


### 5. Spelling Correction

In [23]:
from textblob import TextBlob

def correct_spell(text):
    textblob = TextBlob(text)
    return str(TextBlob(text).correct())

In [25]:
data['Review'] = data['Review'].apply(correct_spell)
data.Review.head(8)

0    now this product is absolutely amazing i love ...
1    ugh such a horrible experience never buying fr...
2    not bad but not great either it’s okay for the...
3    can believe this is happening 😱 ordered a pack...
4    thank you for your fast delivery 🚚 in super im...
5    org this pp keeps crashing 😠 what a piece of j...
6    i love programming she said and honestly i agr...
7    avoid this place at all costs 🤮 the food was a...
Name: Review, dtype: object

### 6. Removing stop words

In [26]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append("")
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [29]:
data['Review'] = data['Review'].apply(remove_stopwords)
data['Review'].head(10)

0      product  absolutely amazing  love  😍 highly ...
1    ugh   horrible experience never buying   store...
2               bad   great either it’s okay   price 🤷
3     believe   happening 😱 ordered  package  month...
4    thank    fast delivery 🚚  super impressed   se...
5    org  pp keeps crashing 😠   piece  june don’t w...
6     love programming  said  honestly  agree patro...
7    avoid  place   costs 🤮  food  awful   service ...
8    fantastic book  put   📚 highly recommend   mys...
9                it’s  another cadet nothing special 🤔
Name: Review, dtype: object

### 7. Handling Emojis

In [31]:
# This code will convert the emojis into their respective text representation

# import emoji
# print(emoji.demojize('product  absolutely amazing  love  😍 highly'))

## Here we remove emojis since they are not necessary
def remove_emoji(text):
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F700-\U0001F77F|\U0001F780-\U0001F7FF|\U0001F800-\U0001F8FF|\U0001F900-\U0001F9FF|\U0001FA00-\U0001FA6F|\U0001FA70-\U0001FAFF|\U00002702-\U000027B0]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [32]:
data['Review'] = data['Review'].apply(remove_emoji)
data['Review'].head()

0      product  absolutely amazing  love   highly r...
1    ugh   horrible experience never buying   store...
2                bad   great either it’s okay   price 
3     believe   happening  ordered  package  month ...
4    thank    fast delivery   super impressed   ser...
Name: Review, dtype: object