# Text Cleaning and Pre-processing

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read data
df = pd.read_csv("/content/amazon.txt", delimiter='\t', header=None)
df.columns = ["Review_text", "Review_class"]

In [3]:
# Print a head of data
df.head(10)

Unnamed: 0,Review_text,Review_class
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [4]:
# Download data from NLTK Downloader
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Data preprocessing on 'Review_text' feature
def clean_text(text):
    # convert text into lower case
    text = text.lower()

    # Remove URLs
    pattern_url = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern_url.sub('', text)

    # Remove emojis
    pattern_emoji = re.compile("["
                               u"\U0001F600-\U0001FFFF"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = pattern_emoji.sub(r'', text)

    # Replace contractions
    contractions = {"i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
                    "what's": "what is", "where's": "where is", "'ll": " will", "'ve": " have",
                    "'re": " are", "'d": " would", "won't": "will not", "don't": "do not",
                    "did't": "did not", "can't": "can not", "it's": "it is", "couldn't": "could not",
                    "have't": "have not"}
    text = ' '.join(contractions.get(word, word) for word in text.split())

    # Remove special characters and punctuations
    text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove non-alphabetic words
    words = [word for word in tokens if word.isalpha()]

    # Join words back into a sentence
    cleaned_text = ' '.join(words)
    # Return clean data
    return cleaned_text


In [6]:
# Call function
# Apply the clean_text function to each review in the DataFrame
df['Cleaned_Review'] = df['Review_text'].apply(clean_text)
# Display the cleaned reviews for the first 20 rows
df['Cleaned_Review'].head(20).tolist()

['so there is no way for me to plug it in here in the us unless i go by a converter',
 'good case excellent value',
 'great for the jawbone',
 'tied to charger for conversations lasting more than minutesmajor problems',
 'the mic is great',
 'i have to jiggle the plug to get it to line up right to get decent volume',
 'if you have several dozen or several hundred contacts then imagine the fun of sending each of them one by one',
 'if you are razr owneryou must have this',
 'needless to say i wasted my money',
 'what a waste of money and time',
 'and the sound quality is great',
 'he was very impressed when going from the original battery to the extended battery',
 'if the two were seperated by a mere ft i started to notice excessive static and garbled sound from the headset',
 'very good quality though',
 'the design is very odd as the ear clip is not very comfortable at all',
 'highly recommend for any one who has a blue tooth phone',
 'i advise everyone do not be fooled',
 'so far so