In [None]:
#Import Library
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from collections import Counter
import string
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras import layers
from keras.optimizers import Adam
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from keras.utils import pad_sequences
from keras import mixed_precision
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from nltk import FreqDist
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Embedding, Bidirectional
stop = stopwords.words('english')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#from google.colab import files
#files= files.upload()

In [None]:
df=pd.read_excel('rain_coat.xlsx')
df.head()

Unnamed: 0,reviews
0,\nThis rain parka is a nice weight and flatter...
1,\nPerfect light weight waterproof rain coat. W...
2,"\nMy wife needed a new North face rain jacket,..."
3,"\ntrue size , 164cm and 53 kg small is ok\n"
4,\nI love this coat. I got it for trip (leaving...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5309 entries, 0 to 5308
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  5300 non-null   object
dtypes: object(1)
memory usage: 41.6+ KB


**Data Cleaning and Preparation**

In [None]:
# Count the rows whose 'reviews' text repeats an earlier row; count() tallies
# the non-null values among those flagged rows.
df[df.duplicated(['reviews'])].count()

reviews    1387
dtype: int64

In [None]:
df.rename(columns = {'reviews':'review'}, inplace = True)
df.head()

Unnamed: 0,review
0,\nThis rain parka is a nice weight and flatter...
1,\nPerfect light weight waterproof rain coat. W...
2,"\nMy wife needed a new North face rain jacket,..."
3,"\ntrue size , 164cm and 53 kg small is ok\n"
4,\nI love this coat. I got it for trip (leaving...


In [None]:
# Remove repeated reviews but keep the first copy of each one.
# keep=False would discard *every* copy of a review that occurs more than once,
# throwing away distinct review texts together with the redundant rows.
df = df.drop_duplicates(keep='first')


In [None]:
df.shape

(2531, 1)

In [None]:
# Check the number of null values in the dataset
df.isnull().sum()

review    0
dtype: int64

In [None]:
# Null values are dropped
df.dropna(inplace=True)
df.shape

(2531, 1)

In [None]:
#Checking null values again
df.isnull().sum()

review    0
dtype: int64

**Feature Engineering**

1. Calculating Number of stopwords

In [None]:
# Per-review count of whitespace-separated tokens that appear verbatim
# (case-sensitively) in the `stop` list.
df['stopwords'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token in stop)
)
df[['review','stopwords']].head()

Unnamed: 0,review,stopwords
2,"\nMy wife needed a new North face rain jacket,...",5
3,"\ntrue size , 164cm and 53 kg small is ok\n",2
8,\na nice coat- but sleeves seems way too long ...,6
120,\nI bought this fir Alaska Cruise and it worke...,11
213,"\nDefinitely cut big, first rain stayed dry, s...",3


2. Calculating Punctuation

In [None]:
def count_punct(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    total = 0
    for char in text:
        if char in string.punctuation:
            total += 1
    return total

df['punctuation'] = df['review'].apply(lambda x: count_punct(x))
df[['review','punctuation']].head()

Unnamed: 0,review,punctuation
2,"\nMy wife needed a new North face rain jacket,...",3
3,"\ntrue size , 164cm and 53 kg small is ok\n",1
8,\na nice coat- but sleeves seems way too long ...,4
120,\nI bought this fir Alaska Cruise and it worke...,2
213,"\nDefinitely cut big, first rain stayed dry, s...",9


3. Number of hashtags (words that start with '#')

In [None]:
# Per-review count of whitespace-separated tokens that begin with '#'.
df['hastags'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('#'))
)
df[['review','hastags']].head()

Unnamed: 0,review,hastags
2,"\nMy wife needed a new North face rain jacket,...",0
3,"\ntrue size , 164cm and 53 kg small is ok\n",0
8,\na nice coat- but sleeves seems way too long ...,0
120,\nI bought this fir Alaska Cruise and it worke...,0
213,"\nDefinitely cut big, first rain stayed dry, s...",0


In [None]:
df.hastags.loc[df.hastags != 0].count()

1

Only one review contains a hashtag, so this feature carries almost no information.

Since the data is imbalanced, we will handle the class imbalance in order to avoid biased models and inaccurate predictions.

**Text Cleaning**

The steps for cleaning unstructured data are:
1. Checking reviews for links, html tags, and punctuation.
Take out characters such as '/', '*', and '@'.

2. Filtering Stop Words: Because they don't contribute much meaning to a document, words like "in," "is," and "an" are frequently utilized as stop words. Words like "product," "amazon," "good," and "great" have been added as stop words because they are often used in all of the evaluations.
3. Eliminate words with three or fewer characters, such as "the," "he," and "she."
4. Check whether each review is written in English.

In [None]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=e8fd5f93d04c7be52b7e082be2e44eeb8d8cb61030294c32390e6984a921771d
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


We exclude reviews that are not in English and keep only the reviews written in English.

In [None]:
#Checking english review
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect's detection is probabilistic; fixing the seed makes the language
# filter give the same answer on every run of the notebook.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Args:
        text: the review text to classify.

    Returns:
        bool: True for English text; False for any other language, for
        non-string input (e.g. NaN), and for text langdetect cannot classify.
    """
    if not isinstance(text, str):
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised when the text offers nothing to classify (e.g. empty,
        # digits or symbols only); treat such reviews as "not English".
        return False


In [None]:
# Flag every review, keep only the rows flagged as English, and renumber the
# remaining rows from 0.
df['is_english'] = df['review'].apply(is_english)
df = df.loc[df['is_english'] == True].reset_index(drop=True)

In [None]:
df.shape

(2294, 5)

1. Removing Stopwords, Punctuation, symbols, url

In [None]:
import re
import string

# Set of the characters in string.punctuation, for fast membership tests.
punctuation =  set(string.punctuation)
# Start from NLTK's English stop words, then add extra words to filter out of the reviews.
stop_words = set(stopwords.words("english"))
# NOTE(review): Clean_Text lowercases every word before looking it up in this
# set, so the capitalised 'I' entry below can never match.
stop_words.update(["br", "href","good","great","amazon","product", "counter",'north','face','carhartt',"also",
"im",
"ive", 'I'])

def Clean_Text(review, min_length=4):
    """Turn one review string into a list of cleaned, lower-case tokens.

    Steps, in order: remove URLs, ``{link}`` placeholders, @mentions and HTML
    entities; replace every run of non-alphanumeric characters with a space;
    remove stand-alone numbers; lower-case the words; drop words found in
    ``stop_words`` and words shorter than ``min_length``.

    Args:
        review: the review text (str).
        min_length: shortest token length that is kept. The default of 4
            drops every token of three characters or fewer.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    text = re.sub(r'https?:\S*', '', review)
    text = re.sub(r'\{link\}', '', text)  # remove links
    text = re.sub(r'@\w*', '', text)
    # HTML entities (e.g. "&nbsp;") have to be removed while '&' and ';' are
    # still present; once the next substitution has run only the entity name
    # would be left behind as a bogus word.
    text = re.sub(r'&[a-z]+;', ' ', text)
    # After this line the text contains only letters, digits and spaces, so no
    # separate punctuation / special-character pass is needed.
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    text = re.sub(r'\b[0-9]+\b', '', text)

    words = (word.lower() for word in text.split())
    return [word for word in words
            if word not in stop_words and len(word) >= min_length]

In [None]:
# Turn line breaks into spaces (not into nothing) so the words on either side
# of a break are not glued together, then trim the blanks this leaves at the
# start and end of each review. regex=False: '\n' is a literal, not a pattern.
df['review'] = df['review'].str.replace('\n', ' ', regex=False).str.strip()

In [None]:
# Rebuild each review from the whitespace-separated tokens that are not
# (case-sensitively) in the `stop` list, joined by single spaces.
df['review'] = df['review'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop)
)
df['review'].head()

0    My wife needed new North face rain jacket, las...
1                     true size , 164cm 53 kg small ok
2    nice coat- sleeves seems way long size 'small'...
3    I bought fir Alaska Cruise worked Worked perfe...
4    Definitely cut big, first rain stayed dry, sec...
Name: review, dtype: object

2. Removing Emoji

In [None]:
#removing emoji
def remove_emoji(text):
    """Return ``text`` with every run of characters from the ranges below deleted."""
    # NOTE(review): the last range runs from U+24C2 to U+1F251, which also
    # covers many non-emoji characters (e.g. CJK ideographs).
    ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + ranges + "]+", r'', text, flags=re.UNICODE)

In [None]:
# remove all emojis from df
df['review'] = df['review'].apply(lambda x: remove_emoji(x))

3. Handle spell correction

In [None]:
from textblob import TextBlob
# NOTE(review): the corrected Series returned by apply() is only displayed; it
# is not assigned back to df['review'], so the following cells keep working on
# the uncorrected text.
df['review'].apply(lambda x: str(TextBlob(x).correct()))

0       By wife needed new North face rain jacket, las...
1                         true size , 164cm 53 g small ok
2       nice coat- sleeves seems way long size 'small'...
3       I bought fir Alaska Bruise worked Worked perfe...
4       Definitely cut big, first rain stayed dry, sec...
                              ...                        
2289    The reversible side great. Totally beyond expe...
2290    The jacket I received used, stained, incorrect...
2291                         Took I received disgusting!!
2292    His jacket nice, arrived large split seat anot...
2293                                        Good size fit
Name: review, Length: 2294, dtype: object

4. Removing unnecessary elements from the reviews

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_step1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Args:
        text: the review text (str).

    Returns:
        str: the cleaned text. Removed pieces are deleted, not replaced by a
        space, so runs of several spaces can remain.
    '''
    text = text.lower()
    # Non-greedy so that each [...] span is removed on its own instead of
    # everything between the first '[' and the last ']'.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop any word that contains at least one digit (e.g. "164cm").
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias kept because the following cell applies `round1` to the review column.
round1 = clean_text_step1

In [None]:
df['review'] = df.review.apply(round1)
df.review.head()

0    my wife needed new north face rain jacket last...
1                             true size    kg small ok
2    nice coat sleeves seems way long size small  h...
3    i bought fir alaska cruise worked worked perfe...
4    definitely cut big first rain stayed dry secon...
Name: review, dtype: object

In [None]:

# Apply a second round of cleaning
def clean_text_step2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Curly quotes are deleted outright (matching how the first round deletes
    straight quotes), while an ellipsis character or a newline is replaced by
    a space so the words on either side of it are not merged into one.

    Args:
        text: the review text (str).

    Returns:
        str: the cleaned text.
    '''
    text = re.sub('[‘’“”]', '', text)
    text = re.sub('[…\n]', ' ', text)
    return text

# Alias kept because the following cell applies `round2` to the review column.
round2 = clean_text_step2

In [None]:
df['review'] = df.review.apply(round2)
df.review

0       my wife needed new north face rain jacket last...
1                                true size    kg small ok
2       nice coat sleeves seems way long size small  h...
3       i bought fir alaska cruise worked worked perfe...
4       definitely cut big first rain stayed dry secon...
                              ...                        
2289    the reversible side great totally beyond expec...
2290    the jacket i received used stained incorrect c...
2291                           look i received disgusting
2292    this jacket nice arrived large split seam anot...
2293                                        good size fit
Name: review, Length: 2294, dtype: object

In [None]:
# Apply the clean text on all reviews
df['review_new']  = df['review'].apply(lambda x: Clean_Text(x))
df.head()

Unnamed: 0,review,stopwords,punctuation,hastags,is_english,review_new
0,my wife needed new north face rain jacket last...,5,3,0,True,"[wife, needed, rain, jacket, last, year, truel..."
1,true size kg small ok,2,1,0,True,"[true, size, small]"
2,nice coat sleeves seems way long size small h...,6,4,0,True,"[nice, coat, sleeves, seems, long, size, small..."
3,i bought fir alaska cruise worked worked perfe...,11,2,0,True,"[bought, alaska, cruise, worked, worked, perfe..."
4,definitely cut big first rain stayed dry secon...,3,9,0,True,"[definitely, first, rain, stayed, second, rain..."


In [None]:
df=df.drop(['review'],axis=1)
df.head()

Unnamed: 0,stopwords,punctuation,hastags,is_english,review_new
0,5,3,0,True,"[wife, needed, rain, jacket, last, year, truel..."
1,2,1,0,True,"[true, size, small]"
2,6,4,0,True,"[nice, coat, sleeves, seems, long, size, small..."
3,11,2,0,True,"[bought, alaska, cruise, worked, worked, perfe..."
4,3,9,0,True,"[definitely, first, rain, stayed, second, rain..."


In [None]:
df.rename(columns = {'review_new':'review'}, inplace = True)
df.head()

Unnamed: 0,stopwords,punctuation,hastags,is_english,review
0,5,3,0,True,"[wife, needed, rain, jacket, last, year, truel..."
1,2,1,0,True,"[true, size, small]"
2,6,4,0,True,"[nice, coat, sleeves, seems, long, size, small..."
3,11,2,0,True,"[bought, alaska, cruise, worked, worked, perfe..."
4,3,9,0,True,"[definitely, first, rain, stayed, second, rain..."


In [None]:
#  A list of clean words is the output
print(df['review'][1])

['true', 'size', 'small']


In [None]:
df['review'].sample(5)

903     [maybe, large, side, okay, even, though, packa...
92                                           [advertised]
697     [thought, wanted, shell, raincoat, reviews, wi...
603     [happy, purchase, exactly, like, described, pi...
1132                             [loves, second, version]
Name: review, dtype: object

In [None]:
# Additional step: remove stop words
# Default NLTK English stop words
default_stopwords = set(stopwords.words('english'))

# Customized stop words list (a copy, so default_stopwords itself is not modified)
custom_stopwords = set(default_stopwords)

# Add additional stop words
# NOTE(review): remove_stopwords lowercases each token before the lookup, so
# the capitalised 'I' entry can never match (lowercase 'i' is listed as well).
additional_stopwords = ['I', 'am', 'i', 'they','is','it']
custom_stopwords.update(additional_stopwords)

# Function to remove stop words from a sentence
def remove_stopwords(sentence):
    """Tokenize ``sentence`` with NLTK and return it re-joined without stop words.

    A token is dropped when its lower-cased form is in ``custom_stopwords``;
    surviving tokens keep their original casing and are joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(sentence):
        if token.lower() in custom_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)



**Normalize Data**

In [None]:
# Authorization: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Save the clean data
# NOTE(review): the 'review' column holds Python lists of tokens at this point,
# so each cell is written as the list's string form (e.g. "['true', 'size', 'small']");
# reading the CSV back yields strings, not lists — confirm that is intended.
df.to_csv('/content/drive/MyDrive/amazon/review_clean.csv', index=False)