# Data Exploration and Sentiment Analysis

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-review-data-for-nlp/amazon_reviews_us_Office_Products_v1_00.tsv


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("/kaggle/input/amazon-review-data-for-nlp/amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', on_bad_lines= 'skip')
data

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31
3,US,52782374,R1PR37BR7G3M6A,B00D7H8XB6,868449945,AmazonBasics 12-Sheet High-Security Micro-Cut ...,Office Products,1,2.0,3.0,N,Y,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,2015-08-31
4,US,24045652,R3BDDDZMZBZDPU,B001XCWP34,33521401,"Derwent Colored Pencils, Inktense Ink Pencils,...",Office Products,4,0.0,0.0,N,Y,Four Stars,Gorgeous colors and easy to use,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640249,US,53005790,RLI7EI10S7SN0,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,26.0,26.0,N,N,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...,1998-12-07
2640250,US,52188548,R1F3SRK9MHE6A3,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,18.0,18.0,N,N,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...,1998-11-30
2640251,US,52090046,R23V0C4NRJL8EM,0807865001,307284585,Gods and Heroes of Ancient Greece,Office Products,4,9.0,16.0,N,N,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...,1998-10-15
2640252,US,52503173,R13ZAE1ATEUC1T,1572313188,870359649,Microsoft EXCEL 97/ Visual Basic Step-by-Step ...,Office Products,5,0.0,0.0,N,N,class text,I am teaching a course in Excel and am using t...,1998-08-22


*** As we can see this is a huge dataset with real entries which can be used for variety of Data Science or NLP projects.**

In [4]:
data.shape

(2640254, 15)

### 1. Data Preparation

In [5]:
# dropping na values
data.dropna(inplace = True)

In [6]:
data.shape

(2640021, 15)

In [7]:
# For sentiment analysis we only need 'star_rating' and 'review_body' features.
data = data[['star_rating','review_body']]
data

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [8]:
# Finding the unique values in 'star_rating' feature.
data.star_rating.unique()

array([5, 1, 4, 2, 3, '5', '4', '3', '1', '2'], dtype=object)

In [9]:
data.star_rating.value_counts()

star_rating
5    1458907
4     389580
1     286063
3     179854
2     129031
5     123767
4      28757
1      20895
3      13818
2       9349
Name: count, dtype: int64

* We can see that there are multiple rating values depicting same values.
* We have rating from 1 to 5 in numeric and string format.

In [10]:
# Converting the rating in string to numeric values
rating_dict = {
'1': int(1),
'2': int(2),
'3': int(3),
'4': int(4),
'5': int(5),
1:1,
2:2,
3:3,
4:4,
5:5
}
data.star_rating = data.star_rating.map(rating_dict)

* For sentiment analysis we need positive and negative rating only. Hence, we can ignore the neutral ratings of 3.
* We will create a new feature with sentiment 0 and 1 representing negative and positive sentiment respectively.

In [11]:
# Dropping rows with rating values of 3
data = data[data.star_rating != 3]

In [12]:
# Creating a new feature with sentiment values as 0 and 1
data['target_rating'] = data.star_rating.apply(lambda x: 1 if x>3 else 0)

In [13]:
# Dropping the 'star_rating' feature
data.drop(['star_rating'], axis = 1, inplace= True)

In [14]:
data.target_rating.value_counts()

target_rating
1    2001011
0     445338
Name: count, dtype: int64

* We can see this is unbalanced dataset with positive review significantly more than negative ones.
* Such dataset can cause bias.
* To solve this we will be taking a subset of this data with equal number of positive and negative sentiment values.

In [15]:
# Let's take 70K values for both target_rating values randomly
# Taking higher number of rows might cause RAM issues later on
positive = data[data.target_rating == 1].sample(n=70000, random_state = 25)
negative = data[data.target_rating == 0].sample(n=70000, random_state = 25)
data = pd.concat([positive, negative]).dropna().reset_index(drop = True)
data.shape

(140000, 2)

### 2. Data Cleaning

In [16]:
# Converting all the reviews from the feature 'review_body' into lower case.
data['review_body'] = data['review_body'].str.lower()
data.tail()

Unnamed: 0,review_body,target_rating
139995,half of the cartridges were half empty. don't ...,0
139996,this product did not work well with my hp colo...,0
139997,our mfc - 8840dn worked fine until a paper jam...,0
139998,my roku keeps falling off my tv. i used two st...,0
139999,"this is way over priced, about 1000% overprice...",0


In [17]:
# Let's check for html
import re
def find_html_tags(text):
    html_tags = re.findall(r'<[^>]+>', text)
    return html_tags
print(data['review_body'].apply(find_html_tags))

0               []
1               []
2               []
3               []
4               []
            ...   
139995          []
139996          []
139997          []
139998          []
139999    [<br />]
Name: review_body, Length: 140000, dtype: object


In [18]:
# Let's check for URLs
url_pattern = re.compile(r'https?://[^\s]+')
def find_url_tags(text):
    html_tags = url_pattern.findall(text)
    return html_tags
(data['review_body'].apply(find_url_tags)).sum()[:5]

['http://www.amazon.com/bankers-box-storage-lift-off-letter/dp/b000flze7i/ref=sr_1_1?ie=utf8&qid=1392237757&sr=8-1&keywords=bankers+box+703),',
 'http://www.oehha.ca.gov/prop65.html',
 'http://youtu.be/hujp6rmkt0m<br',
 'http://www.amazon.com/blue-donuts-caller-id/dp/b00ltxwi3e',
 'http://www.amazon.com/gp/product/b006nfht0k/ref=cm_cr_ryp_prd_ttl_sol_9']

* we need to remove the html and URLs, as they are not useful for our sentiment analysis.

In [19]:
# Removing HTML
data['review_body'] = data['review_body'].apply(lambda text: re.sub('<.*?>', '', text))

In [20]:
# Removing the URLs
data['review_body'] = data['review_body'].apply(lambda text: re.sub(r'https?://[^\s]+', '', text))

In [21]:
# Removing non alphabetical characters. For example, &.
data['review_body'] = data['review_body'].apply(lambda text: re.sub(r'[^a-zA-Z\s]', '', text))

In [22]:
data['review_body'].str.contains("&").sum()

0

In [23]:
# Removing extra spaces
data['review_body'] = data['review_body'].apply(lambda text: re.sub(r'\s+', ' ', text.strip()))

* We need to perform contractions. For instance, the word can't is same as can not.

In [24]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (11

In [25]:
# Performing contractions
import contractions
data['review_body'] = data['review_body'].apply(lambda x: contractions.fix(x))

In [26]:
data.isnull().sum()

review_body      0
target_rating    0
dtype: int64

### 3. Data Preprocessing

In [27]:
import nltk

In [28]:
nltk.download('wordnet', download_dir='/usr/share/nltk_data/corpora')

[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/share/nltk_data/corpora...


True

In [29]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [30]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [31]:
# Remove stop words using NLTK
stop = nltk.corpus.stopwords.words('english')
data['review_body'] = data['review_body'].apply(lambda x: ' '.join([x for x in str(x).split() if x not in (stop)]))

In [32]:
data.head(4)

Unnamed: 0,review_body,target_rating
0,dey coo,1
1,first laser jet purchase home works great ever...,1
2,service excellent phone worked well desirable ...,1
3,looks better expected feels nice hold writes w...,1


* We will perform lemmatization using nltk library

In [33]:
lemmatizer = WordNetLemmatizer()

# Function to map POS tag to first character lemmatize() accepts
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if not found

#perform lemmatization
def lemmatize_text(text):
    lemmatized_text = ' '.join(lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in nltk.word_tokenize(text))
    return lemmatized_text

In [34]:
# Apply lemmatization to text_column
data['review_body'] = data['review_body'].apply(lemmatize_text)
data.head()

Unnamed: 0,review_body,target_rating
0,dey coo,1
1,first laser jet purchase home work great every...,1
2,service excellent phone work well desirable fe...,1
3,look well expect feel nice hold writes well th...,1
4,bought month old son begin sit he enjoy ever s...,1


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140000 entries, 0 to 139999
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   review_body    140000 non-null  object
 1   target_rating  140000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


### 4. Feature Extraction

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
data.isnull().sum()

review_body      0
target_rating    0
dtype: int64

In [42]:
x_train, x_test, y_train, y_test = train_test_split(data['review_body'], data['target_rating'], test_size=0.2, random_state=15)

In [43]:
vectorizer= TfidfVectorizer(max_features=15000)
tf_x_train = vectorizer.fit_transform(x_train)
tf_x_test = vectorizer.transform(x_test)

In [44]:
# Metrics for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### 5. Perceptron

In [45]:
#Perceptron
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(tf_x_train, y_train)
# Make predictions on the train set
y_pred_perceptron_train = perceptron.predict(tf_x_train)
# Make predictions on the test set
y_pred_perceptron_test = perceptron.predict(tf_x_test)

In [46]:
# Metrics for train perceptron
accuracy_train_perceptron = accuracy_score(y_train, y_pred_perceptron_train)
precision_train_perceptron = precision_score(y_train, y_pred_perceptron_train, average='binary')
recall_train_perceptron = recall_score(y_train, y_pred_perceptron_train, average='binary')
f1_train_perceptron = f1_score(y_train, y_pred_perceptron_train, average='binary')
print('Perceptron Train Metrics: Accuracy = ', accuracy_train_perceptron, ', Precision = ', precision_train_perceptron, ', Recall = ', recall_train_perceptron, ', F1 = ', f1_train_perceptron)

Perceptron Train Metrics: Accuracy =  0.882375 , Precision =  0.8856452514470149 , Recall =  0.8778887915780443 , F1 =  0.8817499640959356


In [47]:
# Metrics for test perceptron
accuracy_test_perceptron = accuracy_score(y_test, y_pred_perceptron_test)
precision_test_perceptron = precision_score(y_test, y_pred_perceptron_test, average='binary')
recall_test_perceptron = recall_score(y_test, y_pred_perceptron_test, average='binary')
f1_test_perceptron = f1_score(y_test, y_pred_perceptron_test, average='binary')
print('Perceptron Test Metrics: Accuracy = ', accuracy_test_perceptron, ', Precision = ', precision_test_perceptron, ', Recall = ', recall_test_perceptron, ', F1 = ', f1_test_perceptron)


Perceptron Test Metrics: Accuracy =  0.8435714285714285 , Precision =  0.849714327041296 , Recall =  0.8361682442530781 , F1 =  0.8428868641939882


### 6. Naive Bayes.

In [48]:
from sklearn.naive_bayes import MultinomialNB
# Intiailize and train naive Bayes
nb = MultinomialNB()
nb.fit(tf_x_train, y_train)
# Make predictions on the train set
y_pred_nb_train = nb.predict(tf_x_train)
# Make predictions on the test set
y_pred_nb_test = nb.predict(tf_x_test)

In [49]:
# Metrics for train
accuracy_train_nb = accuracy_score(y_train, y_pred_nb_train)
precision_train_nb = precision_score(y_train, y_pred_nb_train, average='binary', pos_label=0)
recall_train_nb = recall_score(y_train, y_pred_nb_train, average='binary', pos_label=0)
f1_train_nb = f1_score(y_train, y_pred_nb_train, average='binary', pos_label=0)
print('Naive Bayes Train Metrics: Accuracy = ', accuracy_train_nb, ', Precision = ', precision_train_nb, ', Recall = ', recall_train_nb, ', F1 = ', f1_train_nb)

Naive Bayes Train Metrics: Accuracy =  0.8670625 , Precision =  0.8653129326564664 , Recall =  0.8697436263402972 , F1 =  0.8675226223206898


In [50]:
# Metrics for test
accuracy_test_nb = accuracy_score(y_test, y_pred_nb_test)
precision_test_nb = precision_score(y_test, y_pred_nb_test, average='binary', pos_label=0)
recall_test_nb = recall_score(y_test, y_pred_nb_test, average='binary', pos_label=0)
f1_test_nb = f1_score(y_test, y_pred_nb_test, average='binary', pos_label=0)

print('Naive Bayes Test Metrics: Accuracy = ', accuracy_test_nb, ', Precision = ', precision_test_nb, ', Recall = ', recall_test_nb, ', F1 = ', f1_test_nb)

Naive Bayes Test Metrics: Accuracy =  0.8560357142857142 , Precision =  0.8515525308379412 , Recall =  0.8611369990680335 , F1 =  0.8563179468900375


Follow me on kaggle and Linkedin www.linkedin.com/in/rajput-krishna
* Feel free to ask questions and connect for data and ML realated content.