In [None]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#Importing File
#Since it is txt we use delimiter and give header names Review and Sentiment

amazon_review_data = pd.read_csv("data.txt", delimiter = "\t", header=None, names=["Review", "Sentiment"])
print(amazon_review_data)

                                                Review  Sentiment
0    So there is no way for me to plug it in here i...          0
1                          Good case, Excellent value.          1
2                               Great for the jawbone.          1
3    Tied to charger for conversations lasting more...          0
4                                    The mic is great.          1
..                                                 ...        ...
995  The screen does get smudged easily because it ...          0
996  What a piece of junk.. I lose more calls on th...          0
997                       Item Does Not Match Picture.          0
998  The only thing that disappoint me is the infra...          0
999  You can not answer calls with the unit, never ...          0

[1000 rows x 2 columns]


In [None]:
#We convert the int type of Sentiment to object and then replace 1 with Positive and 0 with Negative

amazon_review_data['Sentiment'] = amazon_review_data['Sentiment'].apply(str)
amazon_review_data['Sentiment'] = amazon_review_data['Sentiment'].replace('1','Positive')
amazon_review_data['Sentiment'] = amazon_review_data['Sentiment'].replace('0','Negative')
print(amazon_review_data)

                                                Review Sentiment
0    So there is no way for me to plug it in here i...  Negative
1                          Good case, Excellent value.  Positive
2                               Great for the jawbone.  Positive
3    Tied to charger for conversations lasting more...  Negative
4                                    The mic is great.  Positive
..                                                 ...       ...
995  The screen does get smudged easily because it ...  Negative
996  What a piece of junk.. I lose more calls on th...  Negative
997                       Item Does Not Match Picture.  Negative
998  The only thing that disappoint me is the infra...  Negative
999  You can not answer calls with the unit, never ...  Negative

[1000 rows x 2 columns]


## Text Preprocessing

In [None]:
#We remove all the stopwords in the text
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

amazon_review_data.Review = amazon_review_data.Review.apply(lambda text: remove_stopwords(text))
amazon_review_data.head(5)

Unnamed: 0,Review,Sentiment
0,So way plug US unless I go converter.,Negative
1,"Good case, Excellent value.",Positive
2,Great jawbone.,Positive
3,Tied charger conversations lasting 45 minutes....,Negative
4,The mic great.,Positive


In [None]:
#We remove all punctuations in the text
import string
PUNCTUATIONS = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCTUATIONS))

amazon_review_data.Review = amazon_review_data["Review"].apply(lambda text: remove_punctuation(text))

In [None]:
#We remove all URLs present in the text
amazon_review_data.Review = amazon_review_data.Review.str.replace(r'https?://\S+|www\.\S+', '', regex=True)

In [None]:
#We remove all mentions and hashtags present in the text
amazon_review_data.Review = amazon_review_data.Review.str.replace(r'(@\S+|#\S+)', '', regex=True)

In [None]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def text_lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

amazon_review_data['text_lemmatized'] = amazon_review_data.Review.apply(lambda text: text_lemmatize(text))
amazon_review_data[['Review', 'text_lemmatized']].head()

Unnamed: 0,Review,text_lemmatized
0,So way plug US unless I go converter,So way plug US unless I go converter
1,Good case Excellent value,Good case Excellent value
2,Great jawbone,Great jawbone
3,Tied charger conversations lasting 45 minutesM...,Tied charger conversation lasting 45 minutesMA...
4,The mic great,The mic great


##Naive Bayes classifier

In [None]:
# We make Training & Testing Datasets
X_train, X_test, y_train, y_test = train_test_split(amazon_review_data['Review'], amazon_review_data['Sentiment'], test_size=0.35, random_state=42)

# Conversion of data into Numerical Features
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [None]:
# Training of Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_counts, y_train)

# Predictions
y_pred = nb_model.predict(X_test_counts)

In [None]:
# Accuracy Test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 78.00%


In [None]:
#Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.79      0.74      0.77       170
    Positive       0.77      0.82      0.79       180

    accuracy                           0.78       350
   macro avg       0.78      0.78      0.78       350
weighted avg       0.78      0.78      0.78       350



In [None]:
# confusion_matrix
confusion_matrix(y_test, y_pred)

array([[126,  44],
       [ 33, 147]])

In [None]:
#User Input
user_text = input('Enter an Amazon Review: ')
user_counts = vectorizer.transform([user_text])
user_sentiment = nb_model.predict(user_counts)[0]
print('Sentiment:', user_sentiment)

Enter an Amazon Review: Good Product
Sentiment: Positive
