In [2]:
from warnings import filterwarnings
filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Load the Amazon reviews dataset into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute local path; point it at your own copy of amazon.csv.
REVIEWS_CSV = r"D:\DataSets\Git Hub datasets-pycat\datasets\amazon.csv"
data = pd.read_csv(REVIEWS_CSV)
data.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


## Overview of the data

In [5]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(20000, 2)

In [6]:
# Distinct label values present in the Positive column
data['Positive'].unique()

array([1, 0], dtype=int64)

In [7]:
# Number of reviews per label, to check the class balance
data['Positive'].value_counts()

1    15233
0     4767
Name: Positive, dtype: int64

In [8]:
# Count of distinct review texts (equal to the row count when there are no duplicates)
data['reviewText'].nunique()

20000

In [9]:
# Missing values per column
data.isnull().sum()

reviewText    0
Positive      0
dtype: int64

## Preprocess the text: remove stopwords and lemmatize

In [11]:
# Text preprocessing

# Lazily filled cache so the stop-word corpus is read once, not once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_processing(text:str):
    """Lower-case, tokenize, drop English stop words, lemmatize and re-join ``text``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces. Punctuation
        tokens are kept, since only stop words are filtered out.

    Notes
    -----
    * ``lemmatize`` is called without a part-of-speech argument. In this
      notebook's recorded output, 'hrs' becomes 'hr' while 'frustrating' is
      left unchanged, so do not expect '-ing'/'-ly' endings to be stripped.
    * Negations are removed as stop words: the recorded custom-input run turns
      'look old color not matching' into 'look old color matching'. Keep that
      in mind when feeding the result to a sentiment scorer.
    """
    # Convert to lower case and split into word tokens.
    tokens = word_tokenize(text.lower())

    # Set membership is O(1); the original re-read the stop-word list for every token.
    stop_words = _english_stopwords()
    kept_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize each remaining token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in kept_tokens]

    # Rebuild a single string from the processed tokens.
    return ' '.join(lemmas)

In [12]:
# Run text_processing over every review in the dataset.
# NOTE: this overwrites the raw text, so re-running the cell re-processes already-processed text.
data['reviewText'] = data['reviewText'].map(text_processing)

In [13]:
# Preview the first rows after preprocessing
data.head()

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free . lot different ...,1
2,really cool game . bunch level find golden egg...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . hr fun . grandkids love . ...,1


In [14]:
# Create one SentimentIntensityAnalyzer (NLTK's VADER module); the cells below reuse it
analyzer = SentimentIntensityAnalyzer()

## Trying out prediction with 1st entry in our dataset

In [16]:
# Score the review text in the first row, first column of the frame
first_review = data.iloc[0, 0]
state = analyzer.polarity_scores(first_review)
state

{'neg': 0.146, 'neu': 0.548, 'pos': 0.306, 'compound': 0.5423}

In [17]:
# 1 when the 'pos' share is above zero, otherwise 0
score = int(state['pos'] > 0)
score

1

## Creating a function to apply the score calculation to all entries in our dataset

In [19]:
def sentiment_analyzer(text, score_key='pos', threshold=0):
    """Label ``text`` as positive (1) or negative (0) from its VADER polarity scores.

    Parameters
    ----------
    text : str
        Text handed to ``analyzer.polarity_scores``.
    score_key : str, optional
        Entry of the score dict to test: 'neg', 'neu', 'pos' or 'compound'.
        Defaults to 'pos', which is this notebook's original rule.
    threshold : float, optional
        The label is 1 when the chosen score is strictly greater than this
        value. Defaults to 0.

    Returns
    -------
    int
        1 for positive, 0 for negative.

    Notes
    -----
    With the defaults, any text containing at least one positive-scoring token
    is labelled positive, whatever its negative content. In this notebook's
    recorded confusion matrix, 3636 of the 4767 negative reviews are predicted
    positive. The compound score summarises the whole text, so a call such as
    ``sentiment_analyzer(text, score_key='compound', threshold=0.05)`` is worth
    evaluating (0.05 is the cut-off suggested in the VADER documentation).
    """
    state = analyzer.polarity_scores(text)
    return 1 if state[score_key] > threshold else 0

In [20]:
# Predict a 0/1 label for every review and store it in a new 'prediction' column
data['prediction'] = data['reviewText'].map(sentiment_analyzer)

In [21]:
# Preview the first rows with the new prediction column next to the true label
data.head()

Unnamed: 0,reviewText,Positive,prediction
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free . lot different ...,1,1
2,really cool game . bunch level find golden egg...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . hr fun . grandkids love . ...,1,1


In [22]:
# Using sklearn to calculate accuracy and view the confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

In [23]:
# Share of reviews whose predicted label matches the true label, as a percentage
accuracy_pct = accuracy_score(data['Positive'], data['prediction']) * 100
print('accuracy_score:', accuracy_pct, '%')

accuracy_score: 78.94 %


In [24]:
# Called as confusion_matrix(true labels, predicted labels)
label_matrix = confusion_matrix(data['Positive'], data['prediction'])
print('confusion_matrix:\n', label_matrix)

confusion_matrix:
 [[ 1131  3636]
 [  576 14657]]


In [25]:
# Human-readable labels indexed by the 0/1 score: review[0] is 'Negative', review[1] is 'Positive'
review = ['Negative','Positive']

## Trying with custom input

In [42]:
def get_result(text:str):
    """Return the sentiment label ('Negative' or 'Positive') for a raw review.

    The text is cleaned with ``text_processing`` and scored with
    ``sentiment_analyzer``. The resulting 0/1 score is then used as an index
    into the notebook-level ``review`` list.
    """
    return review[sentiment_analyzer(text_processing(text))]
    

In [41]:
# Ask for a review on stdin and print its predicted label
user_review = input('Enter your review: ')
print(get_result(user_review))

Enter your review: look old color not matching
look old color matching
Negative
