In [1]:
# importing the required libraries

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from rake_nltk import Rake
from collections import defaultdict

In [2]:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download("vader_lexicon")

In [3]:
readata = pd.read_excel('Reviews.xlsx')

In [4]:
array = readata[['rating','body']].values.tolist()

In [5]:
# reading dataset (in 'array')
# communicates with the backend

In [6]:
# converting the dataset into pandas dataframe

data=pd.DataFrame(array, columns=['rating','text'])

In [7]:
# calculating the ratings' count

rating={1:0,2:0,3:0,4:0,5:0}
rating_order=data['rating'].value_counts().to_dict()
rating.update(rating_order)
rating

{1: 0, 2: 0, 3: 0, 4: 30, 5: 0}

In [8]:
# checking if there are any text rows with null value

print(data[data['text'].isnull()])  # rows with null values
print(data[data['text'].apply(lambda x: not isinstance(x, str))])  # rows with non-string values

Empty DataFrame
Columns: [rating, text]
Index: []
Empty DataFrame
Columns: [rating, text]
Index: []


In [9]:
#filling the null values

data['text'] = data['text'].fillna('')  # replace null values with empty string
data['text'] = data['text'].apply(lambda x: '' if not isinstance(x, str) else x)  # replace non-string values with empty string

In [10]:
# extracting keywords using RAKE

r = Rake(min_length=3, max_length=4,stopwords=stopwords.words('english'),
    punctuations = [')','(',',',':','),',').','.'])
def extract_keywords(text):
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()

data['keywords'] = data['text'].apply(extract_keywords)

In [11]:
data.head()

Unnamed: 0,rating,text,keywords
0,4,The OnePlus Nord CE 3 5G is a device that does...,"[warp charge feature ensures, offers reliable ..."
1,4,I recently upgraded to the OnePlus Nord CE 3 L...,"[phone handles various apps, :** pros :** 1, *..."
2,4,I am using this phone since April 2023 . This ...,"[phone since april 2023, run heavy games, past..."
3,4,"Won't say budget phone, but not bad.Got it on ...",[say budget phone]
4,4,If you need a good phone with good battery lif...,"[average camerascreen feels laggy, increase vi..."


In [12]:
# cleaning the data

def clean(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text) # Removes special characters  
    text = text.lower() # Converts to lowercase
    text = re.sub('\s+', ' ', text) # Remove extra whitespace
    return text

data['Cleaned Reviews'] = data['text'].apply(clean)

In [13]:
# tokenization

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

In [14]:
# POS tagging

def pos_tag(tokens):
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

In [15]:
# extracting keywords from the Cleaned Reviews such that the adjective is followed by a noun

pos_dict = {'J': 'A'}

def token_stop_adjectives(text):
    tags = pos_tag(tokenize_text(text))
    newlist = []
    for i in range(len(tags)-1):
        word, tag = tags[i]
        if word not in set(stopwords.words('english')) and tag[0] in pos_dict:
            if tags[i+1][1][0] == 'N':
                newlist.append((word + ' ' + tags[i+1][0]))
        elif word not in set(stopwords.words('english')) and tag[0] not in pos_dict:
            continue
    return newlist

data['POS tagged'] = data['Cleaned Reviews'].apply(token_stop_adjectives)
data.head()

Unnamed: 0,rating,text,keywords,Cleaned Reviews,POS tagged
0,4,The OnePlus Nord CE 3 5G is a device that does...,"[warp charge feature ensures, offers reliable ...",the oneplus nord ce 3 5g is a device that does...,"[excellent user, slim design, clean user, frie..."
1,4,I recently upgraded to the OnePlus Nord CE 3 L...,"[phone handles various apps, :** pros :** 1, *...",i recently upgraded to the oneplus nord ce 3 l...,"[sleek design, lightweight build, extended per..."
2,4,I am using this phone since April 2023 . This ...,"[phone since april 2023, run heavy games, past...",i am using this phone since april 2023 this ph...,"[good i, heavy games, heating issue, great i]"
3,4,"Won't say budget phone, but not bad.Got it on ...",[say budget phone],won t say budget phone but not bad got it on 1...,"[good sound, battery life]"
4,4,If you need a good phone with good battery lif...,"[average camerascreen feels laggy, increase vi...",if you need a good phone with good battery lif...,"[good phone, good battery, free software, pain..."


In [16]:
sia=SentimentIntensityAnalyzer()

In [17]:
# performing Sentimental Analysis

word_scores = defaultdict(list)

# calculating the sentiment scores for each word in each sentence
for sentence in data['POS tagged']:
    for word in sentence:
        scores = sia.polarity_scores(word)
        word_scores[word].append(scores['compound'])

for sentence in data['keywords']:
    for word in sentence:
        scores = sia.polarity_scores(word)
        word_scores[word].append(scores['compound'])

# sorting the words by compound score
positive_words = sorted(word_scores, key=lambda w: max(word_scores[w]), reverse=True)[:50]
negative_words = sorted(word_scores, key=lambda w: min(word_scores[w]))[:50]

In [18]:
# algorithm to calculate the number of positive and negative words to be returned on the basis of the Ratings

pos={1: 10, 2: 8, 3: 6, 4: 4,5: 2}
neg={1: 2, 2: 4, 3: 6, 4: 8,5: 10}

count_pos=0
count_neg=0

for key in rating:
    count_pos += round(rating[key]/pos[key])
    count_neg += round(rating[key]/neg[key])

print(count_pos)
print(count_neg)

8
4


In [19]:
# generating an array of product's features

p=0
n=0
returnwords=[]
for word in positive_words:
    if(p<count_pos and max(word_scores[word])>0.5 ):
        returnwords.append({'text':clean(word), 'value':round(max(word_scores[word])*100,2)})
        p=p+1
    else:
        break

for word in negative_words:
    if(n<count_neg):
        returnwords.append({'text':clean(word), 'value':round(min(word_scores[word])*100,2)})
        n=n+1
    else:
        break        
        
returnwords

[{'text': 'excellent value', 'value': 72.69},
 {'text': 'best phones', 'value': 63.69},
 {'text': 'best brand', 'value': 63.69},
 {'text': 'best package', 'value': 63.69},
 {'text': 'best brand appearance', 'value': 63.69},
 {'text': 'great option', 'value': 62.49},
 {'text': 'great i', 'value': 62.49},
 {'text': 'great performance', 'value': 62.49},
 {'text': 'poor delivery', 'value': -47.67},
 {'text': 'late n poor delivery', 'value': -47.67},
 {'text': 'painful finger', 'value': -44.04},
 {'text': '5 5 ', 'value': -42.15}]

In [20]:
from wordcloud import WordCloud 
from PIL import Image

In [21]:
# Create a string containing all the words
font_path = 'E:\Computer Science\Machine Learning\Projects\ReviewEZ\Font\static\Quicksand-Regular.ttf'
text = ' '.join(word['text'] for word in returnwords)

# Generate the word cloud
wordcloud = WordCloud(font_path, width=800, height=600, background_color='white').generate(text)
img = wordcloud.to_image()

In [None]:
from io import BytesIO
import base64

img = BytesIO()
wordcloud.to_image().save(WordCloud, format="PNG")
img_str = base64.b64encode(img.getvalue()).decode("utf-8")
return {"image": img_str}

In [22]:
img.show()