## Task 1

In [2]:
pip install sentistrength

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from sentistrength import PySentiStr
import numpy as np
import nltk
import textblob
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Add the restaurant reviews dataset path

In [7]:
# load the initial dataset; edit the path below to point at the reviews CSV
reviews_csv_path = 'path/to/Restaurant_reviews.csv'
spreadsheet = pd.read_csv(reviews_csv_path)

In [8]:
# drop the columns that are not used in the rest of the notebook
unused_columns = ['Restaurant', 'Reviewer', 'Metadata', 'Time', 'Pictures', '7514']
df = spreadsheet.drop(columns=unused_columns)
df.head(10)

Unnamed: 0,Review,Rating
0,"The ambience was good, food was quite good . h...",5
1,Ambience is too good for a pleasant evening. S...,5
2,A must try.. great food great ambience. Thnx f...,5
3,Soumen das and Arun was a great guy. Only beca...,5
4,Food is good.we ordered Kodi drumsticks and ba...,5
5,"Ambiance is good, service is good, food is aPr...",5
6,"Its a very nice place, ambience is different, ...",5
7,Well after reading so many reviews finally vis...,4
8,"Excellent food , specially if you like spicy f...",5
9,Came for the birthday treat of a close friend....,5


In [9]:
# print the column dtypes and non-null counts of the trimmed dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  9955 non-null   object
 1   Rating  9962 non-null   object
dtypes: object(2)
memory usage: 156.4+ KB


### Add SentiStrength jar and data folder locations

In [10]:
# configure the SentiStrength wrapper: point it at the jar and its language data
# NOTE(review): the sentistrength README appears to ask for absolute paths here — confirm
sentistrength_jar_path = 'path/to/SentiStrengthCom.jar'
sentistrength_data_dir = 'path/to/SentStrength_Data/'

senti = PySentiStr()
senti.setSentiStrengthPath(sentistrength_jar_path)
senti.setSentiStrengthLanguageFolderPath(sentistrength_data_dir)

In [11]:
# removing empty reviews from the dataframe
# Empty strings are first turned into NaN so that dropna() removes them too.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df['Review'] selection: an in-place call on
# a selected column is a chained assignment, which triggers a FutureWarning on
# recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['Review'] = df['Review'].replace('', np.nan)
df.dropna(subset=['Review'], inplace=True)

In [12]:
# score every review with SentiStrength, one getSentiment call per review
sentiStrengthValues = [
    senti.getSentiment(text, score='scale') for text in df['Review']
]

# every call's result is iterable; concatenate their items into one flat list
sentiStrengthValuesFlatList = [
    score for scores in sentiStrengthValues for score in scores
]

In [13]:
# mapper from the -5..+5 score range to its polarity: -1, 0 or +1
def map_values(input_array):
    """Map each score in ``input_array`` to a polarity label.

    Scores in [-5, -1] become -1, a score equal to 0 becomes 0 and scores in
    [1, 5] become 1. Any other value (outside -5..5, or strictly between
    -1 and 1 but not 0) becomes None.

    Returns a new list with one entry per input value, in the same order.
    """
    def _polarity(score):
        if -5 <= score <= -1:
            return -1
        if score == 0:
            return 0
        if 1 <= score <= 5:
            return 1
        return None

    return [_polarity(score) for score in input_array]

In [14]:
# reduce each SentiStrength score to -1, 0 or 1 (None when outside -5..5) via map_values
sentiStrengthValuesNormalized = map_values(sentiStrengthValuesFlatList)

In [15]:
# attach the raw and the normalized SentiStrength scores as new columns
df = df.assign(
    sentiStrength=sentiStrengthValuesFlatList,
    sentiStrengthNormalized=sentiStrengthValuesNormalized,
)

In [16]:
# re-inspect dtypes and non-null counts now that the sentiment columns are added
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9955 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Review                   9955 non-null   object
 1   Rating                   9955 non-null   object
 2   sentiStrength            9955 non-null   int64 
 3   sentiStrengthNormalized  9955 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 388.9+ KB


In [17]:
# V1: the raw SentiStrength scores as a NumPy array
V1 = df['sentiStrength'].to_numpy()

In [18]:
# write the dataframe with the SentiStrength columns to CSV (row index omitted)
df.to_csv(
    r'save/to/a/path/sentiNormalizedAddedDataset.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

## Task 2

In [7]:
# work on an independent (deep) copy for task 2 so that df stays as task 1 left it
df_2 = df.copy(deep=True)

In [20]:
# Since VADER and TextBlob can distinguish between "SAD" and "sad" and take punctuation into account, the data was not preprocessed.
# https://towardsdatascience.com/an-short-introduction-to-vader-3f3860208d53

In [8]:
# VADER analyzer instance plus the containers for the per-review scores
SIA = SentimentIntensityAnalyzer()
vaderValues, textBlobValues = [], []

In [9]:
# vader sentiments: keep the 'compound' score of every review
# The list is built from scratch in this cell rather than appended to a list
# created in another cell, so re-running the cell cannot leave more scores
# than there are rows in df_2.
vaderValues = [
    SIA.polarity_scores(review)['compound'] for review in df_2['Review']
]

In [10]:
# textblob sentiments: keep the polarity of every review
# The list is built from scratch in this cell rather than appended to a list
# created in another cell, so re-running the cell cannot leave more scores
# than there are rows in df_2.
textBlobValues = [
    textblob.TextBlob(review).sentiment.polarity for review in df_2['Review']
]

In [11]:
# attach the VADER and TextBlob scores as new columns
df_2 = df_2.assign(
    vaderValue=vaderValues,
    textBlobValues=textBlobValues,
)

In [12]:
# V2 (VADER) and V3 (TextBlob) vectors, taken as columns of df_2
V2, V3 = df_2['vaderValue'], df_2['textBlobValues']

In [None]:
# this output file will be the input for the next task (row index omitted)
df_2.to_csv(
    r'save/to/a/path/sentiVaderBlob.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)