In [1]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on Drive so the relative paths used below resolve.
%cd /content/drive/MyDrive/Semester/Personal/

/content/drive/MyDrive/Semester/Personal


In [None]:
# !pip install vaderSentiment
!pip install transformers
!pip install tokenizers

### Imports

In [4]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from scipy.special import softmax

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch

import nltk
# `word_tokenize` needs the Punkt sentence-tokenizer data. NLTK >= 3.8.2 loads
# it from the 'punkt_tab' resource instead of the pickled 'punkt' models, so
# fetch both to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Read file

In [None]:
# Load the reviews dataset; the path is relative to the working directory
# selected by the %cd cell near the top of the notebook.
df = pd.read_csv('./places_review.csv')

In [None]:
# Peek at the first rows of the raw reviews.
df.head()

Unnamed: 0,review,place
0,"Hello friends, I would like to share about the...",Pandharpur
1,Baralikadu Tourist place is very popular. This...,Baralikadu
2,I really like this place its awesome nice won...,Kakkadampoyil
3,If you visit MP and Miss chindwara than you ha...,Chhindwara
4,Its a must see place in andaman trip. Scuba di...,Havelock Island


In [None]:
# Check column dtypes and non-null counts before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  605 non-null    object
 1   place   605 non-null    object
dtypes: object(2)
memory usage: 9.6+ KB


In [None]:
# List the distinct place names present in the dataset.
df['place'].unique()

array(['Pandharpur', 'Baralikadu', 'Kakkadampoyil', 'Chhindwara',
       'Havelock Island', 'Tuljapur', 'Manali', 'Sinhagad', 'Kodaikanal',
       'Murdeshwar', 'Agra', 'Eco Park - New Town - Kolkata',
       'Machranga Dweep (Kingfisher Island)', 'Daltonganj', 'Sripuram',
       'Bangalore', 'Visakhapatnam', 'Aurangabad', 'Badrinath',
       'Srirangam', 'Malakonda', 'Ooty', 'Dalhousie', 'Konark',
       'Amritsar', 'Kasauli', 'Netarhat', 'Ram Jhula', 'Lakshadweep',
       'Baroda', 'Bhangarh', 'Hyderabad', 'Pondicherry', 'Mahabaleshwar',
       'Mukteshwar', 'Rameshwaram', 'Kuldhara', 'Lucknow',
       'Tapola - Mahabaleshwar', 'Gorakhpur'], dtype=object)

### Preprocessing pipeline

In [None]:
def text_preprocessing(text):
    """Lowercase ``text`` and return its NLTK word tokens joined by single spaces.

    Args:
        text: The raw review string.

    Returns:
        The lowercased text with every token (punctuation included) separated
        by exactly one space.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(tokens)

In [None]:
# Normalise every review in place; the function is passed directly, so no lambda wrapper is needed.
df['review'] = df['review'].apply(text_preprocessing)

### RoBERTa sentiment

In [None]:
# Hugging Face checkpoint used for sentiment scoring. A plain string literal is
# used because there is nothing to interpolate.
MODEL = "siebert/sentiment-roberta-large-english"

# model_max_length=512 caps the sequence length the tokenizer truncates to when
# it is later called with truncation=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=512)
# The scoring function reads labels via `model.config`; `config` is kept for
# any other cell that may rely on it.
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def get_sentiment_score(text):
    """Return a signed confidence score for the sentiment of ``text``.

    The text is tokenized (truncated to the tokenizer's maximum length), run
    through the module-level ``model``, and the logits are converted to
    probabilities with a softmax.

    Args:
        text: The review text to score.

    Returns:
        The probability of the predicted class as a NumPy float: positive when
        the predicted label is ``'POSITIVE'``, negated for any other label.
    """
    encoded_input = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

    # Inference only: disabling autograd avoids building a computation graph
    # for every review, which saves memory and time.
    with torch.no_grad():
        output = model(**encoded_input)

    # A single text was encoded, so row 0 holds its per-class logits.
    probs = softmax(output.logits[0].numpy())
    pred = int(probs.argmax(-1))  # plain int for the id2label lookup
    confidence = probs[pred]

    if model.config.id2label[pred] == 'POSITIVE':
        return confidence
    return -confidence

In [None]:
# Score each preprocessed review; one model forward pass per row.
df['sentiment_score'] = df['review'].apply(get_sentiment_score)

In [None]:
# Confirm the sentiment_score column was added alongside the preprocessed reviews.
df.head()

Unnamed: 0,review,place,sentiment_score
0,"hello friends , i would like to share about th...",Pandharpur,0.998893
1,baralikadu tourist place is very popular . thi...,Baralikadu,0.998832
2,i really like this place its awesome nice wond...,Kakkadampoyil,0.998931
3,if you visit mp and miss chindwara than you ha...,Chhindwara,0.997793
4,its a must see place in andaman trip . scuba d...,Havelock Island,0.99887


In [10]:
# Average the signed sentiment scores per place. The built-in GroupBy.mean()
# replaces .apply(np.mean), which calls back into Python once per group.
new_df = (
    df.groupby('place')['sentiment_score']
    .mean()
    .reset_index(name='overall_score')
)

In [14]:
# Rank places from most to least positive. Rebinding the sorted result (rather
# than inplace=True) avoids mutating the frame behind other references to it.
new_df = new_df.sort_values(by='overall_score', ascending=False, ignore_index=True)

In [15]:
# Show the ten places with the highest average score (new_df is sorted in descending order).
new_df.head(10)

Unnamed: 0,place,overall_score
0,Tapola - Mahabaleshwar,0.99891
1,Visakhapatnam,0.998865
2,Kasauli,0.998834
3,Mukteshwar,0.998796
4,Murdeshwar,0.998786
5,Aurangabad,0.998743
6,Baroda,0.998699
7,Chhindwara,0.998627
8,Gorakhpur,0.998459
9,Malakonda,0.997868
