### Imports

In [74]:
import re
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import inflect
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Read file

In [2]:
# Scraped reviews: one row per review together with the place it refers to.
reviews_csv_path = './scrapper/reviews_crawler/places_review.csv'
df = pd.read_csv(reviews_csv_path)

In [3]:
# Preview the first rows of the loaded reviews table.
df.head()

Unnamed: 0,review,place
0,"Hello friends, I would like to share about the...",Pandharpur
1,Baralikadu Tourist place is very popular. This...,Baralikadu
2,I really like this place its awesome nice won...,Kakkadampoyil
3,If you visit MP and Miss chindwara than you ha...,Chhindwara
4,Its a must see place in andaman trip. Scuba di...,Havelock Island


In [4]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 2 columns):
review    605 non-null object
place     605 non-null object
dtypes: object(2)
memory usage: 9.5+ KB


In [9]:
# List the distinct place names that have at least one review.
df['place'].unique()

array(['Pandharpur', 'Baralikadu', 'Kakkadampoyil', 'Chhindwara',
       'Havelock Island', 'Tuljapur', 'Manali', 'Sinhagad', 'Kodaikanal',
       'Murdeshwar', 'Agra', 'Eco Park - New Town - Kolkata',
       'Machranga Dweep (Kingfisher Island)', 'Daltonganj', 'Sripuram',
       'Bangalore', 'Visakhapatnam', 'Aurangabad', 'Badrinath',
       'Srirangam', 'Malakonda', 'Ooty', 'Dalhousie', 'Konark',
       'Amritsar', 'Kasauli', 'Netarhat', 'Ram Jhula', 'Lakshadweep',
       'Baroda', 'Bhangarh', 'Hyderabad', 'Pondicherry', 'Mahabaleshwar',
       'Mukteshwar', 'Rameshwaram', 'Kuldhara', 'Lucknow',
       'Tapola - Mahabaleshwar', 'Gorakhpur'], dtype=object)

### User-defined functions (UDFs) for preprocessing

In [29]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalised first, so an accented letter decomposes
    into a base letter plus combining marks; the ASCII encode step then
    drops the marks (and any other non-ASCII character) and keeps the rest.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one entry per input token. A token made up solely
        of non-ASCII characters is kept as an empty string, not removed.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

In [27]:
def remove_punctuation(words):
    """Delete punctuation characters from each token.

    Anything that is neither a word character (letters, digits, underscore)
    nor whitespace is removed. Tokens that end up empty, such as a lone
    comma, are dropped from the result.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of the non-empty, punctuation-free tokens, in order.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

In [28]:
def replace_numbers(words):
    """Spell out every all-digit token as words.

    A token is converted only when ``str.isdigit()`` is true for it; every
    other token is passed through unchanged.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one entry per input token, where digit-only tokens
        are replaced by the text produced by inflect's ``number_to_words``.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

In [30]:
def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    The comparison is an exact, case-sensitive match against NLTK's
    English stop word list.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list containing the tokens that are not stop words, in order.
    """
    # Fetch the stop word list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup rather than
    # a scan of the whole list.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [31]:
def lemmatize_verbs(words):
    """Reduce each token to its lemma, treating it as a verb.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one lemma per input token, obtained from
        ``WordNetLemmatizer.lemmatize`` with ``pos='v'``.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

### Preprocessing pipeline

In [59]:
def text_preprocessing(text):
    """Normalise one review into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, tokenize, strip punctuation, spell out
    digit-only tokens, re-tokenize the spelled-out numbers, remove English
    stop words, and lemmatize each token as a verb.

    Args:
        text: the raw review as a string.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    text = text.lower()
    words = word_tokenize(text)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    # A spelled-out number comes back as ONE token that can contain spaces,
    # hyphens or commas (e.g. "two thousand and sixteen", "twenty-seven").
    # Split those back into individual tokens and strip punctuation again,
    # otherwise the inner "and" and the hyphen escape the per-token stop word
    # and punctuation filters and only disappear if the pipeline is run a
    # second time on its own output.
    words = remove_punctuation(" ".join(words).split())
    words = remove_stopwords(words)
    words = lemmatize_verbs(words)

    return " ".join(words)

In [60]:
# Keep an untouched copy of the scraped text so this cell can be re-run safely:
# the pipeline is always applied to the raw reviews, never to its own output
# (which is what happened when 'review' was overwritten in place).
if 'review_raw' not in df.columns:
    df['review_raw'] = df['review']
df['review'] = df['review_raw'].apply(text_preprocessing)

In [61]:
# Collect all preprocessed reviews of a place into one list: one row per place.
reviews_by_place = df.groupby(by=['place'])['review'].apply(list)
new_df = reviews_by_place.reset_index(name='reviews')

In [62]:
# Preview the per-place table (place name plus its list of reviews).
new_df.head()

Unnamed: 0,place,reviews
0,Agra,[agra two thousand sixteen see taj mahal frien...
1,Amritsar,[amritsar good tourist place many foriegn tour...
2,Aurangabad,[aurangabad know ajanta ellora cave much see m...
3,Badrinath,[great visit place devotion god mahakaal one t...
4,Bangalore,[sneak peak bangalore musical fountain locate ...


### VADER sentiment

In [75]:
def get_vader_sentiment_score(texts):
    """Return the mean VADER compound score over a collection of texts.

    Args:
        texts: iterable of strings to score.

    Returns:
        ``np.mean`` of the per-text ``'compound'`` values reported by
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    vader = SentimentIntensityAnalyzer()
    compound_scores = [vader.polarity_scores(text)['compound'] for text in texts]
    return np.mean(compound_scores)

In [77]:
# Score each place by the mean VADER compound score of its reviews.
# NOTE(review): the texts scored here are lowercased with punctuation and
# stop words removed; VADER's documentation describes using such cues, so
# scoring the raw reviews may be preferable - confirm.
new_df['score'] = new_df['reviews'].apply(get_vader_sentiment_score)

In [102]:
# Rank places from the highest to the lowest mean sentiment score.
new_df.sort_values(by='score', ascending=False, inplace=True)

In [104]:
# Renumber the rows 0..n-1 in their current order.
new_df.index = np.arange(len(new_df))

In [105]:
# Show the first ten rows of the scored per-place table.
new_df.head(10)

Unnamed: 0,place,reviews,score
0,Malakonda,[sri malyadri lakshmi narasimha swamy vari dev...,0.99215
1,Visakhapatnam,[visakhapatnam know city destiny city one rare...,0.970111
2,Havelock Island,[must see place andaman trip scuba dive snoork...,0.95949
3,Kasauli,[kasauli one mostly visit tourist place majori...,0.954621
4,Tapola - Mahabaleshwar,[tapola best holiday destination winter anyone...,0.9505
5,Baroda,[spend twentyseven yrs life tell best place li...,0.943863
6,Mahabaleshwar,[one best place visit tour get holiday package...,0.9225
7,Aurangabad,[aurangabad know ajanta ellora cave much see m...,0.921595
8,Dalhousie,[travel dalhousie two thousand eleven school t...,0.921
9,Ram Jhula,[ram jhula situate rishikesh place lord shiva ...,0.9148
