In [1]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
Review_df = pd.read_csv("ReviewsNew.csv")
Review_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,host_is_superhost,review_scores_value
0,109,449036,2011-08-15,927861,Edwin,The host canceled my reservation the day befor...,f,4.0
1,109,74506539,2016-05-15,22509885,Jenn,Me and two friends stayed for four and a half ...,f,4.0
2,2708,13994902,2014-06-09,10905424,Kuberan,i had a wonderful stay. Everything from start ...,t,4.85
3,2708,14606598,2014-06-23,2247288,Camilla,Charles is just amazing and he made my stay sp...,t,4.85
4,2708,39597339,2015-07-25,27974696,Fallon,Staying with Chas was an absolute pleasure. He...,t,4.85


In [3]:
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## Cleaning Data

In [4]:
# Checking for missing data
Review_df.isnull().sum()

listing_id                0
id                        0
date                      0
reviewer_id               0
reviewer_name             2
comments               1107
host_is_superhost         0
review_scores_value     506
dtype: int64

In [5]:
# Drop NA
Review_Clean = Review_df.dropna()

In [6]:
# Checking for duplicates
print("Number of duplicates: ", Review_Clean.duplicated(keep = "first").sum())

Number of duplicates:  0


In [7]:
Review_Clean.shape

(1142258, 8)

In [8]:
Review_Clean.dtypes

listing_id               int64
id                       int64
date                    object
reviewer_id              int64
reviewer_name           object
comments                object
host_is_superhost       object
review_scores_value    float64
dtype: object

## Text Cleaning

In [6]:
import re
import string

def text_cleaning(text):
    
    # Covert Upper case to lower case
    text = text.lower()
    
    ## Decontract text
    # specific
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"won\’t", "will not", text)
    text = re.sub(r"can\’t", "can not", text)
    text = re.sub(r"\'t've", " not have", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'clock", "f the clock", text)
    text = re.sub(r"\'cause", " because", text)
    
    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\’t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"\’s", " is", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\’m", " am", text) 
    
    # remove all puctuation
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #text = re.sub('\w*\d\w*', '', text) #remove digit/number
    
    # remove all special characters
    text = re.sub(r'\W', ' ', text)

    # remove break
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    
    # remove prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    return text

In [7]:
Review_Clean['comments'] = Review_Clean['comments'].apply(lambda x: text_cleaning(x))

# Sentiment Analysis

In [8]:
from nltk.corpus import stopwords
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hoain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Convert Lower Case, Remove Punctuation, Number and Tokenize

In [9]:
def tokenize(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [word for word in text if word.isalpha()]
    return text


In [10]:
Review_Clean['comments'] = Review_Clean['comments'].apply(lambda x: tokenize(x))

### Remove Stopwords

In [11]:
nltk.download('stopwords')
StopWords = stopwords.words('english')
StopWords.append('room')
StopWords.append('room')
StopWords.append('hotel')
StopWords.append('place')
StopWords.append('will')
StopWords.append('stay')
StopWords.append('br')
StopWords.append('b')
StopWords.append('guest')
StopWords.append('house')
StopWords.append('apartment')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def remove_stopwords(text):
    text = [w for w in text if w not in StopWords]
    return ' '.join(text)

In [13]:
Review_Clean['comments'] = Review_Clean['comments'].apply(lambda x: remove_stopwords(x))

### Chunking - Adjective

In [14]:
def pre_process_adj(text):
    words = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(words)
    adjectives = [word for word, pos in tagged
                 if (pos == 'JJ')]
    return adjectives

In [15]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hoain\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
Review_Clean['comments.adj'] = Review_Clean['comments'].apply(lambda x: pre_process_adj(x))

In [17]:
Review_Clean.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,host_is_superhost,review_scores_value,comments.adj
0,109,449036,2011-08-15,927861,Edwin,host canceled reservation day arrival,f,4.0,[]
1,109,74506539,2016-05-15,22509885,Jenn,two friends stayed four half months great comf...,f,4.0,"[half, great, comfortable, short, overall, great]"
2,2708,13994902,2014-06-09,10905424,Kuberan,wonderful everything start end perfectbrbrwill...,t,4.85,"[wonderful, start]"
3,2708,14606598,2014-06-23,2247288,Camilla,charles amazing made special nice helpful abso...,t,4.85,"[special, nice, polite, quiet, tidy, clean, ta..."
4,2708,39597339,2015-07-25,27974696,Fallon,staying chas absolute pleasure accommodating r...,t,4.85,"[absolute, respectful, personal, nice, helpful..."


### Potter Stemming - Vader

In [21]:
import string
from nltk.stem import PorterStemmer

# Function to stem the words
def stem(words):
    stemmer = PorterStemmer()
    stem_words =[]
    for x in words:
        stem_words.append(stemmer.stem(x))
    return ' '.join(stem_words)

In [26]:
#Review_Clean['comments_adj'] = Review_Clean['comments.adj'].apply(stem)

In [18]:
Review_Clean['comments_adj'] = Review_Clean['comments.adj']

In [19]:
Review_Clean.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,host_is_superhost,review_scores_value,comments.adj,comments_adj
0,109,449036,2011-08-15,927861,Edwin,host canceled reservation day arrival,f,4.0,[],[]
1,109,74506539,2016-05-15,22509885,Jenn,two friends stayed four half months great comf...,f,4.0,"[half, great, comfortable, short, overall, great]","[half, great, comfortable, short, overall, great]"
2,2708,13994902,2014-06-09,10905424,Kuberan,wonderful everything start end perfectbrbrwill...,t,4.85,"[wonderful, start]","[wonderful, start]"
3,2708,14606598,2014-06-23,2247288,Camilla,charles amazing made special nice helpful abso...,t,4.85,"[special, nice, polite, quiet, tidy, clean, ta...","[special, nice, polite, quiet, tidy, clean, ta..."
4,2708,39597339,2015-07-25,27974696,Fallon,staying chas absolute pleasure accommodating r...,t,4.85,"[absolute, respectful, personal, nice, helpful...","[absolute, respectful, personal, nice, helpful..."


### Vander sentiment Analysis for Reviews

In [24]:
!pip install vaderSentiment



In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

compound_val = []
sentiment_text =[]
sentiment_class =[]
analyzer = SentimentIntensityAnalyzer()

for comment in Review_Clean['comments.adj']:
    v1 = analyzer.polarity_scores(comment)
    score = v1['compound']
    compound_val.append(score)
    if score >= 0.05:
        sentiment_text.append("Positive")
        sentiment_class.append(1)
    elif (score > -0.05) and (score < 0.05):
        sentiment_text.append("Neutral")
        sentiment_class.append(0)
    elif score <= -0.05:
        sentiment_text.append("Negative")
        sentiment_class.append(-1)
        

In [21]:
Review_Clean['Sentiment_Emotion'] = sentiment_text
Review_Clean['Sentiment_Score'] = compound_val
Review_Clean['Sentiment_Class'] = sentiment_class
Review_Clean.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,host_is_superhost,review_scores_value,comments.adj,comments_adj,Sentiment_Emotion,Sentiment_Score,Sentiment_Class
0,109,449036,2011-08-15,927861,Edwin,host canceled reservation day arrival,f,4.0,[],[],Neutral,0.0,0
1,109,74506539,2016-05-15,22509885,Jenn,two friends stayed four half months great comf...,f,4.0,"[half, great, comfortable, short, overall, great]","[half, great, comfortable, short, overall, great]",Neutral,0.0,0
2,2708,13994902,2014-06-09,10905424,Kuberan,wonderful everything start end perfectbrbrwill...,t,4.85,"[wonderful, start]","[wonderful, start]",Neutral,0.0,0
3,2708,14606598,2014-06-23,2247288,Camilla,charles amazing made special nice helpful abso...,t,4.85,"[special, nice, polite, quiet, tidy, clean, ta...","[special, nice, polite, quiet, tidy, clean, ta...",Neutral,0.0,0
4,2708,39597339,2015-07-25,27974696,Fallon,staying chas absolute pleasure accommodating r...,t,4.85,"[absolute, respectful, personal, nice, helpful...","[absolute, respectful, personal, nice, helpful...",Neutral,0.0,0


In [22]:
Review_Clean.to_csv('SentimentReview2.csv', header = True, index = False)