In [19]:
import collections
import string
import time

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.classify.util import apply_features
from nltk.corpus import stopwords
from nltk.metrics.scores import (accuracy, precision, recall)
from nltk.tokenize import TweetTokenizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.model_selection import train_test_split

# Data

In [2]:
# Use Selenium to get RMNP review data from All Trails
PARK_URL = 'https://www.alltrails.com/parks/us/colorado/rocky-mountain-national-park'
LOAD_MORE_XPATH = '//*[@id="load_more"]/div[2]/h3'
MAX_LOAD_MORE_CLICKS = 50  # upper bound on how many extra batches of reviews are requested
LOAD_WAIT_SECONDS = 10     # pause after each click so the newly loaded reviews can render

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--test-type")
options.binary_location = ""  # NOTE(review): empty path presumably defers to Selenium's default Chrome lookup -- confirm
driver = webdriver.Chrome(options=options)
try:
    driver.get(PARK_URL)
    for _ in range(MAX_LOAD_MORE_CLICKS):
        # find_elements returns an empty list when nothing matches, so the loop can
        # stop cleanly once the "load more" control is no longer on the page.
        load_more_buttons = driver.find_elements(By.XPATH, LOAD_MORE_XPATH)
        if not load_more_buttons:
            break
        load_more_buttons[0].click()
        time.sleep(LOAD_WAIT_SECONDS)

    # Snapshot the fully expanded page before the browser is closed.
    html = driver.page_source
finally:
    # Always release the browser process, even if navigation or a click fails.
    driver.quit()

soup = BeautifulSoup(html, "lxml")

In [36]:
# Clean comments from reviews
comments = soup.findAll("p", {"itemprop": "reviewBody"})
comments_clean = [comment.getText() for comment in comments]

# Build the tokenizer once and load the stopword list once into a set, so the
# per-token stopword check is a constant-time lookup instead of a fresh call.
tknzr = TweetTokenizer(preserve_case=False)
english_stopwords = set(stopwords.words('english'))

comments_tokenized = []
for comment_text in comments_clean:
    tokens = [
        w for w in tknzr.tokenize(comment_text)
        # `in string.punctuation` is a substring test against the punctuation string,
        # so it drops single punctuation marks (and any run that appears verbatim in it).
        if w not in string.punctuation
        and w not in english_stopwords
        # Keep only tokens longer than two characters.
        and len(w) > 2
    ]
    comments_tokenized.append(tokens)

print("Total Number of Comments: "+ str(len(comments_tokenized)))

Total Number of Comments: 1525


In [37]:
# Clean ratings from reviews
# Collect every <meta itemprop="ratingValue"> tag and skip the first match.
# NOTE(review): the first tag is presumably the park-level aggregate rating rather
# than an individual review -- confirm against the page markup.
ratings = soup.findAll("meta", {"itemprop": "ratingValue"})[1:]

# Each remaining tag carries its star value in the "content" attribute (kept as a string).
ratings_clean = [rating_tag.attrs["content"] for rating_tag in ratings]

print("Total Number of Ratings: "+ str(len(ratings_clean)))

Total Number of Ratings: 1525


In [38]:
# Breakdown number of reviews by rating
# Ratings are stored as strings, so each star level is counted by its string form,
# from the highest rating down to the lowest.
for stars in ("5", "4", "3", "2", "1"):
    print("Number of " + stars + " star ratings " + str(ratings_clean.count(stars)))

Number of 5 star ratings 1106
Number of 4 star ratings 334
Number of 3 star ratings 64
Number of 2 star ratings 12
Number of 1 star ratings 9


In [39]:
# Combine comments and ratings into reviews
# zip() silently truncates to the shorter input, which would pair comments with the
# wrong ratings if the two scraped lists ever drift out of alignment -- fail loudly instead.
assert len(comments_tokenized) == len(ratings_clean), (
    "comments and ratings are misaligned: "
    + str(len(comments_tokenized)) + " comments vs "
    + str(len(ratings_clean)) + " ratings"
)

# Each review is a (token list, rating string) pair.
alltrails_comments = list(zip(comments_tokenized, ratings_clean))

print("Total Number of Reviews: "+ str(len(alltrails_comments)))

Total Number of Reviews: 1525


# Model

In [43]:
# Create feature word extractor function
# Vocabulary = every distinct token that survived cleaning, across all comments.
# Tokens are collected directly (no join/split round-trip), so a comment with no
# surviving tokens cannot inject an empty-string "word" and a token that itself
# contains a space is kept intact. Sorting gives a stable feature order between runs.
word_features = sorted({token for tokens in comments_tokenized for token in tokens})

def extract_features(document, vocabulary=None):
    """Build the bag-of-words presence features for one tokenized document.

    Parameters
    ----------
    document : iterable of str
        Tokens of a single review.
    vocabulary : iterable of str, optional
        Words to test for. When omitted, the notebook-level ``word_features``
        list is used, which keeps existing single-argument calls working.

    Returns
    -------
    dict
        Maps ``'contains(<word>)'`` to ``True`` when ``<word>`` occurs in
        ``document`` and ``False`` otherwise, with one entry per vocabulary word.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test independent of the document's length.
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words) for word in vocabulary}

In [44]:
# Split into training and testing data
# 60/40 split of the (tokens, rating) pairs; random_state is fixed so the split is reproducible.
training_set, testing_set = train_test_split(alltrails_comments, test_size=0.4, random_state=1)
print("Training set size: "+ str(len(training_set)))
print("Testing set size: "+ str(len(testing_set)))

Training set size: 915
Training Set size: 610


In [30]:
# Create classification model & display top 10 important words
# Both names are rebound here from raw (tokens, rating) pairs to (feature dict, rating)
# pairs, so this cell is meant to run once, directly after the split cell.
training_set = apply_features(extract_features, training_set)
testing_set = apply_features(extract_features, testing_set)

# Fit Naive Bayes on the training features, then score it on the held-out features.
classifier = nltk.NaiveBayesClassifier.train(training_set)
test_accuracy = nltk.classify.accuracy(classifier, testing_set)
print('Accuracy:', test_accuracy)
classifier.show_most_informative_features(10)

Accuracy: 0.6508196721311476
Most Informative Features
      contains(allowing) = True                1 : 5      =     95.4 : 1.0
           contains(dog) = True                1 : 5      =     95.4 : 1.0
         contains(nicer) = True                2 : 5      =     83.5 : 1.0
         contains(horse) = True                2 : 5      =     83.5 : 1.0
       contains(shorter) = True                2 : 5      =     83.5 : 1.0
         contains(tours) = True                2 : 5      =     83.5 : 1.0
       contains(counter) = True                2 : 5      =     83.5 : 1.0
         contains(flies) = True                2 : 5      =     83.5 : 1.0
 contains(disappointing) = True                2 : 5      =     83.5 : 1.0
      contains(compared) = True                2 : 5      =     83.5 : 1.0


In [41]:
# Create confusion matrix
# One pass over the test set: (predicted rating, actual rating), both cast to int.
predicted_vs_actual = [
    (int(classifier.classify(feats)), int(label))
    for feats, label in testing_set
]
prediction = [predicted for predicted, _ in predicted_vs_actual]
actual = [observed for _, observed in predicted_vs_actual]

confmatrix = pd.DataFrame({"prediction": prediction, "actual": actual})
# Rows are the actual ratings, columns are the predicted ratings.
pd.crosstab(confmatrix.actual, confmatrix.prediction)

prediction  2   3   4    5
actual
1           0   0   0    3
2           0   1   1    3
3           0   1   3   22
4           0   6  27  104
5           1  15  54  369


In [42]:
# Calculate accuracy by rating
# refsets[r]  -> indices of test reviews whose actual rating is r
# testsets[r] -> indices of test reviews the classifier labelled r
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for idx, (feats, label) in enumerate(testing_set):
    refsets[label].add(idx)
    testsets[classifier.classify(feats)].add(idx)

# Per-rating recall: the share of reviews with that actual rating that were predicted correctly.
for stars in ('5', '4', '3', '2', '1'):
    print('% Correct ' + stars + ' star ratings:', nltk.recall(refsets[stars], testsets[stars]))

% Correct 5 star ratings: 0.8405466970387244
% Correct 4 star ratings: 0.19708029197080293
% Correct 3 star ratings: 0.038461538461538464
% Correct 2 star ratings: 0.0
% Correct 1 star ratings: 0.0


### Sources:

http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

https://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
        
https://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/

https://www.youtube.com/watch?v=zaIrQ3vMoTw
