<a href="https://colab.research.google.com/github/Madlhawa/Hotel-Recommendation-System/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Fetch the NLTK resources that the rest of this notebook relies on.
import nltk
nltk.download('punkt')  # tokenizer models (word_tokenize is used below)
nltk.download('wordnet')  # WordNet data (WordNetLemmatizer is used below)
nltk.download('averaged_perceptron_tagger')  # tagger model (pos_tag is used below)
nltk.download('stopwords')  # stop-word lists (stopwords.words('english') is used below)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk import FreqDist
import random
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import pandas as pd

# Functions

In [None]:
def lemmatize_sentence(tokens):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Args:
        tokens: Sequence of string tokens forming one sentence/text.

    Returns:
        A list with the WordNet lemma of every token, in the original order.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Tags starting with NN are lemmatized as nouns, VB as verbs;
        # everything else is treated as an adjective.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]

In [None]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Clean, lemmatize and lowercase the tokens of one text.

    For every token: URLs and @mentions are stripped, the remainder is
    lemmatized with a WordNet part of speech chosen from the tag returned by
    ``pos_tag``, and the token is discarded if it is empty, is found in
    ``string.punctuation``, or (lowercased) is one of ``stop_words``.

    Args:
        tweet_tokens: Sequence of string tokens, e.g. from ``word_tokenize``.
        stop_words: Iterable of lowercase words to drop. Defaults to none.

    Returns:
        List of the surviving tokens, lowercased, in their original order.
    """
    # Loop-invariant setup: one lemmatizer and a hashed stop-word lookup per
    # call instead of rebuilding / linearly scanning them for every token.
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # Raw string literals keep the regex escapes \( and \) from being
        # read as (invalid) string escape sequences; the pattern is unchanged.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        # NN* tags are lemmatized as nouns, VB* as verbs, anything else as
        # an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test against the punctuation
        # string, so single punctuation marks are dropped here.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from a list of token lists, in order."""
    for review_tokens in cleaned_tokens_list:
        yield from review_tokens

In [None]:
def get_reviews_for_model(cleaned_tokens_list):
    """Lazily yield one ``{token: True}`` feature dict per token list.

    Duplicate tokens within a list collapse into a single key.
    """
    for review_tokens in cleaned_tokens_list:
        features = dict.fromkeys(review_tokens, True)
        yield features
# Main Code

## Two Class

In [None]:
# Load the reviews and keep at most 500 non-null texts per class:
# a rating of 5 or lower is labelled negative, anything above is positive.
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/booking.csv")

negative = data.loc[data['rating'] <= 5, 'review_content'].dropna().head(500).tolist()
positive = data.loc[data['rating'] > 5, 'review_content'].dropna().head(500).tolist()

# Tokenize every review text.
negative_tokens = list(map(word_tokenize, negative))
positive_tokens = list(map(word_tokenize, positive))

In [None]:
# Load the English stop-word list once instead of once per review.
english_stop_words = stopwords.words('english')

# Clean, lemmatize and lowercase the tokens of every review.
positive_cleaned_tokens_list = [remove_noise(tokens, english_stop_words)
                                for tokens in positive_tokens]

negative_cleaned_tokens_list = [remove_noise(tokens, english_stop_words)
                                for tokens in negative_tokens]

In [None]:
# Word frequencies over all cleaned positive reviews.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)

# get_reviews_for_model returns a single-use generator. Materialize the
# feature dicts so that re-running the dataset-building cell does not
# silently produce empty datasets from an already exhausted generator.
positive_tokens_for_model = list(get_reviews_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_reviews_for_model(negative_cleaned_tokens_list))

In [None]:
# Attach the class label to every feature dict.
positive_dataset = [(review_dict, "Positive")
                     for review_dict in positive_tokens_for_model]

negative_dataset = [(review_dict, "Negative")
                     for review_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle with a dedicated, seeded RNG so the split (and therefore the
# reported accuracy) is reproducible without touching the global random state.
random.Random(42).shuffle(dataset)

# 70/30 train/test split based on the number of examples actually available,
# so the test set is never accidentally tiny or empty.
split_index = len(dataset) * 7 // 10
train_data = dataset[:split_index]
test_data = dataset[split_index:]

# Show the sizes and a small sample rather than dumping the whole training set.
print("Train size:", len(train_data), "| Test size:", len(test_data))
print(train_data[:2])

[({'view': True, 'breakfast': True, 'proximity': True, 'beach': True, 'shower': True, 'bathroom': True, 'condition': True, 'time': True, 'pool': True, 'close': True, 'way': True, 'early': True, 'heat': True, 'bed': True, 'comfortable': True, 'staff': True, 'checking': True, 'hospitable': True}, 'Negative'), ({'location': True, 'spacious': True, 'hotel': True, 'premise': True, 'nice': True, 'would': True, 'perfect': True, 'stay': True, 'staff': True, 'pay': True, 'attention': True, 'care': True, 'cleanliness': True, 'specially': True, 'din': True, 'area': True, 'restaurant': True, 'rooms': True, 'properly': True, 'clean': True, 'floor': True, 'dust': True, 'take': True, 'time': True, 'world': True, 'serve': True, 'us': True, 'morning': True, 'tea': True, 'despite': True, 'ask': True, 'make': True, 'fast': True}, 'Negative'), ({'side': True, 'location': True, 'bad': True, 'like': True, 'village': True, 'room': True, 'small': True, 'stay': True, 'didnt': True, 'fund': True, 'pool': True, 

In [None]:
# Train a Naive Bayes classifier and evaluate it on the held-out reviews.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints the table itself and returns None,
# so it must not be wrapped in print() (that only adds a stray "None" line).
classifier.show_most_informative_features(10)

Accuracy is: 0.8366666666666667
Most Informative Features
                     bit = True           Positi : Negati =     15.3 : 1.0
                 jacuzzi = True           Positi : Negati =     14.6 : 1.0
                   dirty = True           Negati : Positi =     12.8 : 1.0
                   never = True           Negati : Positi =     12.8 : 1.0
               fantastic = True           Positi : Negati =     12.5 : 1.0
                sigiriya = True           Positi : Negati =     11.6 : 1.0
                   order = True           Negati : Positi =     10.8 : 1.0
                    desk = True           Negati : Positi =      9.5 : 1.0
                  toilet = True           Negati : Positi =      9.5 : 1.0
                 perfect = True           Positi : Negati =      9.2 : 1.0
None


## Three Class

### Multiple Code Blocks

In [None]:
# Load the reviews and keep at most 3333 non-null texts per class:
# rating <= 2 is negative, 2 < rating <= 6 is neutral, rating > 6 is positive.
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/booking.csv")

negative = data.loc[data['rating'] <= 2, 'review_content'].dropna().head(3333).tolist()
neutral = data.loc[(data['rating'] > 2) & (data['rating'] <= 6), 'review_content'].dropna().head(3333).tolist()
positive = data.loc[data['rating'] > 6, 'review_content'].dropna().head(3333).tolist()

# Tokenize every review text.
negative_tokens = list(map(word_tokenize, negative))
neutral_tokans = list(map(word_tokenize, neutral))
positive_tokens = list(map(word_tokenize, positive))

In [None]:
# Load the English stop-word list once instead of once per review.
english_stop_words = stopwords.words('english')

# Clean, lemmatize and lowercase the tokens of every review.
positive_cleaned_tokens_list = [remove_noise(tokens, english_stop_words)
                                for tokens in positive_tokens]

neutral_cleaned_tokens_list = [remove_noise(tokens, english_stop_words)
                               for tokens in neutral_tokans]

negative_cleaned_tokens_list = [remove_noise(tokens, english_stop_words)
                                for tokens in negative_tokens]

In [None]:
# Word frequencies over all cleaned positive reviews.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)

# get_reviews_for_model returns a single-use generator. Materialize the
# feature dicts so that re-running the dataset-building cell does not
# silently produce empty datasets from an already exhausted generator.
positive_tokens_for_model = list(get_reviews_for_model(positive_cleaned_tokens_list))
neutral_tokens_for_model = list(get_reviews_for_model(neutral_cleaned_tokens_list))
negative_tokens_for_model = list(get_reviews_for_model(negative_cleaned_tokens_list))

In [None]:
# Attach the class label to every feature dict.
positive_dataset = [(review_dict, "Positive")
                     for review_dict in positive_tokens_for_model]

neutral_dataset = [(review_dict, "Neutral")
                     for review_dict in neutral_tokens_for_model]

negative_dataset = [(review_dict, "Negative")
                     for review_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset + neutral_dataset

# Shuffle with a dedicated, seeded RNG so the split (and therefore the
# reported accuracy) is reproducible without touching the global random state.
random.Random(42).shuffle(dataset)

# 70/30 train/test split based on the number of examples actually available.
# A fixed index only works if every class really supplies its full quota of
# reviews; otherwise the test set shrinks or ends up empty.
split_index = len(dataset) * 7 // 10
train_data = dataset[:split_index]
test_data = dataset[split_index:]

# Show the sizes and a small sample rather than dumping the whole training set.
print("Train size:", len(train_data), "| Test size:", len(test_data))
print(train_data[:2])



In [None]:
# Train a Naive Bayes classifier and evaluate it on the held-out reviews.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints the table itself and returns None,
# so it must not be wrapped in print() (that only adds a stray "None" line).
classifier.show_most_informative_features(10)

Accuracy is: 0.6875
Most Informative Features
                 unclean = True           Negati : Positi =     72.9 : 1.0
                horrible = True           Negati : Positi =     51.3 : 1.0
                 disgust = True           Negati : Positi =     41.2 : 1.0
                   claim = True           Negati : Positi =     41.2 : 1.0
                    rude = True           Negati : Positi =     36.7 : 1.0
                  booked = True           Negati : Positi =     34.9 : 1.0
                    dirt = True           Negati : Positi =     34.9 : 1.0
                attitude = True           Negati : Positi =     32.3 : 1.0
             beautifully = True           Positi : Neutra =     29.7 : 1.0
                    unit = True           Negati : Positi =     28.5 : 1.0
None


### Single Code Block

In [None]:
# End-to-end three-class pipeline in a single cell.
size = 1000
# Each class contributes at most 30% of `size` reviews.
per_class = int(size*0.3)

data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/booking.csv")

# Class boundaries: negative < 3 <= neutral < 6.5 <= positive.
# The neutral band starts exactly where the negative band ends so that no
# rating (e.g. 3.0-3.4) is silently left out of every class.
negative = data[(data['rating']<3)]['review_content'].dropna().tolist()[:per_class]
neutral = data[(data['rating']>=3) & (data['rating']<6.5)]['review_content'].dropna().tolist()[:per_class]
positive = data[(data['rating']>=6.5)]['review_content'].dropna().tolist()[:per_class]

negative_tokens = [word_tokenize(text) for text in negative]
neutral_tokans = [word_tokenize(text) for text in neutral]
positive_tokens = [word_tokenize(text) for text in positive]

# Load the English stop-word list once instead of once per review.
english_stop_words = stopwords.words('english')

positive_cleaned_tokens_list = [remove_noise(tokens, english_stop_words) for tokens in positive_tokens]
neutral_cleaned_tokens_list = [remove_noise(tokens, english_stop_words) for tokens in neutral_tokans]
negative_cleaned_tokens_list = [remove_noise(tokens, english_stop_words) for tokens in negative_tokens]

# Materialize the single-use generators so the feature dicts can be reused.
positive_tokens_for_model = list(get_reviews_for_model(positive_cleaned_tokens_list))
neutral_tokens_for_model = list(get_reviews_for_model(neutral_cleaned_tokens_list))
negative_tokens_for_model = list(get_reviews_for_model(negative_cleaned_tokens_list))

positive_dataset = [(review_dict, "Positive") for review_dict in positive_tokens_for_model]
neutral_dataset = [(review_dict, "Neutral") for review_dict in neutral_tokens_for_model]
negative_dataset = [(review_dict, "Negative") for review_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset + neutral_dataset

# Seeded private RNG: reproducible split without touching global random state.
random.Random(42).shuffle(dataset)

# 70/30 split of the examples actually collected. The three classes together
# hold at most 90% of `size`, so the split must follow len(dataset).
split_index = len(dataset) * 7 // 10
train_data = dataset[:split_index]
test_data = dataset[split_index:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))
# Prints the table itself and returns None, so it is not wrapped in print().
classifier.show_most_informative_features(10)


Accuracy is: 0.63
Most Informative Features
                spacious = True           Positi : Negati =     20.1 : 1.0
                 jacuzzi = True           Positi : Negati =     18.0 : 1.0
                   dirty = True           Negati : Positi =     16.1 : 1.0
               beautiful = True           Positi : Negati =     14.6 : 1.0
                   suite = True           Positi : Neutra =     13.6 : 1.0
                     bit = True           Positi : Negati =     13.0 : 1.0
                   smell = True           Negati : Positi =     12.3 : 1.0
                    ever = True           Negati : Neutra =     11.8 : 1.0
                    call = True           Negati : Positi =     11.7 : 1.0
                  always = True           Positi : Negati =     11.6 : 1.0
None


# Test

In [None]:
# Classify a single hand-picked review with the most recently trained classifier.
# Alternative sample input:
# custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
custom_tweet = 'Bed was good / comfortable. Had to wit for 20 min for them to do the check in. Thay did not know that their was a booking till I showed my email.  The room given to me did not have water for a shower even. After I tolled them they offered an room change. First room I had to request  for towels and soap. Basicaly they did not any any idea of what was happaning.'

# Preprocess exactly like the training data, including stop-word removal,
# so the features passed to the classifier match the ones it was trained on.
custom_tokens = remove_noise(word_tokenize(custom_tweet), stopwords.words('english'))

print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Negative
