This notebook aims at achieving part of speech tagging combined with sentiment analysis. The data is preprocessed in previous notebook. Text reviews here is readable English hotel stay reviews from multiple online travel agencies. 

Breakdown as below:
1. clean and convert reviews into sentences;
2. extract and lemmatize sujects in each sentence;
3. use CountVectoriser to identify most frequent subjects;
4. manually group subjects in 11 categories;
5. label each sentence with one category;
6. balance dataset according to the least counted category;
7. validate manual label using pipeline built of word2vec tranformer and SVC;
8. label each review and separate postive and negative tags based on TextBlob sentiment analyzer;

#### Import statement

In [1]:
import pandas as pd 
import numpy as np

import gensim, nltk, spacy
from textblob import TextBlob

from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

#### load data

In [2]:
df = pd.read_csv('./datasets/text_reviews.csv', index_col=0)
df.head(2)

Unnamed: 0,property_id,rating,badge,timestamp,content
0,106005,10.0,Exceptional,2020-07-05,Beautiful hotel with great views and easy loca...
1,106005,10.0,Exceptional,2020-06-23,Spectacular hotel and location. Very well appo...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52003 entries, 0 to 52002
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   property_id  52003 non-null  int64  
 1   rating       52003 non-null  float64
 2   badge        52003 non-null  object 
 3   timestamp    52003 non-null  object 
 4   content      52003 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.4+ MB


#### Step1. clean and convert reviews into sentences

In [4]:
df.content = df.content.map(lambda x: x.replace('\n',' '))
df.content = df.content.map(lambda x: x.replace('\r',' '))
df['content_by_sentences'] = df.content.map(lambda x: nltk.tokenize.sent_tokenize(x))

sentences = []
for review in df.content_by_sentences:
    for sent in review:
        sentences.append(sent)
df_sentence = pd.DataFrame(sentences, columns = ['sentences'])
df_sentence

Unnamed: 0,sentences
0,Beautiful hotel with great views and easy loca...
1,Spectacular hotel and location.
2,Very well appointed rooms and attentive staff!
3,"Great service, everyone was friendly and respe..."
4,Cleanliness was 100% Will stay here again with...
...,...
144297,Because of this noise was an issue.
144298,Live music from pub across the road and tv fro...
144299,"Room cleanliness was avaerage, clearly visible..."
144300,Small kitchen with bar fridge and microwave go...


In [5]:
df.head(2)

Unnamed: 0,property_id,rating,badge,timestamp,content,content_by_sentences
0,106005,10.0,Exceptional,2020-07-05,Beautiful hotel with great views and easy loca...,[Beautiful hotel with great views and easy loc...
1,106005,10.0,Exceptional,2020-06-23,Spectacular hotel and location. Very well appo...,"[Spectacular hotel and location., Very well ap..."


#### Step2. extract and lemmatize sujects in each sentence

In [6]:
def review_to_words(review):
#     Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long, remove punctuation
    for sentence in review:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True, min_len=3))
        
df_sentence['cleaned_words'] = list(review_to_words(df_sentence.sentences))
df_sentence['cleaned_sentences'] = df_sentence.cleaned_words.map(lambda x: ' '.join(x))

nlp = spacy.load('en_core_web_sm')
aspects_list = []
for sent in df_sentence.cleaned_sentences:
    aspect = ''
    doc = nlp(sent)
    for token in doc:
        if token.pos_ == 'NOUN':
            aspect += token.text + ' '
    aspects_list.append(aspect)
df_sentence['aspects'] = aspects_list

lemmatized_data = []
for asp in df_sentence.aspects:
    tb = TextBlob(asp)
    lemmatized = [word.lemmatize('n') for word in tb.words]
    lemmatized_concat = ' '.join(lemmatized)
    lemmatized_data.append(lemmatized_concat)
df_sentence['lemmatized'] = lemmatized_data

In [7]:
df_sentence.head(2)

Unnamed: 0,sentences,cleaned_words,cleaned_sentences,aspects,lemmatized
0,Beautiful hotel with great views and easy loca...,"[beautiful, hotel, with, great, views, and, ea...",beautiful hotel with great views and easy loca...,hotel views location,hotel view location
1,Spectacular hotel and location.,"[spectacular, hotel, and, location]",spectacular hotel and location,hotel location,hotel location


#### Step3. use CountVectoriser to identify most frequent subjects. 

In [8]:
X = df_sentence.lemmatized

cv = CountVectorizer(min_df=10, max_df=0.2, ngram_range=(1,1))
X_cv = cv.fit_transform(X)
cv_dtm = pd.DataFrame(X_cv.toarray(), columns=cv.get_feature_names())

total_count = []
for col in range(cv_dtm.shape[1]):
    count = sum(cv_dtm.iloc[:,col])
    total_count.append(count)
    
token_count = pd.DataFrame(total_count, index = cv.get_feature_names(), columns=['token_count'])
token_count.reset_index(inplace=True)
token_count.shape

(1978, 2)

In [9]:
token_count.sort_values('token_count', ascending=False, inplace=True)
token_count[:10]

Unnamed: 0,index,token_count
1462,room,30175
1627,staff,17724
990,location,16658
837,hotel,16342
146,bed,6859
1524,service,5791
1147,night,5087
207,breakfast,4856
1873,view,4566
1712,sydney,4296


#### Step4. manually group subjects in 11 categories. This step required domain knowledge.

In [10]:
# fixed condition
location = ['location','darling', 'harbour','hyde','quay','city','bridge','sydney','area','town','distance','view','walk','train','rail','railway','ferry','shuttle','station','airport','transport','shopping','opera', 'house','attraction']
noise = ['construction','traffic','music','road']

# facilities
public_facilities = ['pool','spa','gym','lift','elevator','signage','safety','protocol','security','sanitiser','wifi','internet','car park','carpark','parking lot','parking','park']

# room condition
room_condition = ['room size','size','space','door','photo','carpet','furniture','condition','feature','blind','air conditioning','air con','air conditioner', 'noise','noisy']
room_amenities = ['clock','blanket','feather','kettle','sofa','chair','dryer','curtain','phone','kitchen','kitchenette','fridge','glass','amenity','coffee','tea','milk','clothes','robe','hanger','cup','teaspoon','cuttlery','knife','spoon','plate','iron','ironing','mini bar']
cleanliness = ['cleanliness','cleaner','cleaning','smell','hair','smoke']
bathroom = ['towel','bathroom','shower','toilet','water pressure','water','sink','soap','shampoo','conditioner','toothpaste','toothbrush','bathtub','toiletry','tile']
bed = ['bedroom','mattress','sheet','bed','linen','pillow','king','single']

service = ['disappointment','housekeeping','arrival','staff','service','member','check','reception','receptionist','front','counter','desk','concierge','complaint','response','customer','currency','help','manager','luggage','attention','champagne', 'anniversary','birthday','surprise']
value_for_money = ['value money','value','money','price','budget','cost','deal']

food_beverage = ['bar','restaurant','room service','food','drink','breakfast','dinner','soup','meal','buffet','kid','mushroom', 'bacon', 'muffin','chip','fish','steak','salmon']

#### Step5. label each sentence with one category. 
#### This step may introduce bias to some extent, however, this method could maintain label consistency.

In [11]:
label = []
for count, aspect in enumerate(df_sentence.lemmatized):
    aspect_list = aspect.split()
    for asp in aspect_list:
        if asp in noise:
            label.append('noise')
            break
        elif asp in cleanliness:
            label.append('cleanliness')
            break
        elif asp in location:
            label.append('location')
            break
        elif asp in value_for_money:
            label.append('value_for_money')
            break
        elif asp in public_facilities:
            label.append('public_facilities')
            break
        elif asp in room_condition:
            label.append('room_condition')
            break
        elif asp in room_amenities:
            label.append('room_amenities')
            break
        elif asp in bathroom:
            label.append('bathroom')
            break
        elif asp in bed:
            label.append('bed')
            break
        elif asp in service:
            label.append('service')
            break
        elif asp in food_beverage:
            label.append('food_beverage')
            break
        else:
            continue
            
    if len(label) == count: # if no label was appended in this loop
        label.append(np.nan)

df_sentence['label'] = label

In [12]:
df_sentence.head(2)

Unnamed: 0,sentences,cleaned_words,cleaned_sentences,aspects,lemmatized,label
0,Beautiful hotel with great views and easy loca...,"[beautiful, hotel, with, great, views, and, ea...",beautiful hotel with great views and easy loca...,hotel views location,hotel view location,location
1,Spectacular hotel and location.,"[spectacular, hotel, and, location]",spectacular hotel and location,hotel location,hotel location,location


#### Step6. balance dataset 
#### This step downsampled dataset according to the least category -  cleanliness. 

In [None]:
df_sentence['label'] = label
df_sentence = df_sentence[df_sentence.label.notna()]
df_sentence.shape

In [13]:
df_sentence.label.value_counts(dropna=False)

NaN                  50562
location             29967
service              22899
food_beverage         7757
bathroom              6336
public_facilities     6198
bed                   5992
room_condition        4304
value_for_money       4057
room_amenities        3969
noise                 1257
cleanliness           1004
Name: label, dtype: int64

In [14]:
location = df_sentence[df_sentence.label == 'location'].sample(n = 1004, replace = True)
service = df_sentence[df_sentence.label == 'service'].sample(n = 1004, replace = True)
f_b = df_sentence[df_sentence.label == 'food_beverage'].sample(n = 1004, replace = True)
bathroom = df_sentence[df_sentence.label == 'bathroom'].sample(n = 1004, replace = True)
public_facilities = df_sentence[df_sentence.label == 'public_facilities'].sample(n = 1004, replace = True)
bed = df_sentence[df_sentence.label == 'bed'].sample(n = 1004, replace = True)
room_condition = df_sentence[df_sentence.label == 'room_condition'].sample(n = 1004, replace = True)
value_for_money = df_sentence[df_sentence.label == 'value_for_money'].sample(n = 1004, replace = True)
room_amenities = df_sentence[df_sentence.label == 'room_amenities'].sample(n = 1004, replace = True)
noise = df_sentence[df_sentence.label == 'noise'].sample(n = 1004, replace = True)
cleanliness = df_sentence[df_sentence.label == 'cleanliness'].sample(n = 1004, replace = True)

balanced_sample = pd.concat([location,service,f_b,bathroom,public_facilities,bed,room_condition,room_amenities,value_for_money,noise,cleanliness])
balanced_sample.reset_index(drop=True, inplace=True)
balanced_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11044 entries, 0 to 11043
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   sentences          11044 non-null  object
 1   cleaned_words      11044 non-null  object
 2   cleaned_sentences  11044 non-null  object
 3   aspects            11044 non-null  object
 4   lemmatized         11044 non-null  object
 5   label              11044 non-null  object
dtypes: object(6)
memory usage: 517.8+ KB


#### Step7. validate manual label using pipeline built of word2vec tranformer and SVC
#### word2vec transformer has been tuned with below parameters for the best performance.

In [15]:
X = balanced_sample.cleaned_sentences
X = X.str.replace('[^a-z ]', '').str.lower().str.split()

y = balanced_sample.label

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y, test_size=0.30, random_state=42)

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec, Doc2Vec


class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Word vectors are averaged across to create the document-level vectors/features.
    gensim's own gensim.sklearn_api.W2VTransformer doesn't support out of vocabulary words,
    hence we roll out our own.
    All the parameters are gensim.models.Word2Vec's parameters.
    https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    """

    def __init__(self, vector_size=10, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample= 5e-5, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                  cbow_mean=1, hashfxn=hash, epochs=50, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=1, compute_loss=False,
                 callbacks=()):
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative        
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epochs = epochs
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
         
    def fit(self, X, y=None):
        self.model_ = Word2Vec(
            sentences=X,
            vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks)
        return self

    def transform(self, X):
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        valid_words = [word for word in words if word in self.model_.wv.index_to_key]
        if valid_words:
            embedding = np.zeros((len(valid_words), self.vector_size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model_.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.vector_size)

In [17]:
# cbow, vector_size = 100, sample = 0.01, train on balanced dataset
gensim_word2vec_tr = GensimWord2VecVectorizer(vector_size=100,window=5, min_count=3,sample=0.01, sg=0,hs=1, alpha=0.025, epochs=20)
svc = SVC()

model = Pipeline([
    ('w2vt', gensim_word2vec_tr),
    ('svc', svc)
])

model.fit(X_train, y_train)

Pipeline(steps=[('w2vt',
                 GensimWord2VecVectorizer(epochs=20, hs=1, min_count=3,
                                          sample=0.01, vector_size=100)),
                ('svc', SVC())])

In [18]:
print(classification_report(y_test, model.predict(X_test)))

                   precision    recall  f1-score   support

         bathroom       0.73      0.78      0.76       301
              bed       0.80      0.84      0.82       301
      cleanliness       0.85      0.79      0.82       301
    food_beverage       0.84      0.81      0.82       302
         location       0.80      0.75      0.78       301
            noise       0.87      0.90      0.89       301
public_facilities       0.79      0.72      0.75       302
   room_amenities       0.68      0.64      0.66       302
   room_condition       0.67      0.66      0.66       301
          service       0.71      0.81      0.76       301
  value_for_money       0.84      0.86      0.85       301

         accuracy                           0.78      3314
        macro avg       0.78      0.78      0.78      3314
     weighted avg       0.78      0.78      0.78      3314



#### Step8. label each review and separate postive and negative tags based on TextBlob sentiment analyzer

In [19]:
def pos_neg_separator(df):
    aspects_list = []
    for review in df.content_by_sentences:
        aspects = []
        for sentence in review:
            words = [gensim.utils.simple_preprocess(str(sentence), min_len=3, deacc=True)] # clean and convert sentence into words
            label = model.predict(words) # use model to label each sentence
            sentiment = TextBlob(sentence).sentiment # pass sentiment analyzer to get polarity score
            aspects.append({'aspect': label[0], 'polarity': round(sentiment[0],2)})  
        aspects_list.append(aspects)
    
    polarity_ave = []
    for asp in aspects_list:
        if str(asp) == 'nan':
            polarity_ave.append(np.nan)
        else:
            polarity_mean = pd.DataFrame(asp).groupby('aspect').mean().to_dict()['polarity']
            polarity_ave.append(polarity_mean)
    
    positive_aspects = []
    negative_aspects = []

    for asp in polarity_ave:
        positive_aspect = []
        negative_aspect = []

        for k,v in asp.items():
            if v > 0:
                positive_aspect.append(k)
            else:
                negative_aspect.append(k)

        if len(positive_aspect) > 0:
            positive_aspects.append(positive_aspect)
        else:
            positive_aspects.append(np.nan)
        if len(negative_aspect) > 0:    
            negative_aspects.append(negative_aspect)
        else:
            negative_aspects.append(np.nan)

    df['positive_aspects'] = positive_aspects
    df['negative_aspects'] = negative_aspects
    
    return df

In [20]:
pos_neg_separator(df)

Unnamed: 0,property_id,rating,badge,timestamp,content,content_by_sentences,positive_aspects,negative_aspects
0,106005,10.0,Exceptional,2020-07-05,Beautiful hotel with great views and easy loca...,[Beautiful hotel with great views and easy loc...,[location],
1,106005,10.0,Exceptional,2020-06-23,Spectacular hotel and location. Very well appo...,"[Spectacular hotel and location., Very well ap...","[location, service]",
2,106005,10.0,Exceptional,2020-06-20,"Great service, everyone was friendly and respe...","[Great service, everyone was friendly and resp...",[service],[location]
3,106005,10.0,Exceptional,2020-05-17,The lady that checked me was very kind and hel...,[The lady that checked me was very kind and he...,"[location, service]",
4,106005,10.0,Exceptional,2020-03-23,Impeccable! Attentive staff and relaxing room ...,"[Impeccable!, Attentive staff and relaxing roo...","[room_condition, service]",[location]
...,...,...,...,...,...,...,...,...
51998,530837,10.0,Exceptional,2018-05-15,"Very warm room on a cold and windy day, Staf...","[Very warm room on a cold and windy day, Sta...",[service],
51999,530837,8.0,Very Good,2018-05-12,"Easy to find, free parking available along wit...","[Easy to find, free parking available along wi...",[public_facilities],
52000,530837,8.0,Very Good,2018-05-11,The manager went out of her way to make me fee...,[The manager went out of her way to make me fe...,"[cleanliness, room_condition]",[food_beverage]
52001,530837,4.0,Fair,2018-03-14,No aircon very noisy Lack of aircon in Sydney...,[No aircon very noisy Lack of aircon in Sydne...,[noise],
