# Preprocessing

### Transform Data

Load dataset

In [1]:
import pandas as pd

df = pd.read_csv('./data/unprocessed/tripadvisor_hotel_reviews.csv')

df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


Function to extract lemmatized nouns and their sentiment context from a sentence

In [2]:
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

def get_review_features(sentence):
    doc = nlp(sentence)

    feature_set = set()
    for token in doc:
        if token.pos_ == "NOUN":
            noun = token.lemma_  # lemmatize the noun
            adjectives = tuple(modifier.lemma_ for modifier in token.children)  # get adjectives attached to noun
            compound_sentiment_score = sia.polarity_scores(" ".join(adjectives + tuple(noun)))['compound']  #

            if compound_sentiment_score > 0:
                feature_set.add(f'{noun} (Positive)')
            elif compound_sentiment_score < 0:
                feature_set.add(f'{noun} (Negative)')

    return feature_set

get_review_features(df['Review'].iloc[3])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mhugh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'feature (Positive)',
 'hotel (Positive)',
 'lobby (Positive)',
 'palatte (Positive)',
 'room (Positive)',
 'sign (Negative)',
 'staff (Positive)',
 'stay (Positive)',
 'stroll (Positive)',
 'time (Positive)',
 'touch (Positive)'}

Run function

In [3]:
from tqdm.autonotebook import tqdm

review_features = tuple(get_review_features(sentence) for sentence in tqdm(df['Review']))

  from tqdm.autonotebook import tqdm


  0%|          | 0/20491 [00:00<?, ?it/s]

Convert to pandas dataframe

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import words

nltk.download('words')

mlb = MultiLabelBinarizer()

data = mlb.fit_transform(review_features)  # convert sets of words to bag of words

df_mod = pd.DataFrame(data, columns=mlb.classes_)  # display as dataframe

english_words = set(words.words())  # filter out any columns that are not english words

df_mod = df_mod.loc[:, [word for word in df_mod.columns if word.split(' (')[0] in english_words]]

df_mod.insert(0, 'Rating', df['Rating'])

df_mod

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\mhugh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Unnamed: 0,Rating,a (Negative),aa (Positive),abac (Positive),ability (Negative),ability (Positive),abode (Positive),abound (Positive),abrasion (Positive),abrasive (Negative),...,yorker (Positive),young (Negative),yuan (Negative),yuck (Positive),yummy (Positive),zipper (Negative),zombie (Positive),zone (Negative),zone (Positive),zoo (Positive)
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20487,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20488,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20489,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Save data

In [5]:
df_mod.to_csv('./data/preprocessed/df_mod.csv', index=False)

'Done'

'Done'