**Pre-processing Airbnb Review Data for NLP**

# Introduction

## Read in libraries, data, and set notebook preferences

**Read in libraries**

In [42]:
#Read in libraries
import pandas as pd
import dask as dd
import swifter
import numpy as np
import nltk

**Read in data**

In [43]:
#Directory holding the intermediate (cleaned) data
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\02_Intermediate'

#Load the cleaned reviews: the first CSV column becomes the index and 'date' is parsed as datetime
reviews_file = path + '/2020_0131_Reviews_Cleaned.csv'
df = pd.read_csv(
    reviews_file,
    sep=',',
    index_col=0,
    parse_dates=['date'],
)

**Set preferences for notebook**

In [44]:
#Silence all warnings for the rest of the session
import warnings
warnings.simplefilter('ignore')

#Let pandas show wider/longer frames before truncating the display
for _option, _value in (
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

## Preview data

In [45]:
#Summarise the frame: dimensions, column dtypes, then the first rows
shape, dtypes = df.shape, df.dtypes
print('Reviews data shape:', shape)
print('Data types: \n', dtypes)
display(df.head())

Reviews data shape: (425509, 2)
Data types: 
 comments            object
date        datetime64[ns]
dtype: object


Unnamed: 0,comments,date
19330,...,2013-12-01
143113,Stop and book it now. Rea (Website hi...,2017-06-07
1021372,So I moved to SF in late May from Mich...,2013-06-02
64636,"This was the perfect home from home, o...",2014-10-16
174143,We loved our time in beautiful SF! The ...,2018-08-10


# Feature engineering

## Sentiment Analysis with Vader

Reviews data does not contain review scores associated with the review. Assigning a compound sentiment score to each review using VADER.

In [46]:
#Import and instantiate sentiment intensity analyzer
#One module-level analyzer is created here and reused by compound_scores for every call
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def compound_scores(series):
    """Return the VADER 'compound' score for the given text.

    The argument is passed unchanged to ``analyzer.polarity_scores`` and the
    'compound' entry of the resulting mapping is returned.
    NOTE(review): despite the parameter name, this is applied element-wise to
    the comments column, so each call presumably receives one comment string.
    """
    polarity = analyzer.polarity_scores(series)
    return polarity['compound']

In [47]:
#Score every comment with compound_scores (swifter-backed apply) and store the result
comment_text = df['comments']
df['sentiment_compound'] = comment_text.swifter.apply(compound_scores)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=425509.0, style=ProgressStyle(descript…




### Preview most positive and negative reviews

In [48]:
#Show the three reviews with the highest compound score
sentiment_view = df[['comments', 'sentiment_compound']]
sentiment_view.sort_values(by='sentiment_compound', ascending=False).head(3)

Unnamed: 0,comments,sentiment_compound
40972,This was perhaps the most amazing AirBnB exper...,0.9997
101333,This is a perfect place to stay. We were a lar...,0.9996
642545,"Leslie is the best host in ""the best city"":) I...",0.9995


In [49]:
#Show the three reviews with the lowest compound score (last rows of the descending sort)
sentiment_view = df[['comments', 'sentiment_compound']]
sentiment_view.sort_values(by='sentiment_compound', ascending=False).tail(3)

Unnamed: 0,comments,sentiment_compound
1352782,I did not stay with Hostwell as I didn’t feel ...,-0.9941
173772,My host tried to make me feel welcome but I'm ...,-0.9956
88818,"I stayed in her apt from Feb 6 to Feb 10, 201...",-0.9978


## Assign positive, negative, and neutral labels to df

In [64]:
#Function that assigns positive, negative, or neutral label depending on vader score
def labeler(vader_score, threshold=.25):
    """Map a VADER compound score to a sentiment label.

    Parameters
    ----------
    vader_score : float
        Compound sentiment score to classify.
    threshold : float, default 0.25
        Non-negative cut-off. Scores strictly above ``threshold`` are
        'positive', scores strictly below ``-threshold`` are 'negative'.

    Returns
    -------
    str
        'positive', 'negative', or 'neutral'. Scores exactly on a boundary,
        and NaN (which fails both comparisons), are labelled 'neutral'.
    """
    if vader_score > threshold:
        return 'positive'
    if vader_score < -threshold:
        return 'negative'
    return 'neutral'

#Label every review from its compound sentiment score
df['label'] = df['sentiment_compound'].apply(labeler)

#Confirm the new column is present
display(df.head())

Unnamed: 0,comments,date,sentiment_compound,word_count,comments_pos_tag,comments_lemma,label
19330,...,2013-12-01,0.9534,39,"[(hello, NN), (josh, NN), (thank, VBD), (much,...","[hello, josh, thank, much, everything, comfort...",positive
143113,Stop and book it now. Rea (Website hi...,2017-06-07,0.9334,122,"[(stop, VB), (book, NN), (rea, JJ), (website, ...","[stop, book, rea, website, hide, later, person...",positive
1021372,So I moved to SF in late May from Mich...,2013-06-02,0.986,175,"[(moved, VBN), (may, MD), (michigan, VB), (sum...","[move, may, michigan, summer, day, look, perma...",positive
64636,"This was the perfect home from home, o...",2014-10-16,0.9287,19,"[(perfect, JJ), (home, NN), (home, NN), (host,...","[perfect, home, home, host, amaze, like, calif...",positive
174143,We loved our time in beautiful SF! The ...,2018-08-10,0.9824,50,"[(time, NN), (place, NN), (location, NN), (nea...","[time, place, location, near, everything, nadi...",positive


## Word counts

In [55]:
#Capture number of whitespace-separated words used in each comment
#(vectorized .str accessor; a missing comment yields NaN instead of raising)
df['word_count'] = df['comments'].str.split().str.len()

#Check
display(df.head())

Unnamed: 0,comments,date,sentiment_compound,word_count,comments_pos_tag,comments_lemma
19330,...,2013-12-01,0.9534,39,"[(hello, NN), (josh, NN), (thank, VBD), (much,...","[hello, josh, thank, much, everything, comfort..."
143113,Stop and book it now. Rea (Website hi...,2017-06-07,0.9334,122,"[(stop, VB), (book, NN), (rea, JJ), (website, ...","[stop, book, rea, website, hide, later, person..."
1021372,So I moved to SF in late May from Mich...,2013-06-02,0.986,175,"[(moved, VBN), (may, MD), (michigan, VB), (sum...","[move, may, michigan, summer, day, look, perma..."
64636,"This was the perfect home from home, o...",2014-10-16,0.9287,19,"[(perfect, JJ), (home, NN), (home, NN), (host,...","[perfect, home, home, host, amaze, like, calif..."
174143,We loved our time in beautiful SF! The ...,2018-08-10,0.9824,50,"[(time, NN), (place, NN), (location, NN), (nea...","[time, place, location, near, everything, nadi..."


# Preprocessing comments data

## Language Processing Pipeline

### Build Pipeline

In [51]:
#Import libraries
import re
from nltk.corpus import stopwords, wordnet
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

#English stopwords kept as a set: comment_preprocessor tests every token for
#membership, which is a hash lookup on a set but a full scan on a list
stop = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant for the lemmatizer.

    The first character of the tag decides the result: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    #Slicing (rather than indexing) keeps an empty tag on the noun fallback
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def comment_preprocessor(series):
    """
    Pre-process one comment string and return its POS-tagged tokens.

    Steps, in order:
    Remove digits from the text
    Tokenize on runs of word characters (this also drops punctuation)
    Lower-case every token
    Remove tokens shorter than 3 characters
    Remove stopwords (membership in the module-level ``stop``)
    POS-tag the remaining tokens

    Parameters
    ----------
    series : str
        Text of a single comment (applied element-wise to the comments column).

    Returns
    -------
    list
        Result of ``nltk.tag.pos_tag`` on the filtered tokens.
    """
    text = re.sub(r'\d+', '', series) #remove numbers from text
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') #Instantiate tokenizer
    tokens = [token.lower() for token in tokenizer.tokenize(text)] #tokenize, drop punctuation, lowercase
    #Filter into a new list. Calling tokens.remove() while iterating the same
    #list shifts its elements, so the token after each short token is skipped
    #and silently lost.
    tokens = [token for token in tokens if len(token) >= 3 and token not in stop]
    return nltk.tag.pos_tag(tokens) #apply word tags to tokens

def lemmatisation(pos_tags):
    """Lemmatize a sequence of (token, POS tag) pairs.

    Each token is lemmatized with a WordNetLemmatizer, using the WordNet POS
    that get_wordnet_pos derives from the pair's tag. Returns the lemmas as a
    list in the same order as the input.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = []
    for tagged in pos_tags:
        word, tag = tagged[0], tagged[1]
        lemma.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemma

### Apply pipeline to reviews data

In [52]:
#Import progress bar to track time to apply comment_preprocessor
from tqdm import tqdm, tqdm_pandas
#Register progress_apply on pandas objects; tqdm.pandas() replaces the deprecated tqdm_pandas(tqdm) shim
tqdm.pandas()

#Apply comment_preprocessor to comments
df['comments_pos_tag'] = df['comments'].progress_apply(comment_preprocessor)

#Apply lemmatisation to comments_pos_tag
df['comments_lemma'] = df['comments_pos_tag'].progress_apply(lemmatisation)






  0%|                                                                                       | 0/425509 [00:00<?, ?it/s][A[A[A[A



  0%|                                                                             | 20/425509 [00:00<36:23, 194.86it/s][A[A[A[A



  0%|                                                                             | 48/425509 [00:00<33:09, 213.81it/s][A[A[A[A



  0%|                                                                             | 81/425509 [00:00<29:44, 238.34it/s][A[A[A[A



  0%|                                                                            | 122/425509 [00:00<26:03, 272.03it/s][A[A[A[A



  0%|                                                                            | 152/425509 [00:00<25:21, 279.60it/s][A[A[A[A



  0%|                                                                            | 187/425509 [00:00<23:52, 296.95it/s][A[A[A[A



  0%|                                             

In [53]:
#check: preview the first rows to confirm comments_pos_tag and comments_lemma were added
display(df.head())

Unnamed: 0,comments,date,sentiment_compound,character_count,word_count,comments_pos_tag,comments_lemma
19330,...,2013-12-01,0.9534,283,39,"[(hello, NN), (josh, NN), (thank, VBD), (much,...","[hello, josh, thank, much, everything, comfort..."
143113,Stop and book it now. Rea (Website hi...,2017-06-07,0.9334,663,122,"[(stop, VB), (book, NN), (rea, JJ), (website, ...","[stop, book, rea, website, hide, later, person..."
1021372,So I moved to SF in late May from Mich...,2013-06-02,0.986,974,175,"[(moved, VBN), (may, MD), (michigan, VB), (sum...","[move, may, michigan, summer, day, look, perma..."
64636,"This was the perfect home from home, o...",2014-10-16,0.9287,113,19,"[(perfect, JJ), (home, NN), (home, NN), (host,...","[perfect, home, home, host, amaze, like, calif..."
174143,We loved our time in beautiful SF! The ...,2018-08-10,0.9824,297,50,"[(time, NN), (place, NN), (location, NN), (nea...","[time, place, location, near, everything, nadi..."


# Write file to csv

In [65]:
#Report the final dimensions of the reviews frame
print('Final reviews shape:', df.shape)

#Directory that receives the processed data
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\03_Processed'

#Save as comma-separated text, leaving the DataFrame index out of the file
processed_file = path + '/2020_0208_Reviews_Processed_NLP.csv'
df.to_csv(processed_file, sep=',', index=False)

Final reviews shape: (425509, 7)
