In [1]:
import re
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import os
import nltk
# use wordnet find the first level keywords
from nltk.corpus import wordnet as wn
import spacy

**********************************************************************************************
## Load data. 
### Merge sample homes and their reviews into one dataframe.

In [2]:
# Load Necessary Data: reviews
reviews_df = pd.read_csv("./Data/reviews.csv", encoding="utf-8")
reviews_df.columns = ['home_id', 'review_id', 'date', 'reviewer_id', 'reviewer_name', 'comments']
reviews_df.dropna()
reviews_df.head(2)

Unnamed: 0,home_id,review_id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...


In [4]:
sample1_df = pd.read_csv("./Data/sample_data_for_testing.csv", sep='\t', encoding="utf-8")
sample1_df = sample1_df.drop("Unnamed: 0", axis=1)
sample1_df.head(2)

Unnamed: 0,home_id,accommodates,price,number_of_reviews,scores_overall,scores_accuracy,scores_cleanliness,scores_checkin,scores_communication,scores_location,scores_value,reviews_per_month
0,241032,4,85,207,95.0,10.0,10.0,10.0,10.0,9.0,10.0,4.07
1,699460,4,95,151,97.0,10.0,10.0,10.0,10.0,10.0,10.0,3.85


In [5]:
# Merge the reviews and homes in the sample data.
df1 = sample1_df[['home_id', 'scores_cleanliness']]
df2 = reviews_df[['home_id', 'review_id', 'comments']]

print("-" * 40 
      + '\nTotal number of reviews: ' 
      + str(sample1_df['number_of_reviews'].sum()) 
      + "\n" + "-" * 40)

sample1_rh_df = pd.merge(df1, df2, on="home_id")
sample1_rh_df.head(3)

# sample1_rh_df.stack()[0].comments

----------------------------------------
Total number of reviews: 9908
----------------------------------------


Unnamed: 0,home_id,scores_cleanliness,review_id,comments
0,241032,10.0,682061,Excellent all the way around. \r\n\r\nMaija wa...
1,241032,10.0,691712,Maija's apartment was a wonderful place to sta...
2,241032,10.0,702999,one of the most pleasant stays i've had in my ...


**********************************************************************************************
## Analyze `cleanliness` aspect
### Use the selected aspect keywords to analyze reviews 
**********************************************************************************************
### Brief summary

In [6]:
# sample2_rh_df is a copy of sample1_rh_df to 
# in case unexpected modification for original data.
sample2_rh_df = sample1_rh_df
print('*' * 40 + '\nThere are:\n' + '-' * 40)
print(str(len(sample2_rh_df.groupby('home_id'))) + " Airbnb homes in total.\n" + '-' * 40)
print(str(len(sample2_rh_df)) + " reviews in total.\n" + '-' * 40)

****************************************
There are:
----------------------------------------
87 Airbnb homes in total.
----------------------------------------
9908 reviews in total.
----------------------------------------


In [7]:
# group by scores_cleanliness
df1 = pd.DataFrame(sample2_rh_df.groupby(['scores_cleanliness'])['home_id'].nunique())
df2 = pd.DataFrame(sample2_rh_df.groupby(['scores_cleanliness'])['review_id'].nunique())
cleanliness_overview_df = pd.merge(df1, df2, on = 'scores_cleanliness')
cleanliness_overview_df.columns = ['number of homes', 'number of reveiws']
cleanliness_overview_df

# what makes people comments
# number of review selection ()

Unnamed: 0_level_0,number of homes,number of reveiws
scores_cleanliness,Unnamed: 1_level_1,Unnamed: 2_level_1
8.0,3,342
9.0,24,2611
10.0,60,6955


**********************************************************************************************
### Chose one cleanliness score to do further analysis.
#### Here I choose Airbnb homes with cleanliness score as `8.0` to do the "pliot test". 
#### As in above table, there are `3` homes with `342` reviews in total.

In [68]:
sample2_rh_df[sample2_rh_df.scores_cleanliness == 9.0].groupby(['home_id'])['comments'].count()

home_id
58503       76
193836      92
208670      78
442487     104
490076      84
565703     158
741699     145
817115     121
988176     121
1018204     78
1090449    135
1163345    107
1602488    118
2130652     96
2154617    159
2284536    111
2367298    109
2399736     81
2763078     97
3040278    156
3386862     77
3449059     92
3592838    114
3852117    101
Name: comments, dtype: int64

*******
### Here, the logic of determine the aspect that a sentence talking about is:

* `aspect_keywords_dic` is a dictionary contains aspects and their relevant keywords.**

* Firstly, I use the Word2Vec similarity algorithm to count the similarity score between two words.

* For each sentence, the aspect-similarity-score will `+1` when one word in it has the word-similarity-score with any word in keyword list lager than `0.7`(this number can change based on needs). 

* I tried use the total scores of all vectors(word-in-sentence to word-in-keywords), however, the results are very strange.

In [9]:
nlp = spacy.load('en_core_web_md')

In [64]:
aspect_keywords_dic = {
    'location': ['region', 'locality', 'neck_of_the_woods', 'location', 'vicinity',
                 'neighbourhood', 'neighborhood'],
    'cleanliness': ['tidy_up', 'straighten_out', 'cleanliness', 'clean', 
                    'neaten', 'square_away', 'straighten', 'clean_house', 'make_clean', 
                    'tidy', 'houseclean', 'clean_up', 'scavenge',
                   'soiled', 'unclean', 'colly', 'bemire', 'uncleanliness', 'soil', 'begrime',
                    'grime', 'untidy', 'dirty']
}

def sim_sents(doc, aspect, sim):
    
    sim_sents = []
    aspect_keywords = nlp(' '.join(aspect_keywords_dic[aspect]))
    
    for which_sen in range(len(list(doc.sents))):
        new_doc = list(doc.sents)[which_sen].text        
        sen_keywords = nlp(new_doc)
        
        if compute_score(sen_keywords, aspect_keywords, sim) > 0:
            sim_sents.append(new_doc)
    
    return sim_sents

def compute_score(sen_keywords, aspect_keywords, sim):
    count = 0
    for token1 in sen_keywords:
        for token2 in aspect_keywords:
            if token1.similarity(token2) >= sim:
                count += 1 
    return count

In [65]:
def summary_of_homes_with_score(score, aspect):
    
    score_dic = {}
    score_name = 'scores_'+aspect
    
    score_df = sample2_rh_df[sample2_rh_df[score_name] == score]
    score_series = score_df.groupby(['home_id'])['review_id'].count()
    
    summary_df = pd.DataFrame(columns=['home_id','num_of_reviews', 'num_of_sents',
                                       'aspect', 'num_of_sents_0.5', 'num_of_sents_0.6', 
                                       'num_of_sents_0.7', 'sents_0.5', 'sents_0.6', 'sents_0.7'])

    for i in range(len(score_series)):
        
        print("home" + str(i+1))
        home_id = score_series.index[i]
        num_of_reviews = score_series.values[i]
        num_of_sents = count_sents(home_id)[0]
        num_of_sents_05, sents_05 = home_sents_process(home_id, aspect, 0.5)
        num_of_sents_06, sents_06 = home_sents_process(home_id, aspect, 0.6)
        num_of_sents_07, sents_07 = home_sents_process(home_id, aspect, 0.7)

        summary_df.loc[i] = [home_id, num_of_reviews, num_of_sents, aspect, num_of_sents_05, 
                             num_of_sents_06, num_of_sents_07, sents_05, sents_06, sents_07]

    return summary_df

def count_sents(home_id):
    # all comments together
    comments_list = [x for x in list(sample2_rh_df[sample2_rh_df.home_id == home_id].comments) if isinstance(x, str)]
    comments = ''.join(comments_list)
    doc = nlp(comments)
    
    return len(list(doc.sents)), doc
    
def home_sents_process(home_id, aspect, sim):
    doc = count_sents(home_id)[1]
#     sents_score_dic = sents_sim_score(doc, aspect, sim)
#     home_asp_sents = sents_score_dic[sents_score_dic['aspect_sim_score'] != 0].text.values
    home_asp_sents = sim_sents(doc, aspect, sim)

    return len(home_asp_sents), home_asp_sents

### Let's see the result of homes with cleanliness score `8.0`

In [66]:
clean_score8_homes_reviews = summary_of_homes_with_score(8.0, 'cleanliness')
clean_score8_homes_reviews.head()
# clean_score8_homes_reviews[score8_homes_reviews.home_id == 338091]['sents_0.5'][0]

home1
home2
home3


Unnamed: 0,home_id,num_of_reviews,num_of_sents,aspect,num_of_sents_0.5,num_of_sents_0.6,num_of_sents_0.7,sents_0.5,sents_0.6,sents_0.7
0,338091,84,377,cleanliness,177,28,26,"[Judy was really nice and accommodating., The ...",[The apartment is lovely and it was really cle...,[The apartment is lovely and it was really cle...
1,1100714,134,692,cleanliness,307,42,42,[Erika was super easy to work with and helpful...,[My husband and I both agreed that for the pri...,[My husband and I both agreed that for the pri...
2,1773803,124,714,cleanliness,302,48,42,[Sheldon is a wonderful host that will tell yo...,"[The apartment is vast and clean.\n, The apart...","[The apartment is vast and clean.\n, The apart..."


In [63]:
clean_score9_homes_reviews = summary_of_homes_with_score(9.0, 'cleanliness')
clean_score9_homes_reviews
# clean_score9_homes_reviews[clean_score9_homes_reviews.home_id == 338091]['sents_0.5'][0]

home1
home2
home3
home4
home5
home6
home7
home8
home9
home10
home11
home12
home13
home14
home15
home16
home17
home18
home19
home20
home21
home22
home23
home24


Unnamed: 0,home_id,num_of_reviews,num_of_sents,aspect,num_of_sents_0.5,num_of_sents_0.6,num_of_sents_0.7,sents_0.5,sents_0.6,sents_0.7
0,58503,76,411,cleanliness,168,13,13,"[The space was large, clean, and quiet. , The...","[The space was large, clean, and quiet. , Ver...","[The space was large, clean, and quiet. , Ver..."
1,193836,92,510,cleanliness,231,41,40,"[My husband and I had two nights in Seattle, a...",[It was spotlessly clean and nicely decorated....,[It was spotlessly clean and nicely decorated....
2,208670,78,358,cleanliness,156,22,21,"[At the same time, the bungalow was in a very ...","[Place was nice, clean and the location was st...","[Place was nice, clean and the location was st..."
3,442487,104,581,cleanliness,259,43,40,"[It's a small walk to some very nice cafes, bo...","[The place is a beautiful, clean and spacious ...","[The place is a beautiful, clean and spacious ..."
4,490076,84,466,cleanliness,201,18,17,"[Ann was a very attentive and available host.,...","[The place was clean and comfortable. , It is...","[The place was clean and comfortable. , It is..."
5,565703,158,733,cleanliness,333,41,41,[The space is roomy enough for 3 yet has a coz...,"[This apartment is charming, clean, comfortabl...","[This apartment is charming, clean, comfortabl..."
6,741699,145,796,cleanliness,319,47,44,[It's situated about 10 blocks away from Colum...,[The house is very nice spacious clean and lig...,[The house is very nice spacious clean and lig...
7,817115,121,646,cleanliness,296,30,27,[This special little getaway is exactly as des...,"[Well-cared for and very clean, with a very co...","[Well-cared for and very clean, with a very co..."
8,988176,121,572,cleanliness,232,38,37,[Mack's apartment was small but very cute and ...,"[Everything was clean, comfortable, and just a...","[Everything was clean, comfortable, and just a..."
9,1018204,79,404,cleanliness,180,34,30,"[Really nice and comfertable room, clean pleac...","[Really nice and comfertable room, clean pleac...","[Really nice and comfertable room, clean pleac..."


### Let's see the result of homes with location score `7.0`

In [70]:
sample1_df = pd.read_csv("./Data/sample_data_for_testing.csv", sep='\t', encoding="utf-8")
sample1_df = sample1_df.drop("Unnamed: 0", axis=1)
df1 = sample1_df[['home_id', 'scores_location']]
df2 = reviews_df[['home_id', 'review_id', 'comments']]

sample1_rh_df = pd.merge(df1, df2, on="home_id")
sample2_rh_df = sample1_rh_df
df1 = pd.DataFrame(sample2_rh_df.groupby(['scores_location'])['home_id'].nunique())
df2 = pd.DataFrame(sample2_rh_df.groupby(['scores_location'])['review_id'].nunique())
location_overview_df = pd.merge(df1, df2, on = 'scores_location')
location_overview_df.columns = ['number of homes', 'number of reveiws']
location_overview_df

Unnamed: 0_level_0,number of homes,number of reveiws
scores_location,Unnamed: 1_level_1,Unnamed: 2_level_1
8.0,2,196
9.0,31,3457
10.0,54,6255


In [71]:
score_df = sample2_rh_df[sample2_rh_df.scores_location == 8.0]
score_df.groupby(['home_id'])['review_id'].count()

home_id
1566487    104
3449059     92
Name: review_id, dtype: int64

In [72]:
loc_score8_homes_reviews = summary_of_homes_with_score(8.0, 'location')
loc_score8_homes_reviews

home1
home2


Unnamed: 0,home_id,num_of_reviews,num_of_sents,aspect,num_of_sents_0.5,num_of_sents_0.6,num_of_sents_0.7,sents_0.5,sents_0.6,sents_0.7
0,1566487,104,504,location,232,90,58,"[Rodrigo's apartment was amazing!, Getting to ...","[Short drive to fun neighborhoods, but a nice ...","[Short drive to fun neighborhoods, but a nice ..."
1,3449059,92,570,location,269,100,70,[The atmosphere of the apartment and surroundi...,[The atmosphere of the apartment and surroundi...,"[The place is very cute and spacious, plus it ..."


In [76]:
loc_score8_homes_reviews[loc_score8_homes_reviews.home_id == 1566487]['sents_0.7'][0]

['Short drive to fun neighborhoods, but a nice quiet place to come back to after a long day of touring Seattle and super easy parking right out front.',
 'His place is situated in a good location between Lake Washington and downtown as well as Capitol Hill.  ',
 'In addition to all of this, the location was great, 10-15 minutes to Pike Public Market by bus (with two options of bus),',
 'Very nice apartment in a quiet neighborhood.',
 'Apartment was great, super clean, wonderfully designed/curated stuff, and in a good location for us.\r\n\r\n',
 'Great proximity to downtown.',
 'The area is very well served with Metro bus services to downtown and Capitol hill (15min), and other neighbourhoods.',
 'I would say that this apartment is in the Central District and not actually in Capitol Hill neighborhood of Seattle. \r\n',
 'The location is very good.',
 "The only negative thing that I could say about the experience (which I read from the reviews of other's now) is that the pin on the map f

*****************************************************
### The sentiment of sentence.
### Stanford nlp
### Gate
#### The Lexicon Approach --- NLTK Text-Processing API
The backend logic of Lexicon Approach type methods for sentiment analysis is to look at each word individually, attribute positive points for positive words and negative points for negative words, and then total the points.

The text-processing prediction api I used here is an NLTK 2.0.4 powered text classification process. The english sentiment in nltk uses classifiers trained on both twitter sentiment as well as movie reviews. Considering that the Airbnb reviews dataset has many similar characteristics with thoes, I assumed that the results from this api should be reasonable. So, I implemented this and computed the accuracy of prediction.

Basically, the syntax is run the terminal command 'curl -d "text =review" http://text-processing.com/api/sentiment/', then it will return the predited results of the review in the following Json format:

{ "probability": { "neg": 0.30135019761690551, "neutral": 0.27119050546800266, "pos": 0.69864980238309449 }, "label": "pos" }

Label: will be either pos if the text is determined to be positive, neg if the text is negative, or neutral if the text is neither pos nor neg.

Probability: an object that contains the probability for each label. neg and pos will add up to 1, while neutral is standalone. If neutral is greater than 0.5 then the label will be neutral. Otherwise, the label will be pos or neg, whichever has the greater probability.

In [10]:
comment258571 = ''.join(list(sample2_rh_df[sample2_rh_df.home_id == 258571].comments))
doc = nlp(comment258571)

print("-" * 80 +
      "\nThere are in total " 
      + str(len(list(doc.sents))) 
      + " sentences in these 278 comments.\n" 
      + "-" * 80)

--------------------------------------------------------------------------------
There are in total 1478 sentences in these 278 comments.
--------------------------------------------------------------------------------


In [21]:
# Use an avaliable text-processing api to get the sentiment of all reviews
import os
import json

# Functions to get the API predicted results
def getLabel(text):
    cmd = 'curl -d "text=' + text + '" http://text-processing.com/api/sentiment/'
    print(cmd)
    res = os.popen(cmd).read()
    return json.loads(res)['label']

# function to assign labels dataframe.
def predLabels(df):
    labels = []
    i = 0
    for str in df.comments:
        labels.append(getLabel(str))
        i += 1
        if i % 100 == 0:
            print(i)
#     df.label = labels
    return labels

In [None]:
# Here it tooks about 15 minites to finish. Please be patient if you replicate this experiment.
predicted_labels = predLabels(all_samples)
labels = np.array([1 if s=="POSITIVE" else 0 for s in all_samples.Sentiment.values])

**********************************************************************************************
### Try a small sample: home 258571 which has 278 number of comments.

In [24]:
home258571_sent_score = sents_sim_score(doc, 'cleanliness', 0.7)
home258571_sent_scxore.head(5)
# home258571_cleanliness_text = [home258571_sent_score[k]['sentence'] for k in home258571_sent_score.keys() if home258571_sent_score[k]['aspect_sim_score'] != 0]

NameError: name 'doc' is not defined

In [105]:
a = home258571_sent_score[home258571_sent_score['aspect_sim_score'] != 0].text
len(a)

91

In [23]:
a

NameError: name 'a' is not defined

*******************

### Result summary

In [29]:
len(home258571_cleanliness_text)

627

In [22]:
print("-" * 80
      + "\nThere are " 
      + str(len(home258571_cleanliness_text))
      + "/"
      + str(len(list(doc.sents)))
      + " sentences talking about the cleanliness aspect.\n"
      + "-" * 80)

NameError: name 'home258571_cleanliness_text' is not defined

**********************************************************************************************

In [19]:
def score_detail(sen_keywords, aspect_keywords):
    score = 0
    for token1 in sen_keywords:
        for token2 in aspect_keywords:
            if token1.similarity(token2) >= 0.7:
                print(token1.text, token2.text, token1.similarity(token2))
                score += 1
    return score

In [20]:
h = nlp(u"Nick was very communicative and gracious enough to transport me to and from the light rail upon my arrival and departure, and went the extra mile by leaving me maps, recommendations, and even printed out a bus route based on where I'd told him I planned on going.")
aspect_keywords = nlp(' '.join(aspect_keywords_dic['cleanliness']))

# print(len(sen_keywords), len(aspect_keywords))
score_detail(h, aspect_keywords)

0

In [21]:
def word_similarity(w1, w2):
    token1 = nlp(w1)
    token2 = nlp(w2)
    return token1.similarity(token2)

word_similarity('scrubbing', 'clean')

0.648041676443703

In [22]:
# print("Brief pre-processing logic for one sentence.")
# print("*" * 93 + "\nAn example:\n" + "-" * 80 + "\nThe first sentence:")
# print(list(doc.sents)[0].text)

# print("-" * 93 + "\nSentence keywords got from keyword_extraction function:")
# sen_keywords = list(keyword_extraction(list(doc.sents)[0].text).keys())
# print(sen_keywords)

# print("-" * 93 + "\nJoin sentence keywords together for further similarity use:")
# sen_keywords = ' '.join(sen_keywords)
# print(sen_keywords)
# print("-" * 93)

# **So, temporarily, I tried to use the count of the words in a sentance 
# that with lager than `0.7 `similarity score to determine the relevance.
# And it works well for now.**